ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (12 years, 7 months ago) by gpertea
File size: 80629 byte(s)
Log Message:
massive update with Daehwan's work

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21 gpertea 154 #include <seqan/sequence.h>
22     #include <seqan/find.h>
23     #include <seqan/file.h>
24     #include <seqan/modifier.h>
25    
26 gpertea 29 #include "common.h"
27     #include "bwt_map.h"
28     #include "tokenize.h"
29     #include "reads.h"
30    
31     using namespace std;
32    
33     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
34     {
35     uint32_t reference_id = bh.ref_id();
36    
37     pair<RefHits::iterator, bool> ret =
38     _hits_for_ref.insert(make_pair(reference_id, HitList()));
39     HitList& hl = ret.first->second;
40    
41     if (check_uniqueness)
42     {
43     // Check uniqueness, in case we are adding spliced hits from
44     // several spliced alignment sources (e.g. de novo hashing + Bowtie
45     // against a user-supplied index). We don't want to count the same
46     // alignment twice if it happened to be found by more than one method
47     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
48     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
49     for (; lb != ub && lb != hl.end(); ++lb)
50     {
51     if (*lb == bh)
52     {
53     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
54     return;
55     }
56    
57     if (lb->insert_id() == bh.insert_id() &&
58     lb->ref_id() == bh.ref_id() &&
59     lb->antisense_align() == bh.antisense_align())
60     {
61     // If we get here, we may be looking at the same alignment
62     // However, spanning_reads may report a shorter, trimmed alignment
63     // so not all fields will be equal. If they just disagree on the
64     // ends, and don't indicate a different junction coord, the
65     // alignments are the same.
66    
67     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
68     (bh.left() <= lb->left() && bh.right() >= lb->right()))
69     {
70     vector<pair<int,int> > lb_gaps, bh_gaps;
71     lb->gaps(lb_gaps);
72     bh.gaps(bh_gaps);
73     if (lb_gaps == bh_gaps)
74     {
75     // One alignment is contained in the other, they agree on
76     // where the gaps, if any, are, and they share an id
77     // => this is a redundant aligment, so toss it
78     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
79     return;
80     }
81     }
82     }
83     }
84     }
85     _total_hits++;
86     hl.push_back(bh);
87     }
88    
89     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
90     {
91     return h1.insert_id() < h2.insert_id();
92     }
93    
94     void LineHitFactory::openStream(HitStream& hs)
95     {
96     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
97     //open the file for HitStream here
98     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
99     if (hs._hit_file==NULL)
100     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
101     return;
102     }
103     if (hs._fzpipe!=NULL) {
104     hs._hit_file=hs._fzpipe->file;
105     }
106     }
107    
108     void LineHitFactory::rewind(HitStream& hs)
109     {
110     if (hs._fzpipe!=NULL) {
111     hs._fzpipe->rewind();
112     hs._hit_file=hs._fzpipe->file;
113     }
114     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
115     }
116    
117 gpertea 154 void LineHitFactory::seek(HitStream& hs, int64_t offset)
118     {
119     // daehwan - implement this later
120     if (hs._fzpipe != NULL) {
121     hs._fzpipe->seek(offset);
122     hs._hit_file=hs._fzpipe->file;
123     }
124     // else if (hs._hit_file) ::seek((FILE*)(hs._hit_file));
125     }
126    
127 gpertea 29 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
128     FILE* f=(FILE *)(hs._hit_file);
129     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
130     if (!new_rec || feof(f)) {
131     hs._eof=true;
132     return false;
133     }
134     ++_line_num;
135     char* nl = strrchr(_hit_buf, '\n');
136     if (nl) *nl = 0;
137     buf = _hit_buf;
138     buf_size = _hit_buf_max_sz - 1;
139     return true;
140     }
141    
142     void LineHitFactory::closeStream(HitStream& hs) {
143     if (hs._fzpipe!=NULL) {
144     hs._fzpipe->close();
145     return;
146     }
147     if (hs._hit_file!=NULL) {
148     fclose((FILE*)(hs._hit_file));
149     hs._hit_file=NULL;
150     }
151     }
152     void BAMHitFactory::openStream(HitStream& hs) {
153     if (hs._hit_file==NULL) {
154     if (hs._hit_file_name.empty())
155     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
156     return; //invalid stream, could be just a place holder
157     //open the file here if not already open
158     string fext=getFext(hs._hit_file_name);
159     if (fext=="sam")
160     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
161     else
162     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
163    
164     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
165    
166     if (sam_file == NULL)
167     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
168     if (sam_file->header == NULL)
169     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
170     memset(&_next_hit, 0, sizeof(_next_hit));
171     //_beginning = bgzf_tell(sam_file->x.bam);
172     _sam_header=sam_file->header;
173     if (inspect_header(hs) == false)
174     err_die("Error: invalid SAM header for file %s\n",
175     hs._hit_file_name.c_str());
176     }
177     }
178    
179     void BAMHitFactory::closeStream(HitStream& hs) {
180     if (hs._hit_file) {
181     samclose((samfile_t*)(hs._hit_file));
182     }
183     hs._hit_file=NULL;
184     _sam_header=NULL;
185     }
186    
187     void BAMHitFactory::rewind(HitStream& hs)
188     {
189     /*
190     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
191     {
192     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
193     _eof = false;
194     }
195     */
196     this->closeStream(hs);
197     this->openStream(hs);
198     }
199    
200 gpertea 154 void BAMHitFactory::seek(HitStream& hs, int64_t offset)
201     {
202     if (hs._hit_file) {
203     bgzf_seek(((samfile_t*)hs._hit_file)->x.bam, offset, SEEK_SET);
204     }
205     }
206    
207 gpertea 29 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
208     const bam1_t* bamrec=(const bam1_t*)hit_buf;
209     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
210     string sam_line(tamline);
211     free(tamline);
212     return sam_line;
213     }
214    
215     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
216     if (_next_hit.data) {
217     free(_next_hit.data);
218     _next_hit.data = NULL;
219     }
220     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
221     if (hs.eof() || !hs.ready()) return false;
222    
223     //mark_curr_pos();
224    
225     memset(&_next_hit, 0, sizeof(_next_hit));
226    
227     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
228     if (bytes_read <= 0) {
229     hs._eof = true;
230     return false;
231     }
232     buf = (const char*)&_next_hit;
233     buf_size = bytes_read;
234     return true;
235     }
236    
237    
238     BowtieHit HitFactory::create_hit(const string& insert_name,
239     const string& ref_name,
240 gpertea 154 const string& ref_name2,
241 gpertea 29 int left,
242     const vector<CigarOp>& cigar,
243     bool antisense_aln,
244     bool antisense_splice,
245     unsigned char edit_dist,
246     unsigned char splice_mms,
247     bool end)
248     {
249     uint64_t insert_id = _insert_table.get_id(insert_name);
250 gpertea 154
251 gpertea 29 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
252 gpertea 154 uint32_t reference_id2 = reference_id;
253    
254     if (ref_name2.length() > 0)
255     reference_id2 = _ref_table.get_id(ref_name2, NULL, 0);
256 gpertea 29
257     return BowtieHit(reference_id,
258 gpertea 154 reference_id2,
259 gpertea 29 insert_id,
260     left,
261     cigar,
262     antisense_aln,
263     antisense_splice,
264     edit_dist,
265     splice_mms,
266     end);
267     }
268    
269     BowtieHit HitFactory::create_hit(const string& insert_name,
270     const string& ref_name,
271     uint32_t left,
272     uint32_t read_len,
273     bool antisense_aln,
274     unsigned char edit_dist,
275     bool end)
276     {
277     uint64_t insert_id = _insert_table.get_id(insert_name);
278     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
279    
280     return BowtieHit(reference_id,
281 gpertea 154 reference_id,
282 gpertea 29 insert_id,
283     left,
284     read_len,
285     antisense_aln,
286     edit_dist,
287     end);
288     }
289    
290     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
291     BowtieHit& bh,
292     bool strip_slash,
293     char* name_out,
294     char* name_tags,
295     char* seq,
296     char* qual)
297     {
298     if (!orig_bwt_buf || !*orig_bwt_buf)
299     return false;
300    
301     static const int buf_size = 2048;
302    
303     char bwt_buf[buf_size];
304     strcpy(bwt_buf, orig_bwt_buf);
305     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
306    
307     char orientation;
308    
309     //int bwtf_ret = 0;
310     //uint32_t seqid = 0;
311    
312     char text_name[buf_size];
313     unsigned int text_offset;
314    
315    
316     //unsigned int other_occs;
317     char mismatches[buf_size];
318     //memset(mismatches, 0, sizeof(mismatches));
319    
320     const char* buf = bwt_buf;
321     char* name = get_token((char**)&buf,"\t");
322     char* orientation_str = get_token((char**)&buf,"\t");
323     char* text_name_str = get_token((char**)&buf,"\t");
324    
325     strcpy(text_name, text_name_str);
326    
327     char* text_offset_str = get_token((char**)&buf,"\t");
328     char* seq_str = get_token((char**)&buf,"\t");
329     if (seq)
330     strcpy(seq, seq_str);
331    
332     const char* qual_str = get_token((char**)&buf,"\t");
333     if (qual)
334     strcpy(qual, qual_str);
335    
336     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
337     mismatches[0] = 0;
338     char* mismatches_str = get_token((char**)&buf, "\t");
339     if (mismatches_str)
340     strcpy(mismatches, mismatches_str);
341    
342     orientation = orientation_str[0];
343     text_offset = atoi(text_offset_str);
344    
345     bool end = true;
346     unsigned int seg_offset = 0;
347     unsigned int seg_num = 0;
348     unsigned int num_segs = 0;
349    
350     // Copy the tag out of the name field before we might wipe it out
351     char* pipe = strrchr(name, '|');
352     if (pipe)
353     {
354     if (name_tags)
355     strcpy(name_tags, pipe);
356    
357     char* tag_buf = pipe + 1;
358     if (strchr(tag_buf, ':'))
359     {
360     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
361     if (seg_num + 1 == num_segs)
362     end = true;
363     else
364     end = false;
365     }
366    
367     *pipe = 0;
368     }
369     // Stripping the slash and number following it gives the insert name
370     char* slash = strrchr(name, '/');
371     if (strip_slash && slash)
372     *slash = 0;
373    
374     size_t read_len = strlen(seq_str);
375    
376     // Add this alignment to the table of hits for this half of the
377     // Bowtie map
378    
379     //vector<string> mismatch_toks;
380     char* pch = strtok (mismatches,",");
381     unsigned char num_mismatches = 0;
382     while (pch != NULL)
383     {
384     char* colon = strchr(pch, ':');
385     if (colon)
386     {
387     num_mismatches++;
388     }
389     //mismatch_toks.push_back(pch);
390     pch = strtok (NULL, ",");
391     }
392    
393     bh = create_hit(name,
394     text_name,
395     text_offset,
396     (int)read_len,
397     orientation == '-',
398     num_mismatches,
399     end);
400    
401     return true;
402     }
403    
404     int anchor_mismatch = 0;
405    
406 gpertea 70
407     void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
408     bool &end, unsigned int &seg_offset, unsigned int& seg_num,
409     unsigned int & num_segs) {
410     char* pipe = strrchr(name, '|');
411     if (pipe)
412     {
413     if (name_tags)
414     strcpy(name_tags, pipe);
415    
416     char* tag_buf = pipe + 1;
417     if (strchr(tag_buf, ':'))
418     {
419     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
420     if (seg_num + 1 == num_segs)
421     end = true;
422     else
423     end = false;
424     }
425    
426     *pipe = 0;
427     }
428    
429     // Stripping the slash and number following it gives the insert name
430     char* slash = strrchr(name, '/');
431     if (strip_slash && slash)
432     *slash = 0;
433     }
434    
435    
436 gpertea 29 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
437     BowtieHit& bh,
438     bool strip_slash,
439     char* name_out,
440     char* name_tags,
441     char* seq,
442     char* qual)
443     {
444     if (!orig_bwt_buf || !*orig_bwt_buf)
445     return false;
446    
447     static const int buf_size = 2048;
448    
449     char bwt_buf[buf_size];
450     strcpy(bwt_buf, orig_bwt_buf);
451     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
452    
453     char orientation;
454     char text_name[buf_size];
455     unsigned int text_offset;
456     char mismatches[buf_size];
457     //memset(mismatches, 0, sizeof(mismatches));
458    
459     char* buf = bwt_buf;
460     char* name = get_token((char**)&buf,"\t");
461     char* orientation_str = get_token((char**)&buf,"\t");
462     char* text_name_str = get_token((char**)&buf,"\t");
463     strcpy(text_name, text_name_str);
464    
465     char* text_offset_str = get_token((char**)&buf,"\t");
466     char* seq_str = get_token((char**)&buf,"\t");
467     if (seq)
468     strcpy(seq, seq_str);
469    
470     const char* qual_str = get_token((char**)&buf,"\t");
471     if (qual)
472     strcpy(qual, qual_str);
473    
474     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
475     mismatches[0] = 0;
476     char* mismatches_str = get_token((char**)&buf, "\t");
477     if (mismatches_str)
478     strcpy(mismatches, mismatches_str);
479    
480     orientation = orientation_str[0];
481     text_offset = atoi(text_offset_str);
482    
483     bool end = true;
484     unsigned int seg_offset = 0;
485     unsigned int seg_num = 0;
486     unsigned int num_segs = 0;
487    
488     // Copy the tag out of the name field before we might wipe it out
489 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
490 gpertea 29
491     // Add this alignment to the table of hits for this half of the
492     // Bowtie map
493    
494     // Parse the text_name field to recover the splice coords
495     vector<string> toks;
496    
497     tokenize_strict(text_name, "|", toks);
498    
499     int num_extra_toks = (int)toks.size() - 6;
500    
501     if (num_extra_toks >= 0)
502     {
503     static const uint8_t left_window_edge_field = 1;
504     static const uint8_t splice_field = 2;
505     //static const uint8_t right_window_edge_field = 3;
506     static const uint8_t junction_type_field = 4;
507     static const uint8_t strand_field = 5;
508    
509     string contig = toks[0];
510     for (int t = 1; t <= num_extra_toks; ++t)
511     {
512     contig += "|";
513     contig += toks[t];
514     }
515    
516     vector<string> splice_toks;
517     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
518     if (splice_toks.size() != 2)
519     {
520     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
521     //fprintf(stderr, "%s (token: %s)\n", text_name,
522     // toks[num_extra_toks + splice_field].c_str());
523     return false;
524     }
525    
526     //
527     // check for an insertion hit
528     //
529     if(toks[num_extra_toks + junction_type_field] == "ins")
530     {
531     int8_t spliced_read_len = strlen(seq_str);
532     /*
533     * The 0-based position of the left edge of the alignment. Note that this
534     * value may need to be futher corrected to account for the presence of
535     * of the insertion.
536     */
537     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
538     uint32_t right = left + spliced_read_len - 1;
539    
540 gpertea 137
541 gpertea 29 /*
542     * The 0-based position of the last genomic sequence before the insertion
543     */
544     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
545    
546     string insertedSequence = splice_toks[1];
547     /*
548     * The 0-based position of the first genomic sequence after teh insertion
549     */
550     uint32_t right_splice_pos = left_splice_pos + 1;
551     if(left > left_splice_pos){
552     /*
553     * The genomic position of the left edge of the alignment needs to be corrected
554     * If the alignment does not extend into the insertion, simply subtract the length
555     * of the inserted sequence, otherwise, just set it equal to the right edge
556     */
557     left = left - insertedSequence.length();
558     if(left < right_splice_pos){
559     left = right_splice_pos;
560     }
561     }
562     if(right > left_splice_pos){
563     right = right - insertedSequence.length();
564     if(right < left_splice_pos){
565     right = left_splice_pos;
566     }
567     }
568     /*
569     * Now, right and left should be properly transformed into genomic coordinates
570     * We should be able to deduce how much the alignment matches the insertion
571     * simply based on the length of the read
572     */
573     int left_match_length = 0;
574     if(left <= left_splice_pos){
575     left_match_length = left_splice_pos - left + 1;
576     }
577     int right_match_length = 0;
578     if(right >= right_splice_pos){
579     right_match_length = right - right_splice_pos + 1;
580     }
581     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
582    
583     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
584     return false;
585    
586     string junction_strand = toks[num_extra_toks + strand_field];
587     if(junction_strand != "rev" && junction_strand != "fwd"){
588     fprintf(stderr, "Malformed insertion record\n");
589     return false;
590     }
591    
592     char* pch = strtok( mismatches, ",");
593     unsigned char num_mismatches = 0;
594    
595     /*
596     * remember that text_offset holds the left end of the
597     * alignment, relative to the start of the contig
598     */
599    
600     /*
601     * The 0-based relative position of the left-most character
602     * before the insertion in the contig
603     */
604     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
605     while (pch != NULL)
606     {
607     char* colon = strchr(pch, ':');
608     if (colon)
609     {
610     *colon = 0;
611     int mismatch_pos = atoi(pch);
612    
613     /*
614     * for reversely mapped reads,
615     * find the correct mismatched position.
616     */
617     if(orientation == '-'){
618     mismatch_pos = spliced_read_len - mismatch_pos - 1;
619     }
620    
621     /*
622     * Only count mismatches outside of the insertion region
623     * If there is a mismatch within the insertion,
624     * disallow this hit
625     */
626     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
627     num_mismatches++;
628     }else{
629     return false;
630     }
631     }
632     pch = strtok (NULL, ",");
633     }
634    
635    
636     vector<CigarOp> cigar;
637     cigar.push_back(CigarOp(MATCH, left_match_length));
638     cigar.push_back(CigarOp(INS, insertion_match_length));
639     cigar.push_back(CigarOp(MATCH, right_match_length));
640    
641     /*
642     * For now, disallow hits that don't span
643     * the insertion. If we allow these types of hits,
644     * then long_spanning.cpp needs to be updated
645     * in order to intelligently merge these kinds
646     * of reads back together
647     *
648     * Following code has been changed to allow segment that end
649     * in an insertion
650     */
651     bh = create_hit(name,
652     contig,
653 gpertea 154 "",
654 gpertea 29 left,
655     cigar,
656     orientation == '-',
657     junction_strand == "rev",
658     num_mismatches,
659     0,
660     end);
661     return true;
662     }
663    
664     else
665     {
666 gpertea 154 const string& junction_type = toks[num_extra_toks + junction_type_field];
667     string junction_strand = toks[num_extra_toks + strand_field];
668    
669 gpertea 29 int spliced_read_len = strlen(seq_str);
670 gpertea 154 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str());
671     int8_t left_splice_pos = atoi(splice_toks[0].c_str());
672     if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
673     {
674     left += text_offset;
675     left_splice_pos = left_splice_pos - left + 1;
676     }
677     else
678     {
679     left -= text_offset;
680     left_splice_pos = left - left_splice_pos + 1;
681     }
682    
683 gpertea 29 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
684     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
685 gpertea 154
686     int gap_len = 0;
687     if (junction_type == "fus")
688     gap_len = atoi(splice_toks[1].c_str());
689     else
690     gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
691    
692 gpertea 29 if (right_splice_pos <= 0 || left_splice_pos <= 0)
693     return false;
694    
695     if (orientation == '+')
696     {
697     if (left_splice_pos + seg_offset < _anchor_length){
698     return false;
699     }
700     }
701     else
702     {
703     if (right_splice_pos + seg_offset < _anchor_length)
704     return false;
705     }
706 gpertea 154
707     if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")||
708 gpertea 29 !(orientation == '-' || orientation == '+'))
709     {
710     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
711     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
712     // junction_strand.c_str(), orientation);
713     return false;
714     }
715 gpertea 154
716 gpertea 29 char* pch = strtok (mismatches,",");
717     int mismatches_in_anchor = 0;
718     unsigned char num_mismatches = 0;
719     while (pch != NULL)
720     {
721     char* colon = strchr(pch, ':');
722     if (colon)
723     {
724     *colon = 0;
725     num_mismatches++;
726     int mismatch_pos = atoi(pch);
727     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
728     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
729     mismatches_in_anchor++;
730     }
731     pch = strtok (NULL, ",");
732     }
733    
734     // FIXME: we probably should exclude these hits somewhere, but this
735     // isn't the right place
736     vector<CigarOp> cigar;
737 gpertea 154 if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
738     cigar.push_back(CigarOp(MATCH, left_splice_pos));
739     else
740     cigar.push_back(CigarOp(mATCH, left_splice_pos));
741    
742     if(junction_type == "del")
743 gpertea 29 cigar.push_back(CigarOp(DEL, gap_len));
744 gpertea 154 else if(junction_type == "fus")
745     {
746     if (junction_strand == "ff")
747     cigar.push_back(CigarOp(FUSION_FF, gap_len));
748     else if (junction_strand == "fr")
749     cigar.push_back(CigarOp(FUSION_FR, gap_len));
750     else if (junction_strand == "rf")
751     cigar.push_back(CigarOp(FUSION_RF, gap_len));
752     else
753     cigar.push_back(CigarOp(FUSION_RR, gap_len));
754     }
755     else
756 gpertea 29 cigar.push_back(CigarOp(REF_SKIP, gap_len));
757 gpertea 154
758     if (junction_type != "fus" || (junction_strand != "fr" && junction_strand != "rr"))
759     cigar.push_back(CigarOp(MATCH, right_splice_pos));
760     else
761     cigar.push_back(CigarOp(mATCH, right_splice_pos));
762    
763     string contig2 = "";
764     if (junction_type == "fus")
765     {
766     vector<string> contigs;
767     tokenize(contig, "-", contigs);
768     if (contigs.size() != 2)
769     return false;
770    
771     contig = contigs[0];
772     contig2 = contigs[1];
773    
774     if (junction_strand == "rf" || junction_strand == "rr")
775     orientation = (orientation == '+' ? '-' : '+');
776     }
777    
778 gpertea 29 bh = create_hit(name,
779     contig,
780 gpertea 154 contig2,
781     left,
782 gpertea 29 cigar,
783     orientation == '-',
784     junction_strand == "rev",
785     num_mismatches,
786     mismatches_in_anchor,
787     end);
788     return true;
789     }
790     }
791     else
792     {
793     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
794     //fprintf(stderr, "%s\n", orig_bwt_buf);
795     // continue;
796     return false;
797     }
798    
799     return false;
800     }
801    
802 gpertea 70
803 gpertea 137 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
804 gpertea 70 bool &spliced_alignment) {
805     const char* p_cig = cigar_str;
806 gpertea 137 int refspan=0; //alignment span on reference sequence
807 gpertea 154
808 gpertea 70 while (*p_cig)
809     {
810     char* t;
811 gpertea 137 int op_len = (int)strtol(p_cig, &t, 10);
812     if (op_len <= 0)
813 gpertea 70 {
814 gpertea 137 fprintf (stderr, "Error: CIGAR op has zero length\n");
815     return 0;
816 gpertea 70 }
817     char op_char = toupper(*t);
818     CigarOpCode opcode;
819 gpertea 137 switch (op_char) {
820     case '=':
821     case 'X':
822     case 'M': opcode = MATCH;
823     refspan+=op_len;
824     break;
825     case 'I': opcode = INS;
826     break;
827     case 'D': opcode = DEL;
828     refspan+=op_len;
829     break;
830     case 'N': if (op_len > max_report_intron_length)
831     return 0;
832     opcode = REF_SKIP;
833     spliced_alignment = true;
834     refspan+=op_len;
835     break;
836     case 'S': opcode = SOFT_CLIP;
837     break;
838     case 'H': opcode = HARD_CLIP;
839     break;
840     case 'P': opcode = PAD;
841     break;
842     default: fprintf (stderr, "Error: invalid CIGAR operation\n");
843     return 0;
844     }
845 gpertea 70 p_cig = t + 1;
846 gpertea 137 cigar.push_back(CigarOp(opcode, op_len));
847     } //while cigar codes
848 gpertea 70 if (*p_cig) {
849 gpertea 137 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
850     return 0;
851 gpertea 70 }
852 gpertea 137 return refspan;
853 gpertea 70 }
854    
855 gpertea 154 int getBAMmismatches(const bam1_t* buf, vector<CigarOp>& cigar,
856     vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
857     int gspan=0;//genomic span of the alignment
858     sam_nm=0;
859     int num_mismatches=0;
860    
861     uint8_t* ptr = bam_aux_get(buf, "XS");
862     if (ptr) {
863     char src_strand_char = bam_aux2A(ptr);
864     if (src_strand_char == '-')
865     antisense_splice = true;
866     }
867    
868     ptr = bam_aux_get(buf, "MD");
869     if (ptr) {
870     const char* p = bam_aux2Z(ptr);
871     int bi=0; //base offset position in the read
872     while (*p != 0) {
873     if (isdigit(*p)) {
874     int v=atoi(p);
875     do { p++; } while (isdigit(*p));
876     bi+=v;
877     }
878     while (isalpha(*p)) {
879     p++;
880     num_mismatches++;
881     //mismatches.push_back(bi);
882     mismatches[bi]=true;
883     bi++;
884     }
885     if (*p=='^') { //reference deletion
886     p++;
887     while (isalpha(*p)) { //insert read bases
888     p++; bi++;
889     }
890     }
891     }
892     }
893    
894     /* By convention,the NM field of the SAM record
895     * counts an insertion or deletion. I dont' think
896     * we want the mismatch count in the BowtieHit
897     * record to reflect this. Therefore, subtract out
898     * the mismatches due to in/dels
899     */
900     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
901     switch (itr->opcode)
902     {
903     case MATCH:
904     case REF_SKIP:
905     case PAD:
906     gspan += itr->length;
907     break;
908     case DEL:
909     gspan += itr->length;
910     sam_nm -= itr->length;
911     break;
912     case INS:
913     sam_nm -= itr->length;
914     break;
915     default:
916     break;
917     }
918     }
919     return num_mismatches;
920     }
921    
922 gpertea 137 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
923 gpertea 139 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
924 gpertea 137 int gspan=0;//genomic span of the alignment
925     const char* tag_buf = buf;
926 gpertea 70 sam_nm=0;
927 gpertea 139 int num_mismatches=0;
928 gpertea 70 while((tag_buf = get_token((char**)&buf,"\t")))
929     {
930     vector<string> tuple_fields;
931     tokenize(tag_buf,":", tuple_fields);
932     if (tuple_fields.size() == 3)
933     {
934     if (tuple_fields[0] == "XS")
935     {
936     if (tuple_fields[2] == "-")
937     antisense_splice = true;
938     }
939     else if (tuple_fields[0] == "NM")
940     {
941     sam_nm = atoi(tuple_fields[2].c_str());
942     }
943     else if (tuple_fields[0] == "NS")
944     {
945     //ignored for now
946     }
947     else if (tuple_fields[0] == "MD")
948     {
949     const char* p=tuple_fields[2].c_str();
950     int bi=0; //base offset position in the read
951     while (*p != 0) {
952     if (isdigit(*p)) {
953     int v=atoi(p);
954     do { p++; } while (isdigit(*p));
955     bi+=v;
956     }
957 gpertea 137 while (isalpha(*p)) {
958 gpertea 70 p++;
959 gpertea 139 num_mismatches++;
960     //mismatches.push_back(bi);
961     mismatches[bi]=true;
962 gpertea 70 bi++;
963     }
964     if (*p=='^') { //reference deletion
965     p++;
966 gpertea 137 while (isalpha(*p)) { //insert read bases
967 gpertea 70 p++; bi++;
968 gpertea 137 }
969 gpertea 70 }
970     }
971     }
972 gpertea 139 //else
973     //{
974 gpertea 70 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
975     //return false;
976 gpertea 139 //}
977 gpertea 70 }
978     }
979     /* By convention,the NM field of the SAM record
980     * counts an insertion or deletion. I dont' think
981     * we want the mismatch count in the BowtieHit
982     * record to reflect this. Therefore, subtract out
983     * the mismatches due to in/dels
984     */
985     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
986 gpertea 137 switch (itr->opcode)
987     {
988     case MATCH:
989     case REF_SKIP:
990     case PAD:
991     gspan += itr->length;
992     break;
993     case DEL:
994     gspan += itr->length;
995     sam_nm -= itr->length;
996     break;
997     case INS:
998     sam_nm -= itr->length;
999     break;
1000 gpertea 154 default:
1001     break;
1002 gpertea 70 }
1003 gpertea 137 }
1004 gpertea 139 return num_mismatches;
1005 gpertea 70 }
1006    
1007 gpertea 29 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1008     BowtieHit& bh,
1009     bool strip_slash,
1010     char* name_out,
1011     char* name_tags,
1012     char* seq,
1013     char* qual)
1014     {
1015     if (!orig_bwt_buf || !*orig_bwt_buf)
1016     return false;
1017     char bwt_buf[2048];
1018     strcpy(bwt_buf, orig_bwt_buf);
1019     // Are we still in the header region?
1020     if (bwt_buf[0] == '@')
1021     return false;
1022 gpertea 154
1023 gpertea 29 char* buf = bwt_buf;
1024     char* name = get_token((char**)&buf,"\t");
1025     char* sam_flag_str = get_token((char**)&buf,"\t");
1026     char* text_name = get_token((char**)&buf,"\t");
1027     char* text_offset_str = get_token((char**)&buf,"\t");
1028     const char* map_qual_str = get_token((char**)&buf,"\t");
1029     char* cigar_str = get_token((char**)&buf,"\t");
1030     const char* mate_ref_str = get_token((char**)&buf,"\t");
1031     const char* mate_pos_str = get_token((char**)&buf,"\t");
1032     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1033    
1034     const char* seq_str = get_token((char**)&buf,"\t");
1035     if (seq)
1036     strcpy(seq, seq_str);
1037     const char* qual_str = get_token((char**)&buf,"\t");
1038     if (qual)
1039     strcpy(qual, qual_str);
1040    
1041     if (!name ||
1042     !sam_flag_str ||
1043     !text_name ||
1044     !text_offset_str ||
1045     !map_qual_str ||
1046     !cigar_str ||
1047     !mate_ref_str ||
1048     !mate_pos_str ||
1049     !inferred_insert_sz_str ||
1050     !seq_str ||
1051     !qual_str)
1052     {
1053     // truncated or malformed SAM record
1054     return false;
1055 gpertea 154 }
1056    
1057 gpertea 29 int sam_flag = atoi(sam_flag_str);
1058 gpertea 154 string ref_name = text_name, ref_name2 = "";
1059 gpertea 29 int text_offset = atoi(text_offset_str);
1060    
1061     bool end = true;
1062     unsigned int seg_offset = 0;
1063     unsigned int seg_num = 0;
1064     unsigned int num_segs = 0;
1065    
1066     // Copy the tag out of the name field before we might wipe it out
1067 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1068 gpertea 154
1069 gpertea 29 vector<CigarOp> cigar;
1070     bool spliced_alignment = false;
1071 gpertea 154
1072 gpertea 137 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
1073     if (refspan==0)
1074 gpertea 70 return false;
1075 gpertea 29 //vector<string> attributes;
1076     //tokenize(tag_buf, " \t",attributes);
1077 gpertea 70
1078 gpertea 29 bool antisense_splice = false;
1079 gpertea 70 int sam_nm = 0; //the value of the NM tag (edit distance)
1080 gpertea 137 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
1081 gpertea 139 vector<bool> mismatches;
1082     mismatches.resize(strlen(seq_str), false);
1083     int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
1084 gpertea 154
1085 gpertea 29 if (spliced_alignment)
1086     {
1087     bh = create_hit(name,
1088 gpertea 154 ref_name,
1089     ref_name2,
1090 gpertea 29 text_offset - 1,
1091     cigar,
1092     sam_flag & 0x0010,
1093     antisense_splice,
1094 gpertea 139 num_mismatches,
1095 gpertea 70 0,
1096 gpertea 29 end);
1097     }
1098     else
1099     {
1100     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1101     bh = create_hit(name,
1102 gpertea 154 ref_name,
1103     ref_name2,
1104 gpertea 29 text_offset - 1, // SAM files are 1-indexed
1105     cigar,
1106     sam_flag & 0x0010,
1107     false,
1108 gpertea 139 num_mismatches,
1109 gpertea 29 0,
1110     end);
1111     }
1112 gpertea 70 return true;
1113 gpertea 29 }
1114    
1115 gpertea 137 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
1116     if (op.length<=0) return;
1117     if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
1118     cigar.back().length+=op.length;
1119     }
1120     cigar.push_back(op);
1121     }
1122 gpertea 70
1123 gpertea 139 int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches,
1124 gpertea 137 int &left, int spl_start, int spl_len, CigarOpCode spl_code) {
1125 gpertea 70 //merge the original 'cigar' with the new insert/gap operation
1126 gpertea 137 //at position spl_start and place the result into splcigar;
1127 gpertea 139 //TODO: ideally this should also get and rebuild the MD string (alignment mismatches)
1128     int spl_mismatches=0;
1129     //return value: mismatches in the insert region for INS case,
1130     //or number of mismatches in the anchor region
1131     //return -1 if somehow the hit seems bad
1132    
1133 gpertea 137 //these offsets are relative to the beginning of alignment on reference
1134 gpertea 154 int spl_ofs=abs(spl_start-left); //relative position of splice op
1135 gpertea 137 int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
1136 gpertea 154 CigarOp gapop(spl_code, spl_len); //for DEL, REF_SKIP, FUSIONS
1137 gpertea 137 if (spl_code==INS)
1138     spl_ofs_end += spl_len;
1139     int ref_ofs=0; //working offset on reference
1140     int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
1141     bool xfound=false;
1142     //if (left<=spl_start+spl_len) {
1143     if (spl_ofs_end>0) {
1144     int prev_opcode=0;
1145     int prev_oplen=0;
1146     for (size_t c = 0 ; c < cigar.size(); ++c)
1147     {
1148     int prev_read_ofs=read_ofs;
1149     int cur_op_ofs=ref_ofs;
1150     int cur_opcode=cigar[c].opcode;
1151     int cur_oplen=cigar[c].length;
1152 gpertea 154
1153 gpertea 137 switch (cur_opcode) {
1154     case MATCH:
1155     ref_ofs+=cur_oplen;
1156     read_ofs+=cur_oplen;
1157 gpertea 154 if (spl_code==REF_SKIP || spl_code==DEL ||
1158     spl_code==FUSION_FF || spl_code==FUSION_FR ||
1159     spl_code==FUSION_RF || spl_code==FUSION_RR) {
1160     for (int o=cur_op_ofs;o<ref_ofs;o++) {
1161     int rofs=prev_read_ofs+(o-cur_op_ofs);
1162     if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs])
1163     spl_mismatches++;
1164     }
1165     }
1166     else if (spl_code==INS) {
1167     for (int o=cur_op_ofs;o<ref_ofs;o++) {
1168     int rofs=prev_read_ofs+(o-cur_op_ofs);
1169     if (o>=spl_ofs && o<spl_ofs_end && mismatches[rofs])
1170     spl_mismatches++;
1171     }
1172     }
1173 gpertea 137 break;
1174     case DEL:
1175     case REF_SKIP:
1176     case PAD:
1177     ref_ofs+=cur_oplen;
1178     break;
1179     case SOFT_CLIP:
1180     case INS:
1181     read_ofs+=cur_oplen;
1182     break;
1183     //case HARD_CLIP:
1184     }
1185 gpertea 139
1186 gpertea 137 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1187 gpertea 139 if (cur_op_ofs==spl_ofs_end && spl_code!=INS)
1188 gpertea 137 {
1189 gpertea 139 xfound=true;
1190 gpertea 137 //we have to insert the gap here first
1191 gpertea 139 cigar_add(splcigar, gapop);
1192     //also, check
1193 gpertea 137 }
1194     cigar_add(splcigar, cigar[c]);
1195     }
1196     else //if (ref_ofs>spl_ofs) {
1197     { //op intersection
1198     xfound=true;
1199     if (spl_code==INS) {
1200     //we have to shorten cur_opcode
1201     // find the overlap between current range
1202     int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1203     int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1204 gpertea 154
1205     CigarOp op(cigar[c]);
1206     op.length=spl_ofs-cur_op_ofs;
1207     if (spl_ofs>cur_op_ofs)
1208     cigar_add(splcigar, op);
1209     if (spl_ofs<0)
1210     {
1211     CigarOp temp = gapop;
1212     temp.length += spl_ofs;
1213     if (temp.length>0)
1214     cigar_add(splcigar, temp);
1215     }
1216     else
1217     cigar_add(splcigar, gapop);
1218     op.length=ref_ofs-spl_ofs_end;
1219     if (ref_ofs>spl_ofs_end)
1220     cigar_add(splcigar,op);
1221 gpertea 137 }
1222 gpertea 154 else {//DEL or REF_SKIP or FUSION_[FR][FR]
1223 gpertea 137 //spl_ofs == spl_ofs_end
1224     //we have to split cur_opcode
1225 gpertea 139 //look for mismatches within min_anchor_len distance from splice point
1226 gpertea 137 CigarOp op(cigar[c]);
1227 gpertea 154 CigarOpCode opcode = op.opcode;
1228 gpertea 137 op.length=spl_ofs-cur_op_ofs;
1229 gpertea 154 if (gapop.opcode == FUSION_RF || gapop.opcode == FUSION_RR)
1230     {
1231     if (opcode == MATCH)
1232     op.opcode = mATCH;
1233     else if (opcode == INS)
1234     op.opcode = iNS;
1235     else if (opcode == DEL)
1236     op.opcode = dEL;
1237     else if (opcode == REF_SKIP)
1238     op.opcode = rEF_SKIP;
1239     }
1240 gpertea 137 cigar_add(splcigar, op);
1241 gpertea 154
1242     cigar_add(splcigar, gapop);
1243    
1244     op.opcode = opcode;
1245     if (gapop.opcode == FUSION_FR || gapop.opcode == FUSION_RR)
1246     {
1247     if (opcode == MATCH)
1248     op.opcode = mATCH;
1249     else if (opcode == INS)
1250     op.opcode = iNS;
1251     else if (opcode == DEL)
1252     op.opcode = dEL;
1253     else if (opcode == REF_SKIP)
1254     op.opcode = rEF_SKIP;
1255     }
1256 gpertea 137 op.length=ref_ofs-spl_ofs;
1257     cigar_add(splcigar,op);
1258     }
1259     } //op intersection
1260     prev_opcode=cur_opcode;
1261     prev_oplen=cur_oplen;
1262     } //for each cigar opcode
1263     } //intersection possible
1264 gpertea 139
1265     //if (!xfound) {//no intersection found between splice event and alignment
1266 gpertea 137 if (spl_ofs_end<=0) {
1267     //alignment starts after the splice event
1268     if (spl_code==INS) left-=spl_len;
1269     else left+=spl_len;
1270 gpertea 154
1271     splcigar = cigar;
1272 gpertea 137 }
1273     //else {
1274     //alignment ends before the splice event
1275     //nothing to do
1276     // }
1277 gpertea 139 //return spl_mismatches;
1278     // }
1279 gpertea 97
1280 gpertea 139 return spl_mismatches;
1281 gpertea 70 }
1282    
1283 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1284     BowtieHit& bh,
1285     bool strip_slash,
1286     char* name_out,
1287     char* name_tags,
1288     char* seq,
1289     char* qual)
1290     {
1291     if (!orig_bwt_buf || !*orig_bwt_buf)
1292     return false;
1293 gpertea 29
1294 gpertea 69 char bwt_buf[2048];
1295     strcpy(bwt_buf, orig_bwt_buf);
1296     // Are we still in the header region?
1297     if (bwt_buf[0] == '@')
1298     return false;
1299 gpertea 29
1300 gpertea 69 char* buf = bwt_buf;
1301     char* name = get_token((char**)&buf,"\t");
1302     char* sam_flag_str = get_token((char**)&buf,"\t");
1303     char* text_name = get_token((char**)&buf,"\t");
1304     char* text_offset_str = get_token((char**)&buf,"\t");
1305     const char* map_qual_str = get_token((char**)&buf,"\t");
1306     char* cigar_str = get_token((char**)&buf,"\t");
1307     const char* mate_ref_str = get_token((char**)&buf,"\t");
1308     const char* mate_pos_str = get_token((char**)&buf,"\t");
1309     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1310 gpertea 137 //int num_mismatches=0;
1311     //int mismatches[1024]; //list of 0-based mismatch positions in this read
1312 gpertea 70 //parsed from SAM's MD:Z: tag
1313 gpertea 69 const char* seq_str = get_token((char**)&buf,"\t");
1314     if (seq)
1315     strcpy(seq, seq_str);
1316    
1317     const char* qual_str = get_token((char**)&buf,"\t");
1318     if (qual)
1319     strcpy(qual, qual_str);
1320    
1321     if (!name ||
1322     !sam_flag_str ||
1323     !text_name ||
1324     !text_offset_str ||
1325     !map_qual_str ||
1326     !cigar_str ||
1327     !mate_ref_str ||
1328     !mate_pos_str ||
1329     !inferred_insert_sz_str ||
1330     !seq_str ||
1331     !qual_str)
1332     {
1333     // truncated or malformed SAM record
1334     return false;
1335     }
1336    
1337     int sam_flag = atoi(sam_flag_str);
1338     int text_offset = atoi(text_offset_str);
1339 gpertea 137 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1340 gpertea 69 bool end = true;
1341     unsigned int seg_offset = 0;
1342     unsigned int seg_num = 0;
1343     unsigned int num_segs = 0;
1344    
1345     // Copy the tag out of the name field before we might wipe it out
1346 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1347 gpertea 69
1348 gpertea 70 vector<CigarOp> samcigar;
1349     bool spliced_alignment = false;
1350 gpertea 137 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1351 gpertea 154
1352 gpertea 137 if (refspan==0)
1353     return false;
1354 gpertea 70 bool antisense_splice = false;
1355     int sam_nm = 0;
1356 gpertea 139 vector<bool> mismatches;
1357     mismatches.resize(strlen(seq_str), false);
1358 gpertea 69
1359 gpertea 139 int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1360    
1361 gpertea 70 //##############################################
1362 gpertea 69
1363     // Add this alignment to the table of hits for this half of the
1364     // Bowtie map
1365    
1366     // Parse the text_name field to recover the splice coords
1367     vector<string> toks;
1368     tokenize_strict(text_name, "|", toks);
1369    
1370     int num_extra_toks = (int)toks.size() - 6;
1371    
1372     if (num_extra_toks >= 0)
1373     {
1374     static const uint8_t left_window_edge_field = 1;
1375     static const uint8_t splice_field = 2;
1376     //static const uint8_t right_window_edge_field = 3;
1377     static const uint8_t junction_type_field = 4;
1378     static const uint8_t strand_field = 5;
1379    
1380     string contig = toks[0];
1381     for (int t = 1; t <= num_extra_toks; ++t)
1382     {
1383     contig += "|";
1384     contig += toks[t];
1385     }
1386    
1387     vector<string> splice_toks;
1388     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1389     if (splice_toks.size() != 2)
1390     {
1391     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1392 gpertea 70 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1393 gpertea 69 // toks[num_extra_toks + splice_field].c_str());
1394     return false;
1395     }
1396 gpertea 137
1397     string junction_strand = toks[num_extra_toks + strand_field];
1398     if(junction_strand != "rev" && junction_strand != "fwd"){
1399     fprintf(stderr, "Malformed insertion record\n");
1400     return false;
1401     }
1402    
1403 gpertea 154 //
1404     // check for an insertion hit
1405     //
1406     if(toks[num_extra_toks + junction_type_field] == "ins")
1407     {
1408     //int8_t spliced_read_len = strlen(seq_str);
1409     //TODO FIXME: use the CIGAR instead of seq length!
1410     // The 0-based position of the left edge of the alignment. Note that this
1411     // value may need to be further corrected to account for the presence of
1412     // of the insertion.
1413     int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1414    
1415     // The 0-based position of the last genomic sequence before the insertion
1416     int left_splice_pos = atoi(splice_toks[0].c_str());
1417    
1418     string insertedSequence = splice_toks[1];
1419     // The 0-based position of the first genomic sequence after the insertion
1420 gpertea 69
1421 gpertea 154 vector<CigarOp> splcigar;
1422     //this also updates left to the adjusted genomic coordinates
1423     int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
1424     left, left_splice_pos+1, insertedSequence.length(), INS);
1425 gpertea 69
1426 gpertea 154 if (spl_num_mismatches<0) return false;
1427     num_mismatches-=spl_num_mismatches;
1428     /*
1429     uint32_t right_splice_pos = left_splice_pos + 1;
1430    
1431     //uint32_t right = left + spliced_read_len - 1;
1432     int right = left + refspan - 1;
1433    
1434     if(left > left_splice_pos){
1435     //The genomic position of the left edge of the alignment needs to be corrected
1436     //If the alignment does not extend into the insertion, simply subtract the length
1437     //of the inserted sequence, otherwise, just set it equal to the right edge
1438     left = left - insertedSequence.length();
1439     if(left < right_splice_pos){
1440     left = right_splice_pos;
1441     }
1442     }
1443     if(right > left_splice_pos){
1444     right = right - insertedSequence.length();
1445     if(right < left_splice_pos){
1446     right = left_splice_pos;
1447     }
1448     }
1449     // Now, right and left should be properly transformed into genomic coordinates
1450     // We should be able to deduce how much the alignment matches the insertion
1451     // simply based on the length of the read
1452     int left_match_length = 0;
1453     if(left <= left_splice_pos){
1454     left_match_length = left_splice_pos - left + 1;
1455     }
1456     int right_match_length = 0;
1457     if(right >= right_splice_pos){
1458     right_match_length = right - right_splice_pos + 1;
1459     }
1460     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1461    
1462     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1463     return false;
1464    
1465     //char* pch = strtok( mismatches, ",");
1466     //unsigned char num_mismatches = 0;
1467     //text_offset holds the left end of the alignment,
1468     //RELATIVE TO the start of the contig
1469    
1470     //The 0-based relative position of the left-most character
1471     //before the insertion in the contig
1472     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1473     for (size_t i=0;i<mismatches.size();++i) {
1474     int mismatch_pos = mismatches[i];
1475     // for reversely mapped reads,
1476     //find the correct mismatched position.
1477     if (sam_flag & 0x0010){
1478     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1479     }
1480    
1481     //Only count mismatches outside of the insertion region
1482     // If there is a mismatch within the insertion,
1483     // disallow this hit
1484     if(mismatch_pos + text_offset <= relative_splice_pos ||
1485     mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1486     spl_num_mismatches++;
1487     }else{
1488     return false;
1489     }
1490     }
1491     */
1492     //vector<CigarOp> splcigar;
1493     //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1494     //splcigar.push_back(CigarOp(MATCH, left_match_length));
1495     //splcigar.push_back(CigarOp(INS, insertion_match_length));
1496     //splcigar.push_back(CigarOp(MATCH, right_match_length));
1497    
1498     bh = create_hit(name,
1499     contig,
1500     "",
1501     left,
1502     //splcigar,
1503     splcigar,
1504     sam_flag & 0x0010,
1505     junction_strand == "rev",
1506     num_mismatches,
1507     0,
1508     end);
1509 gpertea 70
1510 gpertea 154 return true;
1511     } //"ins"
1512     else //"del" or intron
1513     {
1514     // The 0-based position of the left edge of the alignment.
1515     int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1516    
1517     // The 0-based position of the last genomic sequence before the deletion
1518     int left_splice_pos = atoi(splice_toks[0].c_str());
1519    
1520 gpertea 139 int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
1521     /*
1522 gpertea 154 if ((sam_flag & 0x0010) == 0) //######
1523     {
1524     if (left_splice_ofs + seg_offset < _anchor_length)
1525     return false;
1526 gpertea 135 }
1527 gpertea 154 else
1528 gpertea 135 {
1529 gpertea 154 if (right_splice_ofs + seg_offset < _anchor_length)
1530     return false;
1531 gpertea 135 }
1532 gpertea 154 */
1533     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1534     //atoi(toks[right_window_edge_field].c_str());
1535    
1536 gpertea 139 /*
1537     //offset of deletion point, relative to the beginning of the alignment
1538     int left_splice_ofs = left_splice_pos-left+1;
1539 gpertea 154
1540     int mismatches_in_anchor = 0;
1541     for (size_t i=0;i<mismatches.size();++i) {
1542     spl_num_mismatches++;
1543     int mismatch_pos = mismatches[i];
1544     if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) ||
1545     ((sam_flag & 0x0010) != 0 &&
1546     abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len)
1547     mismatches_in_anchor++;
1548     }
1549     */
1550     vector<CigarOp> splcigar;
1551 gpertea 139
1552 gpertea 154 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP;
1553    
1554     int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left,
1555     left_splice_pos+1, gap_len, opcode);
1556 gpertea 140
1557 gpertea 154 if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
1558     return false;
1559 gpertea 139 /*
1560 gpertea 154 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1561     if(toks[num_extra_toks + junction_type_field] == "del"){
1562     splcigar.push_back(CigarOp(DEL, gap_len));
1563     }else{
1564     splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1565     }
1566     splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1567 gpertea 139 */
1568 gpertea 154 bh = create_hit(name,
1569     contig,
1570     "",
1571     left,
1572     splcigar,
1573     (sam_flag & 0x0010),
1574     junction_strand == "rev",
1575     num_mismatches,
1576     spl_num_mismatches,
1577     end);
1578    
1579     return true;
1580     }
1581 gpertea 70 } //parse splice data
1582 gpertea 69 else
1583 gpertea 154 {
1584     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1585     //fprintf(stderr, "%s\n", orig_bwt_buf);
1586     // continue;
1587     return false;
1588     }
1589    
1590 gpertea 69 return false;
1591     }
1592    
1593    
1594 gpertea 135
1595 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1596     BowtieHit& bh, bool strip_slash,
1597     char* name_out, char* name_tags,
1598     char* seq, char* qual) {
1599 gpertea 154 if (_sam_header==NULL)
1600     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1601     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1602    
1603     uint32_t sam_flag = hit_buf->core.flag;
1604    
1605     int text_offset = hit_buf->core.pos;
1606     int text_mate_pos = hit_buf->core.mpos;
1607     int target_id = hit_buf->core.tid;
1608     int mate_target_id = hit_buf->core.mtid;
1609 gpertea 29
1610 gpertea 154 vector<CigarOp> cigar;
1611     bool spliced_alignment = false;
1612     int num_hits = 1;
1613    
1614     double mapQ = hit_buf->core.qual;
1615     long double error_prob;
1616     if (mapQ > 0)
1617     {
1618     long double p = (-1.0 * mapQ) / 10.0;
1619     error_prob = pow(10.0L, p);
1620     }
1621     else
1622     {
1623     error_prob = 1.0;
1624     }
1625    
1626     bool end = true;
1627     unsigned int seg_offset = 0;
1628     unsigned int seg_num = 0;
1629     unsigned int num_segs = 0;
1630     // Copy the tag out of the name field before we might wipe it out
1631     char* qname = bam1_qname(hit_buf);
1632     char* pipe = strrchr(qname, '|');
1633     if (pipe)
1634     {
1635     if (name_tags)
1636     strcpy(name_tags, pipe);
1637    
1638     char* tag_buf = pipe + 1;
1639     if (strchr(tag_buf, ':'))
1640 gpertea 29 {
1641 gpertea 154 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1642     if (seg_num + 1 == num_segs)
1643     end = true;
1644     else
1645     end = false;
1646 gpertea 29 }
1647 gpertea 154
1648     *pipe = 0;
1649     }
1650 gpertea 29
1651 gpertea 154 if (target_id < 0) {
1652     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1653     bh = create_hit(qname,
1654     "*", //ref_name
1655     0, //left coord
1656     0, //read_len
1657     false, //antisense_aln
1658     0, //edit_dist
1659     end);
1660     return true;
1661     }
1662 gpertea 29
1663 gpertea 154 if (seq!=NULL) {
1664     char *bseq = (char*)bam1_seq(hit_buf);
1665     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1666     char v = bam1_seqi(bseq,i);
1667     seq[i]=bam_nt16_rev_table[v];
1668 gpertea 29 }
1669 gpertea 154 seq[hit_buf->core.l_qseq]=0;
1670     }
1671     if (qual!=NULL) {
1672     char *bq = (char*)bam1_qual(hit_buf);
1673     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1674     qual[i]=bq[i]+33;
1675     }
1676     qual[hit_buf->core.l_qseq]=0;
1677     }
1678 gpertea 29
1679 gpertea 154 bool antisense_splice=false;
1680     unsigned char num_mismatches = 0;
1681     unsigned char num_splice_anchor_mismatches = 0;
1682    
1683     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1684     if (ptr) {
1685     char src_strand_char = bam_aux2A(ptr);
1686     if (src_strand_char == '-')
1687     antisense_splice = true;
1688     }
1689    
1690     ptr = bam_aux_get(hit_buf, "NM");
1691     if (ptr) {
1692     num_mismatches = bam_aux2i(ptr);
1693     }
1694    
1695     ptr = bam_aux_get(hit_buf, "NH");
1696     if (ptr) {
1697     num_hits = bam_aux2i(ptr);
1698     }
1699 gpertea 29
1700 gpertea 154 string text_name = _sam_header->target_name[target_id];
1701     string text_name2 = "";
1702    
1703     bool fusion_alignment = false;
1704     string fusion_cigar_str;
1705     ptr = bam_aux_get(hit_buf, "XF");
1706     if (ptr) {
1707     fusion_alignment = true;
1708     char* xf = bam_aux2Z(ptr);
1709    
1710     // ignore the second part of a fusion alignment
1711     if (xf[0] == '2')
1712     return false;
1713    
1714     vector<string> fields;
1715     tokenize(xf, " ", fields);
1716    
1717     vector<string> contigs;
1718     tokenize(fields[1], "-", contigs);
1719     if (contigs.size() >= 2)
1720     {
1721     text_name = contigs[0];
1722     text_name2 = contigs[1];
1723     }
1724    
1725     text_offset = atoi(fields[2].c_str()) - 1;
1726     fusion_cigar_str = fields[3].c_str();
1727    
1728     if (seq)
1729     strcpy(seq, fields[4].c_str());
1730     if (qual)
1731     strcpy(qual, fields[5].c_str());
1732     }
1733    
1734     if (fusion_alignment) {
1735     const char* p_cig = fusion_cigar_str.c_str();
1736     while (*p_cig) {
1737     char* t;
1738     int length = (int)strtol(p_cig, &t, 10);
1739     if (length <= 0)
1740     {
1741     //fprintf (stderr, "CIGAR op has zero length\n");
1742     return false;
1743 gpertea 29 }
1744 gpertea 154 char op_char = *t;
1745     CigarOpCode opcode;
1746     if (op_char == 'M') opcode = MATCH;
1747     else if(op_char == 'm') opcode = mATCH;
1748     else if (op_char == 'I') opcode = INS;
1749     else if (op_char == 'i') opcode = iNS;
1750     else if (op_char == 'D') opcode = DEL;
1751     else if (op_char == 'd') opcode = dEL;
1752     else if (op_char == 'N' || op_char == 'n')
1753 gpertea 29 {
1754 gpertea 154 if (length > max_report_intron_length)
1755     return false;
1756    
1757     if (op_char == 'N')
1758     opcode = REF_SKIP;
1759     else
1760     opcode = rEF_SKIP;
1761     spliced_alignment = true;
1762     }
1763     else if (op_char == 'F')
1764     {
1765     opcode = FUSION_FF;
1766     length = length - 1;
1767     }
1768     else if (op_char == 'S') opcode = SOFT_CLIP;
1769     else if (op_char == 'H') opcode = HARD_CLIP;
1770     else if (op_char == 'P') opcode = PAD;
1771     else
1772     {
1773     fprintf (stderr, "(%d-%d) invalid CIGAR operation\n", length, (int)op_char);
1774     return false;
1775     }
1776     p_cig = t + 1;
1777     cigar.push_back(CigarOp(opcode, length));
1778 gpertea 29
1779 gpertea 154 if(opcode == INS) {
1780     num_mismatches -= length;
1781     }
1782     else if(opcode == DEL) {
1783     num_mismatches -= length;
1784     }
1785    
1786     /*
1787     * update fusion direction.
1788     */
1789     size_t cigar_size = cigar.size();
1790     if (cigar_size >= 3 && cigar[cigar_size - 2].opcode == FUSION_FF)
1791     {
1792     CigarOpCode prev = cigar[cigar_size - 3].opcode;
1793     CigarOpCode next = cigar[cigar_size - 1].opcode;
1794    
1795     bool increase1 = false, increase2 = false;
1796     if (prev == MATCH || prev == DEL || prev == INS || prev == REF_SKIP)
1797     increase1 = true;
1798     if (next == MATCH || next == DEL || next == INS || next == REF_SKIP)
1799     increase2 = true;
1800    
1801     if (increase1 && !increase2)
1802     cigar[cigar_size - 2].opcode = FUSION_FR;
1803     else if (!increase1 && increase2)
1804     cigar[cigar_size - 2].opcode = FUSION_RF;
1805     else if (!increase1 && !increase2)
1806     cigar[cigar_size - 2].opcode = FUSION_RR;
1807 gpertea 29 }
1808 gpertea 154 }
1809     }
1810     else {
1811     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1812     {
1813     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1814     if (length <= 0)
1815     {
1816     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1817     return false;
1818     }
1819 gpertea 29
1820 gpertea 154 CigarOpCode opcode;
1821     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1822     {
1823     case BAM_CMATCH: opcode = MATCH; break;
1824     case BAM_CINS: opcode = INS; break;
1825     case BAM_CDEL: opcode = DEL; break;
1826     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1827     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1828     case BAM_CPAD: opcode = PAD; break;
1829     case BAM_CREF_SKIP:
1830     opcode = REF_SKIP;
1831     spliced_alignment = true;
1832     if (length > (int)max_report_intron_length)
1833     {
1834     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1835     return false;
1836     }
1837     break;
1838     default:
1839     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1840     return false;
1841     }
1842    
1843     if (opcode != HARD_CLIP)
1844     cigar.push_back(CigarOp(opcode, length));
1845    
1846     /*
1847     * By convention,the NM field of the SAM record
1848     * counts an insertion or deletion. I dont' think
1849     * we want the mismatch count in the BowtieHit
1850     * record to reflect this. Therefore, subtract out
1851     * the mismatches due to in/dels
1852     */
1853     if(opcode == INS){
1854     num_mismatches -= length;
1855     }
1856     else if(opcode == DEL){
1857     num_mismatches -= length;
1858     }
1859     }
1860     }
1861 gpertea 29
1862 gpertea 154
1863     string mrnm;
1864     if (mate_target_id >= 0) {
1865     if (mate_target_id == target_id) {
1866     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1867     mrnm = _sam_header->target_name[mate_target_id];
1868     }
1869     else {
1870     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1871     return false;
1872     }
1873     }
1874     else {
1875     text_mate_pos = 0;
1876     }
1877 gpertea 29
1878 gpertea 154 if (spliced_alignment) {
1879     bh = create_hit(qname,
1880     text_name,
1881     text_name2,
1882     text_offset, // BAM files are 0-indexed
1883     cigar,
1884     sam_flag & 0x0010,
1885     antisense_splice,
1886     num_mismatches,
1887     num_splice_anchor_mismatches,
1888     end);
1889 gpertea 29
1890 gpertea 154 }
1891     else {
1892     bh = create_hit(qname,
1893     text_name,
1894     text_name2,
1895     text_offset, // BAM files are 0-indexed
1896     cigar,
1897     sam_flag & 0x0010,
1898     false,
1899     num_mismatches,
1900     0,
1901     end);
1902     }
1903    
1904     return true;
1905 gpertea 29 }
1906    
1907     bool BAMHitFactory::inspect_header(HitStream& hs)
1908     {
1909     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1910    
1911     if (header == NULL) {
1912     fprintf(stderr, "Warning: No BAM header\n");
1913     return false;
1914     }
1915     if (header->l_text == 0) {
1916     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1917     return false;
1918     }
1919     return true;
1920     }
1921    
1922 gpertea 154 bool SplicedBAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1923     BowtieHit& bh, bool strip_slash,
1924     char* name_out, char* name_tags,
1925     char* seq, char* qual) {
1926     if (_sam_header==NULL)
1927     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1928    
1929     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1930     uint32_t sam_flag = hit_buf->core.flag;
1931    
1932     int text_offset = hit_buf->core.pos;
1933     int text_mate_pos = hit_buf->core.mpos;
1934     int target_id = hit_buf->core.tid;
1935     int mate_target_id = hit_buf->core.mtid;
1936    
1937     vector<CigarOp> samcigar;
1938     bool spliced_alignment = false;
1939     int num_hits = 1;
1940    
1941     double mapQ = hit_buf->core.qual;
1942     long double error_prob;
1943     if (mapQ > 0)
1944     {
1945     long double p = (-1.0 * mapQ) / 10.0;
1946     error_prob = pow(10.0L, p);
1947     }
1948     else
1949     {
1950     error_prob = 1.0;
1951     }
1952    
1953     if (seq!=NULL) {
1954     char *bseq = (char*)bam1_seq(hit_buf);
1955     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1956     char v = bam1_seqi(bseq,i);
1957     seq[i]=bam_nt16_rev_table[v];
1958     }
1959     seq[hit_buf->core.l_qseq]=0;
1960     }
1961     if (qual!=NULL) {
1962     char *bq = (char*)bam1_qual(hit_buf);
1963     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1964     qual[i]=bq[i]+33;
1965     }
1966     qual[hit_buf->core.l_qseq]=0;
1967     }
1968 gpertea 29
1969 gpertea 154 bool end = true;
1970     unsigned int seg_offset = 0;
1971     unsigned int seg_num = 0;
1972     unsigned int num_segs = 0;
1973     // Copy the tag out of the name field before we might wipe it out
1974     char* name = bam1_qname(hit_buf);
1975     parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1976    
1977     if (target_id < 0) {
1978     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1979     bh = create_hit(name,
1980     "*", //ref_name
1981     0, //left coord
1982     0, //read_len
1983     false, //antisense_aln
1984     0, //edit_dist
1985     end);
1986     return true;
1987     }
1988    
1989     string text_name = _sam_header->target_name[target_id];
1990     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1991     {
1992     //char* t;
1993    
1994     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1995     if (length <= 0)
1996     {
1997     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1998     return false;
1999     }
2000    
2001     CigarOpCode opcode;
2002     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
2003     {
2004     case BAM_CMATCH: opcode = MATCH; break;
2005     case BAM_CINS: opcode = INS; break;
2006     case BAM_CDEL: opcode = DEL; break;
2007     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
2008     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
2009     case BAM_CPAD: opcode = PAD; break;
2010     case BAM_CREF_SKIP:
2011     opcode = REF_SKIP;
2012     spliced_alignment = true;
2013     if (length > (int)max_report_intron_length)
2014     {
2015     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
2016     return false;
2017     }
2018     break;
2019     default:
2020     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
2021     return false;
2022     }
2023     if (opcode != HARD_CLIP)
2024     samcigar.push_back(CigarOp(opcode, length));
2025     }
2026    
2027     string mrnm;
2028     if (mate_target_id >= 0) {
2029     if (mate_target_id == target_id) {
2030     mrnm = _sam_header->target_name[mate_target_id];
2031     }
2032     else {
2033     return false;
2034     }
2035     }
2036     else {
2037     text_mate_pos = 0;
2038     }
2039    
2040     bool antisense_splice = false;
2041     int sam_nm = 0;
2042     vector<bool> mismatches;
2043     mismatches.resize(strlen(seq), false);
2044     int num_mismatches=getBAMmismatches(hit_buf, samcigar, mismatches, sam_nm, antisense_splice);
2045     unsigned char num_splice_anchor_mismatches = 0;
2046    
2047     //##############################################
2048    
2049     // Add this alignment to the table of hits for this half of the
2050     // Bowtie map
2051    
2052     // Parse the text_name field to recover the splice coords
2053     vector<string> toks;
2054     tokenize_strict(text_name.c_str(), "|", toks);
2055     int num_extra_toks = (int)toks.size() - 6;
2056     if (num_extra_toks >= 0)
2057     {
2058     static const uint8_t left_window_edge_field = 1;
2059     static const uint8_t splice_field = 2;
2060     //static const uint8_t right_window_edge_field = 3;
2061     static const uint8_t junction_type_field = 4;
2062     static const uint8_t strand_field = 5;
2063    
2064     string contig = toks[0];
2065     for (int t = 1; t <= num_extra_toks; ++t)
2066     {
2067     contig += "|";
2068     contig += toks[t];
2069     }
2070    
2071     vector<string> splice_toks;
2072     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
2073     if (splice_toks.size() != 2)
2074     {
2075     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
2076     //fprintf(stderr, "\t%s (token: %s)\n", text_name,
2077     // toks[num_extra_toks + splice_field].c_str());
2078     return false;
2079     }
2080    
2081     const string& junction_type = toks[num_extra_toks + junction_type_field];
2082     const string junction_strand = toks[num_extra_toks + strand_field];
2083    
2084     //
2085     // check for an insertion hit
2086     //
2087     if(junction_type == "ins")
2088     {
2089     //int8_t spliced_read_len = strlen(seq_str);
2090     //TODO FIXME: use the CIGAR instead of seq length!
2091     // The 0-based position of the left edge of the alignment. Note that this
2092     // value may need to be further corrected to account for the presence of
2093     // of the insertion.
2094     int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
2095    
2096     // The 0-based position of the last genomic sequence before the insertion
2097     int left_splice_pos = atoi(splice_toks[0].c_str());
2098    
2099     string insertedSequence = splice_toks[1];
2100     // The 0-based position of the first genomic sequence after the insertion
2101    
2102     vector<CigarOp> splcigar;
2103     //this also updates left to the adjusted genomic coordinates
2104     int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
2105     left, left_splice_pos+1, insertedSequence.length(), INS);
2106    
2107     if (spl_num_mismatches<0) return false;
2108     num_mismatches-=spl_num_mismatches;
2109     bh = create_hit(name,
2110     contig,
2111     "",
2112     left,
2113     splcigar,
2114     sam_flag & 0x0010,
2115     junction_strand == "rev",
2116     num_mismatches,
2117     0,
2118     end);
2119    
2120     return true;
2121     } //"ins"
2122     else //"del", "intron", or "fusion"
2123     {
2124     char orientation = (sam_flag & 0x0010 ? '-' : '+');
2125     if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")||
2126     !(orientation == '-' || orientation == '+'))
2127     {
2128     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
2129     return false;
2130     }
2131    
2132     // The 0-based position of the left edge of the alignment.
2133     int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str());
2134     if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
2135     left += text_offset;
2136     else
2137     left -= text_offset;
2138    
2139     vector<CigarOp> splcigar;
2140     CigarOpCode opcode;
2141     if(junction_type == "del")
2142     opcode = DEL;
2143     else if(junction_type == "fus")
2144     {
2145     if (junction_strand == "ff")
2146     opcode = FUSION_FF;
2147     else if (junction_strand == "fr")
2148     opcode = FUSION_FR;
2149     else if (junction_strand == "rf")
2150     opcode = FUSION_RF;
2151     else
2152     opcode = FUSION_RR;
2153     }
2154     else
2155     opcode = REF_SKIP;
2156    
2157     int left_splice_pos = atoi(splice_toks[0].c_str());
2158    
2159     // The 0-based position of the last genomic sequence before the deletion
2160     int gap_len = 0;
2161     if (junction_type == "fus")
2162     gap_len = atoi(splice_toks[1].c_str());
2163     else
2164     gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
2165    
2166     if (opcode == FUSION_RF || opcode == FUSION_RR)
2167     left_splice_pos -= 1;
2168     else
2169     left_splice_pos += 1;
2170    
2171     int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left,
2172     left_splice_pos, gap_len, opcode);
2173    
2174     if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
2175     return false;
2176    
2177     string contig2 = "";
2178     if (junction_type == "fus")
2179     {
2180     vector<string> contigs;
2181     tokenize(contig, "-", contigs);
2182     if (contigs.size() != 2)
2183     return false;
2184    
2185     contig = contigs[0];
2186     contig2 = contigs[1];
2187    
2188     if (junction_strand == "rf" || junction_strand == "rr")
2189     orientation = (orientation == '+' ? '-' : '+');
2190     }
2191    
2192     bh = create_hit(name,
2193     contig,
2194     contig2,
2195     left,
2196     splcigar,
2197     orientation == '-',
2198     junction_strand == "rev",
2199     num_mismatches,
2200     spl_num_mismatches,
2201     end);
2202    
2203     return true;
2204     }
2205     } //parse splice data
2206     else
2207     {
2208     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
2209     //fprintf(stderr, "%s\n", orig_bwt_buf);
2210     // continue;
2211     return false;
2212     }
2213    
2214     return false;
2215     }
2216    
2217 gpertea 29 void get_mapped_reads(FILE* bwtf,
2218     HitTable& hits,
2219     HitFactory& hit_factory,
2220     bool strip_slash,
2221     bool verbose)
2222     {
2223    
2224    
2225     char bwt_buf[2048];
2226     uint32_t reads_extracted = 0;
2227    
2228     while (fgets(bwt_buf, 2048, bwtf))
2229     {
2230     // Chomp the newline
2231     char* nl = strrchr(bwt_buf, '\n');
2232     if (nl) *nl = 0;
2233     if (*bwt_buf == 0)
2234     continue;
2235     // Get a new record from the tab-delimited Bowtie map
2236     BowtieHit bh;
2237     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
2238     {
2239     // Only check uniqueness if these hits are spliced
2240     hits.add_hit(bh, true);
2241     }
2242     reads_extracted++;
2243     }
2244    
2245     // This will sort the map by insert id.
2246     hits.finalize();
2247     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
2248     }
2249    
2250    
2251     /*
2252     AlignStatus status(const BowtieHit* align)
2253     {
2254     if (!align)
2255     return UNALIGNED;
2256     if (align->contiguous())
2257     return CONTIGUOUS;
2258     return SPLICED;
2259     }
2260     */
2261    
2262     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
2263     {
2264     int max_hit_pos = -1;
2265     for (size_t i = 0; i < hits.size(); ++i)
2266     {
2267     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
2268     }
2269    
2270     if ((int)DoC.size() < max_hit_pos)
2271     DoC.resize(max_hit_pos);
2272    
2273     for (size_t i = 0; i < hits.size(); ++i)
2274     {
2275     const BowtieHit& bh = hits[i];
2276    
2277     // split up the coverage contibution for this reads
2278     size_t j = bh.left();
2279     const vector<CigarOp>& cigar = bh.cigar();
2280    
2281     for (size_t c = 0 ; c < cigar.size(); ++c)
2282     {
2283     switch(cigar[c].opcode)
2284     {
2285     case MATCH:
2286     for (size_t m = 0; m < cigar[c].length; ++m)
2287     {
2288     if (DoC[j + m] < 0xFFFF)
2289     DoC[j + m]++;
2290     }
2291     //fall through this case to REF_SKIP is intentional
2292     case REF_SKIP:
2293     j += cigar[c].length;
2294     break;
2295     default:
2296     break;
2297     }
2298    
2299     }
2300     }
2301     }
2302    
2303     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
2304     {
2305     if ((int)DoC.size() < bh.right())
2306     DoC.resize(bh.right());
2307    
2308     // split up the coverage contibution for this reads
2309     size_t j = bh.left();
2310     const vector<CigarOp>& cigar = bh.cigar();
2311    
2312     for (size_t c = 0 ; c < cigar.size(); ++c)
2313     {
2314     switch(cigar[c].opcode)
2315     {
2316     case MATCH:
2317     for (size_t m = 0; m < cigar[c].length; ++m)
2318     {
2319 gpertea 135 if (DoC[j + m] < VMAXINT32)
2320 gpertea 29 DoC[j + m]++;
2321     }
2322     //fall through this case to REF_SKIP is intentional
2323     case REF_SKIP:
2324     j += cigar[c].length;
2325     break;
2326     default:
2327     break;
2328     }
2329    
2330     }
2331     }
2332    
2333     void print_hit(FILE* fout,
2334     const char* read_name,
2335     const BowtieHit& bh,
2336     const char* ref_name,
2337 gpertea 154 const char* ref_name2,
2338 gpertea 29 const char* sequence,
2339 gpertea 154 const char* qualities)
2340 gpertea 29 {
2341 gpertea 154 string seq;
2342     string quals;
2343     if (sequence)
2344     {
2345     seq = sequence;
2346     quals = qualities;
2347     seq.resize(bh.read_len());
2348     quals.resize(bh.read_len());
2349     }
2350     else
2351     {
2352     seq = "*";
2353     }
2354    
2355     if (qualities)
2356     {
2357     quals = qualities;
2358     quals.resize(bh.read_len());
2359     }
2360     else
2361     {
2362     quals = "*";
2363     }
2364    
2365     uint32_t sam_flag = 0;
2366     if (bh.antisense_align())
2367     {
2368     sam_flag |= 0x0010; // BAM_FREVERSE
2369     }
2370    
2371     int left = bh.left();
2372    
2373     uint32_t map_quality = 255;
2374     char cigar[256] = {0};
2375     string mate_ref_name = "*";
2376     uint32_t mate_pos = 0;
2377     uint32_t insert_size = 0;
2378    
2379     const vector<CigarOp>& bh_cigar = bh.cigar();
2380    
2381     /*
2382     * In addition to calculating the cigar string,
2383     * we need to figure out how many in/dels are in the
2384     * sequence, so that we can give the correct
2385     * value for the NM tag
2386     */
2387     int indel_distance = 0;
2388     CigarOpCode fusion_dir = FUSION_NOTHING;
2389     for (size_t c = 0; c < bh_cigar.size(); ++c)
2390     {
2391     const CigarOp& op = bh_cigar[c];
2392    
2393     char ibuf[64];
2394     sprintf(ibuf, "%d", op.length);
2395     switch(op.opcode)
2396 gpertea 29 {
2397 gpertea 154 case MATCH:
2398     case mATCH:
2399     strcat(cigar, ibuf);
2400     if (bh_cigar[c].opcode == MATCH)
2401     strcat(cigar, "M");
2402     else
2403     strcat(cigar, "m");
2404     break;
2405     case INS:
2406     case iNS:
2407     strcat(cigar, ibuf);
2408     if (bh_cigar[c].opcode == INS)
2409     strcat(cigar, "I");
2410     else
2411     strcat(cigar, "i");
2412     indel_distance += bh_cigar[c].length;
2413     break;
2414     case DEL:
2415     case dEL:
2416     strcat(cigar, ibuf);
2417     if (bh_cigar[c].opcode == DEL)
2418     strcat(cigar, "D");
2419     else
2420     strcat(cigar, "d");
2421     indel_distance += bh_cigar[c].length;
2422     break;
2423     case REF_SKIP:
2424     case rEF_SKIP:
2425     strcat(cigar, ibuf);
2426     if (bh_cigar[c].opcode == REF_SKIP)
2427     strcat(cigar, "N");
2428     else
2429     strcat(cigar, "n");
2430     break;
2431     case FUSION_FF:
2432     case FUSION_FR:
2433     case FUSION_RF:
2434     case FUSION_RR:
2435     fusion_dir = op.opcode;
2436     sprintf(ibuf, "%d", bh_cigar[c].length + 1);
2437     strcat(cigar, ibuf);
2438     strcat(cigar, "F");
2439     break;
2440     default:
2441     break;
2442 gpertea 29 }
2443 gpertea 154 }
2444    
2445     char cigar1[256] = {0}, cigar2[256] = {0};
2446     string left_seq, right_seq, left_qual, right_qual;
2447     int left1 = -1, left2 = -1;
2448     extract_partial_hits(bh, seq, quals,
2449     cigar1, cigar2, left_seq, right_seq,
2450     left_qual, right_qual, left1, left2);
2451    
2452    
2453     bool containsSplice = false;
2454     for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
2455     {
2456     if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP)
2457 gpertea 29 {
2458 gpertea 154 containsSplice = true;
2459     break;
2460 gpertea 29 }
2461 gpertea 154 }
2462    
2463     if (fusion_dir != FUSION_NOTHING)
2464     {
2465     fprintf(fout,
2466     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t",
2467     read_name,
2468     sam_flag,
2469     ref_name,
2470     left1 + 1,
2471     map_quality,
2472     cigar1,
2473     mate_ref_name.c_str(),
2474     mate_pos,
2475     insert_size,
2476     left_seq.c_str(),
2477     left_qual.c_str(),
2478     bh.edit_dist() + indel_distance);
2479    
2480     if (containsSplice)
2481 gpertea 29 {
2482 gpertea 154 fprintf(fout,
2483     "XS:A:%c\t",
2484     bh.antisense_splice() ? '-' : '+');
2485 gpertea 29 }
2486    
2487 gpertea 154 fprintf(fout,
2488     "FR:Z:1 %s-%s %d %s %s %s\n",
2489     ref_name,
2490     ref_name2,
2491     left + 1,
2492     cigar,
2493     seq.c_str(),
2494     quals.c_str());
2495    
2496     fprintf(fout,
2497     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t",
2498     read_name,
2499     sam_flag,
2500     ref_name2,
2501     left2 + 1,
2502     map_quality,
2503     cigar2,
2504     mate_ref_name.c_str(),
2505     mate_pos,
2506     insert_size,
2507     right_seq.c_str(),
2508     right_qual.c_str(),
2509     bh.edit_dist() + indel_distance);
2510    
2511     if (containsSplice)
2512 gpertea 29 {
2513 gpertea 154 fprintf(fout,
2514     "XS:A:%c\t",
2515     bh.antisense_splice() ? '-' : '+');
2516     }
2517 gpertea 29
2518 gpertea 154 fprintf(fout,
2519     "FR:Z:2 %s-%s %d %s %s %s\n",
2520     ref_name,
2521     ref_name2,
2522     left + 1,
2523     cigar,
2524     seq.c_str(),
2525     quals.c_str());
2526     }
2527     else
2528     {
2529     fprintf(fout,
2530     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t",
2531     read_name,
2532     sam_flag,
2533     ref_name,
2534     left + 1,
2535     map_quality,
2536     cigar,
2537     mate_ref_name.c_str(),
2538     mate_pos,
2539     insert_size,
2540     seq.c_str(),
2541     quals.c_str(),
2542     bh.edit_dist() + indel_distance);
2543    
2544     if (containsSplice)
2545 gpertea 29 {
2546 gpertea 154 fprintf(fout,
2547     "XS:A:%c\t",
2548     bh.antisense_splice() ? '-' : '+');
2549 gpertea 29 }
2550    
2551 gpertea 154 fprintf(fout, "\n");
2552     }
2553 gpertea 29 }
2554    
2555     void print_bamhit(GBamWriter& wbam,
2556 gpertea 154 const char* read_name,
2557     const BowtieHit& bh,
2558     const char* ref_name,
2559     const char* ref_name2,
2560     const char* sequence,
2561     const char* qualities,
2562     bool from_bowtie)
2563 gpertea 29 {
2564 gpertea 154 string seq;
2565     string quals;
2566     if (sequence) {
2567     seq = sequence;
2568     quals = qualities;
2569     seq.resize(bh.read_len());
2570     quals.resize(bh.read_len());
2571     }
2572     else {
2573     seq = "*";
2574     }
2575     if (qualities) {
2576     quals = qualities;
2577     quals.resize(bh.read_len());
2578     }
2579     else {
2580     quals = "*";
2581     }
2582    
2583     uint32_t sam_flag = 0;
2584     if (bh.antisense_align())
2585     {
2586     sam_flag |= 0x0010; // BAM_FREVERSE
2587     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
2588     {
2589     reverse_complement(seq);
2590     reverse(quals.begin(), quals.end());
2591     }
2592 gpertea 29 }
2593 gpertea 154
2594     uint32_t sam_pos = bh.left() + 1;
2595     uint32_t map_quality = 255;
2596     char cigar[256];
2597     cigar[0] = 0;
2598     string mate_ref_name = "*";
2599     uint32_t mate_pos = 0;
2600     uint32_t insert_size = 0;
2601     //string qualities = "*";
2602    
2603     const vector<CigarOp>& bh_cigar = bh.cigar();
2604     /*
2605     * In addition to calculating the cigar string,
2606     * we need to figure out how many in/dels are in the
2607     * sequence, so that we can give the correct
2608     * value for the NM tag
2609     */
2610     int indel_distance = 0;
2611     CigarOpCode fusion_dir = FUSION_NOTHING;
2612     for (size_t c = 0; c < bh_cigar.size(); ++c)
2613 gpertea 29 {
2614 gpertea 154 const CigarOp& op = bh_cigar[c];
2615    
2616     char ibuf[64];
2617     sprintf(ibuf, "%d", op.length);
2618     switch(op.opcode)
2619     {
2620     case MATCH:
2621     case mATCH:
2622     strcat(cigar, ibuf);
2623     if (bh_cigar[c].opcode == MATCH)
2624     strcat(cigar, "M");
2625     else
2626     strcat(cigar, "m");
2627     break;
2628     case INS:
2629     case iNS:
2630     strcat(cigar, ibuf);
2631     if (bh_cigar[c].opcode == INS)
2632     strcat(cigar, "I");
2633     else
2634     strcat(cigar, "i");
2635     indel_distance += bh_cigar[c].length;
2636     break;
2637     case DEL:
2638     case dEL:
2639     strcat(cigar, ibuf);
2640     if (bh_cigar[c].opcode == DEL)
2641     strcat(cigar, "D");
2642     else
2643     strcat(cigar, "d");
2644     indel_distance += bh_cigar[c].length;
2645     break;
2646     case REF_SKIP:
2647     case rEF_SKIP:
2648     strcat(cigar, ibuf);
2649     if (bh_cigar[c].opcode == REF_SKIP)
2650     strcat(cigar, "N");
2651     else
2652     strcat(cigar, "n");
2653     break;
2654     case FUSION_FF:
2655     case FUSION_FR:
2656     case FUSION_RF:
2657     case FUSION_RR:
2658     fusion_dir = op.opcode;
2659     sprintf(ibuf, "%d", bh_cigar[c].length + 1);
2660     strcat(cigar, ibuf);
2661     strcat(cigar, "F");
2662     break;
2663     default:
2664     break;
2665     }
2666 gpertea 29 }
2667    
2668 gpertea 154 char cigar1[256] = {0}, cigar2[256] = {0};
2669     string left_seq, right_seq, left_qual, right_qual;
2670     int left1 = -1, left2 = -1;
2671     extract_partial_hits(bh, seq, quals,
2672     cigar1, cigar2, left_seq, right_seq,
2673     left_qual, right_qual, left1, left2);
2674    
2675     bool containsSplice = false;
2676     for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
2677 gpertea 29 {
2678 gpertea 154 if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP)
2679     {
2680     containsSplice = true;
2681     break;
2682     }
2683 gpertea 29 }
2684    
2685 gpertea 154 vector<string> auxdata;
2686     if (!sam_readgroup_id.empty())
2687 gpertea 29 {
2688 gpertea 154 string nm("RG:Z:");
2689     nm += sam_readgroup_id;
2690     auxdata.push_back(nm);
2691 gpertea 29 }
2692 gpertea 154
2693     string nm("NM:i:");
2694     str_appendInt(nm, bh.edit_dist() + indel_distance);
2695     auxdata.push_back(nm);
2696    
2697     if (containsSplice) {
2698     nm="XS:A:";
2699     nm+=(char)(bh.antisense_splice() ? '-' : '+');
2700     auxdata.push_back(nm);
2701     }
2702    
2703     if (fusion_dir != FUSION_NOTHING)
2704     {
2705     char XF[2048] = {0};
2706     sprintf(XF,
2707     "XF:Z:1 %s-%s %u %s %s %s",
2708     ref_name,
2709     ref_name2,
2710     sam_pos,
2711     cigar,
2712     seq.c_str(),
2713     quals.c_str());
2714     auxdata.push_back(XF);
2715    
2716     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, left1 + 1, map_quality,
2717     cigar1, mate_ref_name.c_str(), mate_pos,
2718     insert_size, left_seq.c_str(), left_qual.c_str(), &auxdata);
2719    
2720     wbam.write(brec);
2721     delete brec;
2722 gpertea 29
2723 gpertea 154 sprintf(XF,
2724     "XF:Z:2 %s-%s %u %s %s %s",
2725     ref_name,
2726     ref_name2,
2727     sam_pos,
2728     cigar,
2729     seq.c_str(),
2730     quals.c_str());
2731     auxdata.back() = XF;
2732    
2733     brec = wbam.new_record(read_name, sam_flag, ref_name, left2 + 1, map_quality,
2734     cigar2, mate_ref_name.c_str(), mate_pos,
2735     insert_size, right_seq.c_str(), right_qual.c_str(), &auxdata);
2736    
2737     wbam.write(brec);
2738     delete brec;
2739     }
2740     else
2741 gpertea 29 {
2742 gpertea 154 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
2743     cigar, mate_ref_name.c_str(), mate_pos,
2744     insert_size, seq.c_str(), quals.c_str(), &auxdata);
2745    
2746     wbam.write(brec);
2747     delete brec;
2748 gpertea 29 }
2749     }
2750    
2751     /**
2752     * Print a vector of cigar operations to a file.
2753     * @param bh_cigar A vector of CigarOps
2754     * @return a string representation of the cigar string
2755     */
2756 gpertea 154 std::string print_cigar(const vector<CigarOp>& bh_cigar){
2757 gpertea 29 char cigar[256];
2758     cigar[0] = 0;
2759     for (size_t c = 0; c < bh_cigar.size(); ++c)
2760     {
2761     char ibuf[64];
2762     sprintf(ibuf, "%d", bh_cigar[c].length);
2763 gpertea 154 strcat(cigar, ibuf);
2764 gpertea 29 switch(bh_cigar[c].opcode)
2765     {
2766 gpertea 154 case MATCH:
2767     strcat(cigar, "M");
2768     break;
2769     case mATCH:
2770     strcat(cigar, "m");
2771     break;
2772     case INS:
2773     strcat(cigar, "I");
2774     break;
2775     case iNS:
2776     strcat(cigar, "i");
2777     break;
2778     case DEL:
2779     strcat(cigar, "D");
2780     break;
2781     case dEL:
2782     strcat(cigar, "d");
2783     break;
2784     case REF_SKIP:
2785     strcat(cigar, "N");
2786     break;
2787     case rEF_SKIP:
2788     strcat(cigar, "n");
2789     break;
2790     case FUSION_FF:
2791     case FUSION_FR:
2792     case FUSION_RF:
2793     case FUSION_RR:
2794     strcat(cigar, "F");
2795     break;
2796     default:
2797     break;
2798 gpertea 29 }
2799     }
2800     string result(cigar);
2801     return result;
2802     }
2803    
2804 gpertea 154 void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual,
2805     char* cigar1, char* cigar2, string& seq1, string& seq2,
2806     string& qual1, string& qual2, int& left1, int& left2)
2807     {
2808     const int left = bh.left();
2809     int right = left;
2810     int fusion_left = -1, fusion_right = -1;
2811    
2812     const vector<CigarOp>& bh_cigar = bh.cigar();
2813    
2814     CigarOpCode fusion_dir = FUSION_NOTHING;
2815     size_t fusion_idx = 0;
2816     size_t left_part_len = 0;
2817     for (size_t c = 0; c < bh_cigar.size(); ++c)
2818     {
2819     const CigarOp& op = bh_cigar[c];
2820     switch(op.opcode)
2821     {
2822     case MATCH:
2823     case REF_SKIP:
2824     case DEL:
2825     right += op.length;
2826     break;
2827     case mATCH:
2828     case rEF_SKIP:
2829     case dEL:
2830     right -= op.length;
2831     break;
2832     case FUSION_FF:
2833     case FUSION_FR:
2834     case FUSION_RF:
2835     case FUSION_RR:
2836     {
2837     fusion_dir = op.opcode;
2838     fusion_idx = c;
2839     if (op.opcode == FUSION_FF || op.opcode == FUSION_FR)
2840     fusion_left = right - 1;
2841     else
2842     fusion_left = right + 1;
2843     fusion_right = right = op.length;
2844     }
2845     break;
2846     default:
2847     break;
2848     }
2849    
2850     if (fusion_dir == FUSION_NOTHING)
2851     {
2852     if (op.opcode == MATCH || op.opcode == mATCH || op.opcode == INS || op.opcode == iNS)
2853     {
2854     left_part_len += op.length;
2855     }
2856     }
2857     }
2858    
2859     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_FR)
2860     {
2861     for (size_t c = 0; c < fusion_idx; ++c)
2862     {
2863     const CigarOp& op = bh_cigar[c];
2864     char ibuf[64];
2865     sprintf(ibuf, "%d", op.length);
2866     strcat(cigar1, ibuf);
2867    
2868     switch (op.opcode)
2869     {
2870     case MATCH:
2871     strcat(cigar1, "M");
2872     break;
2873     case INS:
2874     strcat(cigar1, "I");
2875     break;
2876     case DEL:
2877     strcat(cigar1, "D");
2878     break;
2879     case REF_SKIP:
2880     strcat(cigar1, "N");
2881     break;
2882     default:
2883     assert (0);
2884     break;
2885     }
2886     }
2887     }
2888     else if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2889     {
2890     assert (fusion_idx > 0);
2891     for (int c = fusion_idx - 1; c >=0; --c)
2892     {
2893     const CigarOp& op = bh_cigar[c];
2894     char ibuf[64];
2895     sprintf(ibuf, "%d", op.length);
2896     strcat(cigar1, ibuf);
2897    
2898     switch (op.opcode)
2899     {
2900     case mATCH:
2901     strcat(cigar1, "M");
2902     break;
2903     case iNS:
2904     strcat(cigar1, "I");
2905     break;
2906     case dEL:
2907     strcat(cigar1, "D");
2908     break;
2909     case rEF_SKIP:
2910     strcat(cigar1, "N");
2911     break;
2912     default:
2913     assert (0);
2914     break;
2915     }
2916     }
2917     }
2918    
2919     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RF)
2920     {
2921     for (size_t c = fusion_idx + 1; c < bh_cigar.size(); ++c)
2922     {
2923     const CigarOp& op = bh_cigar[c];
2924     char ibuf[64];
2925     sprintf(ibuf, "%d", op.length);
2926     strcat(cigar2, ibuf);
2927    
2928     switch (op.opcode)
2929     {
2930     case MATCH:
2931     strcat(cigar2, "M");
2932     break;
2933     case INS:
2934     strcat(cigar2, "I");
2935     break;
2936     case DEL:
2937     strcat(cigar2, "D");
2938     break;
2939     case REF_SKIP:
2940     strcat(cigar2, "N");
2941     break;
2942     default:
2943     assert (0);
2944     break;
2945     }
2946     }
2947     }
2948     else if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR)
2949     {
2950     assert (bh_cigar.size() > 0);
2951     for (size_t c = bh_cigar.size() - 1; c > fusion_idx; --c)
2952     {
2953     const CigarOp& op = bh_cigar[c];
2954     char ibuf[64];
2955     sprintf(ibuf, "%d", op.length);
2956     strcat(cigar2, ibuf);
2957    
2958     switch (op.opcode)
2959     {
2960     case mATCH:
2961     strcat(cigar2, "M");
2962     break;
2963     case iNS:
2964     strcat(cigar2, "I");
2965     break;
2966     case dEL:
2967     strcat(cigar2, "D");
2968     break;
2969     case rEF_SKIP:
2970     strcat(cigar2, "N");
2971     break;
2972     default:
2973     assert (0);
2974     break;
2975     }
2976     }
2977     }
2978    
2979     if (fusion_dir != FUSION_NOTHING)
2980     {
2981     seq1 = seq.substr(0, left_part_len);
2982     qual1 = qual.substr(0, left_part_len);
2983    
2984     if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2985     {
2986     reverse_complement(seq1);
2987     reverse(qual1.begin(), qual1.end());
2988     }
2989    
2990     seq2 = seq.substr(left_part_len);
2991     qual2 = qual.substr(left_part_len);
2992    
2993     if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR)
2994     {
2995     reverse_complement(seq2);
2996     reverse(qual2.begin(), qual2.end());
2997     }
2998    
2999     left1 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_FR) ? left : fusion_left);
3000     left2 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_RF) ? fusion_right : right + 1);
3001     }
3002     }
3003    
3004    
3005 gpertea 29 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
3006     {
3007 gpertea 154 RefSequenceTable::Sequence* ref_str1 = rt.get_seq(_ref_id);
3008     RefSequenceTable::Sequence* ref_str2 = rt.get_seq(_ref_id2);
3009    
3010     if (!ref_str1 || !ref_str2)
3011 gpertea 29 return false;
3012    
3013 gpertea 154 RefSequenceTable::Sequence* ref_str = ref_str1;
3014 gpertea 29
3015     size_t pos_seq = 0;
3016 gpertea 154 size_t pos_ref = _left;
3017 gpertea 29 size_t mismatch = 0;
3018 gpertea 154 bool bSawFusion = false;
3019 gpertea 29 for (size_t i = 0; i < _cigar.size(); ++i)
3020     {
3021     CigarOp cigar = _cigar[i];
3022     switch(cigar.opcode)
3023     {
3024     case MATCH:
3025     {
3026 gpertea 154 seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length);
3027 gpertea 29 for (size_t j = 0; j < cigar.length; ++j)
3028     {
3029 gpertea 154 if (_seq[pos_seq] != ref_seq[j])
3030 gpertea 29 ++mismatch;
3031 gpertea 154 ++pos_seq;
3032 gpertea 29 }
3033 gpertea 154
3034     pos_ref += cigar.length;
3035 gpertea 29 }
3036     break;
3037 gpertea 154 case mATCH:
3038     {
3039     seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1);
3040     seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>());
3041     seqan::reverseInPlace(ref_seq);
3042    
3043     for (size_t j = 0; j < cigar.length; ++j)
3044     {
3045     if (_seq[pos_seq] != ref_seq[j])
3046     ++mismatch;
3047     ++pos_seq;
3048     }
3049    
3050     pos_ref -= cigar.length;
3051     }
3052     break;
3053 gpertea 29 case INS:
3054 gpertea 154 case iNS:
3055 gpertea 29 {
3056     pos_seq += cigar.length;
3057     }
3058     break;
3059    
3060     case DEL:
3061     case REF_SKIP:
3062     {
3063     pos_ref += cigar.length;
3064     }
3065     break;
3066    
3067 gpertea 154 case dEL:
3068     case rEF_SKIP:
3069     {
3070     pos_ref -= cigar.length;
3071     }
3072     break;
3073    
3074     case FUSION_FF:
3075     case FUSION_FR:
3076     case FUSION_RF:
3077     case FUSION_RR:
3078     {
3079     // We don't allow a read spans more than two chromosomes.
3080     if (bSawFusion)
3081     return false;
3082    
3083     ref_str = ref_str2;
3084     pos_ref = cigar.length;
3085    
3086     bSawFusion = true;
3087     }
3088     break;
3089    
3090 gpertea 29 default:
3091     break;
3092     }
3093     }
3094 gpertea 154
3095 gpertea 29 return mismatch == _edit_dist;
3096     }