ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (12 years, 7 months ago) by gpertea
File size: 82253 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21 gpertea 154 #include <seqan/sequence.h>
22     #include <seqan/find.h>
23     #include <seqan/file.h>
24     #include <seqan/modifier.h>
25    
26 gpertea 29 #include "common.h"
27     #include "bwt_map.h"
28     #include "tokenize.h"
29     #include "reads.h"
30    
31     using namespace std;
32    
33     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
34     {
35     uint32_t reference_id = bh.ref_id();
36    
37     pair<RefHits::iterator, bool> ret =
38     _hits_for_ref.insert(make_pair(reference_id, HitList()));
39     HitList& hl = ret.first->second;
40    
41     if (check_uniqueness)
42     {
43     // Check uniqueness, in case we are adding spliced hits from
44     // several spliced alignment sources (e.g. de novo hashing + Bowtie
45     // against a user-supplied index). We don't want to count the same
46     // alignment twice if it happened to be found by more than one method
47     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
48     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
49     for (; lb != ub && lb != hl.end(); ++lb)
50     {
51     if (*lb == bh)
52     {
53     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
54     return;
55     }
56    
57     if (lb->insert_id() == bh.insert_id() &&
58     lb->ref_id() == bh.ref_id() &&
59     lb->antisense_align() == bh.antisense_align())
60     {
61     // If we get here, we may be looking at the same alignment
62     // However, spanning_reads may report a shorter, trimmed alignment
63     // so not all fields will be equal. If they just disagree on the
64     // ends, and don't indicate a different junction coord, the
65     // alignments are the same.
66    
67     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
68     (bh.left() <= lb->left() && bh.right() >= lb->right()))
69     {
70     vector<pair<int,int> > lb_gaps, bh_gaps;
71     lb->gaps(lb_gaps);
72     bh.gaps(bh_gaps);
73     if (lb_gaps == bh_gaps)
74     {
75     // One alignment is contained in the other, they agree on
76     // where the gaps, if any, are, and they share an id
77     // => this is a redundant aligment, so toss it
78     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
79     return;
80     }
81     }
82     }
83     }
84     }
85     _total_hits++;
86     hl.push_back(bh);
87     }
88    
89     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
90     {
91     return h1.insert_id() < h2.insert_id();
92     }
93    
94     void LineHitFactory::openStream(HitStream& hs)
95     {
96     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
97     //open the file for HitStream here
98     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
99     if (hs._hit_file==NULL)
100     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
101     return;
102     }
103     if (hs._fzpipe!=NULL) {
104     hs._hit_file=hs._fzpipe->file;
105     }
106     }
107    
108     void LineHitFactory::rewind(HitStream& hs)
109     {
110     if (hs._fzpipe!=NULL) {
111     hs._fzpipe->rewind();
112     hs._hit_file=hs._fzpipe->file;
113     }
114     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
115     }
116    
117 gpertea 154 void LineHitFactory::seek(HitStream& hs, int64_t offset)
118     {
119     // daehwan - implement this later
120     if (hs._fzpipe != NULL) {
121     hs._fzpipe->seek(offset);
122     hs._hit_file=hs._fzpipe->file;
123     }
124     // else if (hs._hit_file) ::seek((FILE*)(hs._hit_file));
125     }
126    
127 gpertea 29 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
128     FILE* f=(FILE *)(hs._hit_file);
129     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
130     if (!new_rec || feof(f)) {
131     hs._eof=true;
132     return false;
133     }
134     ++_line_num;
135     char* nl = strrchr(_hit_buf, '\n');
136     if (nl) *nl = 0;
137     buf = _hit_buf;
138     buf_size = _hit_buf_max_sz - 1;
139     return true;
140     }
141    
142     void LineHitFactory::closeStream(HitStream& hs) {
143     if (hs._fzpipe!=NULL) {
144     hs._fzpipe->close();
145     return;
146     }
147     if (hs._hit_file!=NULL) {
148     fclose((FILE*)(hs._hit_file));
149     hs._hit_file=NULL;
150     }
151     }
152     void BAMHitFactory::openStream(HitStream& hs) {
153     if (hs._hit_file==NULL) {
154     if (hs._hit_file_name.empty())
155     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
156     return; //invalid stream, could be just a place holder
157     //open the file here if not already open
158     string fext=getFext(hs._hit_file_name);
159     if (fext=="sam")
160     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
161     else
162     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
163    
164     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
165    
166     if (sam_file == NULL)
167     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
168     if (sam_file->header == NULL)
169     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
170     memset(&_next_hit, 0, sizeof(_next_hit));
171     //_beginning = bgzf_tell(sam_file->x.bam);
172     _sam_header=sam_file->header;
173     if (inspect_header(hs) == false)
174     err_die("Error: invalid SAM header for file %s\n",
175     hs._hit_file_name.c_str());
176     }
177     }
178    
179     void BAMHitFactory::closeStream(HitStream& hs) {
180     if (hs._hit_file) {
181     samclose((samfile_t*)(hs._hit_file));
182     }
183     hs._hit_file=NULL;
184     _sam_header=NULL;
185     }
186    
187     void BAMHitFactory::rewind(HitStream& hs)
188     {
189     /*
190     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
191     {
192     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
193     _eof = false;
194     }
195     */
196     this->closeStream(hs);
197     this->openStream(hs);
198     }
199    
200 gpertea 154 void BAMHitFactory::seek(HitStream& hs, int64_t offset)
201     {
202     if (hs._hit_file) {
203     bgzf_seek(((samfile_t*)hs._hit_file)->x.bam, offset, SEEK_SET);
204     }
205     }
206    
207 gpertea 29 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
208     const bam1_t* bamrec=(const bam1_t*)hit_buf;
209     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
210     string sam_line(tamline);
211     free(tamline);
212     return sam_line;
213     }
214    
215     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
216     if (_next_hit.data) {
217     free(_next_hit.data);
218     _next_hit.data = NULL;
219     }
220     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
221     if (hs.eof() || !hs.ready()) return false;
222    
223     //mark_curr_pos();
224    
225     memset(&_next_hit, 0, sizeof(_next_hit));
226    
227     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
228     if (bytes_read <= 0) {
229     hs._eof = true;
230     return false;
231     }
232     buf = (const char*)&_next_hit;
233     buf_size = bytes_read;
234     return true;
235     }
236    
237    
238     BowtieHit HitFactory::create_hit(const string& insert_name,
239     const string& ref_name,
240 gpertea 154 const string& ref_name2,
241 gpertea 29 int left,
242     const vector<CigarOp>& cigar,
243     bool antisense_aln,
244     bool antisense_splice,
245     unsigned char edit_dist,
246     unsigned char splice_mms,
247     bool end)
248     {
249     uint64_t insert_id = _insert_table.get_id(insert_name);
250 gpertea 154
251 gpertea 29 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
252 gpertea 154 uint32_t reference_id2 = reference_id;
253    
254     if (ref_name2.length() > 0)
255     reference_id2 = _ref_table.get_id(ref_name2, NULL, 0);
256 gpertea 29
257     return BowtieHit(reference_id,
258 gpertea 154 reference_id2,
259 gpertea 29 insert_id,
260     left,
261     cigar,
262     antisense_aln,
263     antisense_splice,
264     edit_dist,
265     splice_mms,
266     end);
267     }
268    
269     BowtieHit HitFactory::create_hit(const string& insert_name,
270     const string& ref_name,
271     uint32_t left,
272     uint32_t read_len,
273     bool antisense_aln,
274     unsigned char edit_dist,
275     bool end)
276     {
277     uint64_t insert_id = _insert_table.get_id(insert_name);
278     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
279    
280     return BowtieHit(reference_id,
281 gpertea 154 reference_id,
282 gpertea 29 insert_id,
283     left,
284     read_len,
285     antisense_aln,
286     edit_dist,
287     end);
288     }
289    
290     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
291     BowtieHit& bh,
292     bool strip_slash,
293     char* name_out,
294     char* name_tags,
295     char* seq,
296     char* qual)
297     {
298     if (!orig_bwt_buf || !*orig_bwt_buf)
299     return false;
300    
301     static const int buf_size = 2048;
302    
303     char bwt_buf[buf_size];
304     strcpy(bwt_buf, orig_bwt_buf);
305     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
306    
307     char orientation;
308    
309     //int bwtf_ret = 0;
310     //uint32_t seqid = 0;
311    
312     char text_name[buf_size];
313     unsigned int text_offset;
314    
315    
316     //unsigned int other_occs;
317     char mismatches[buf_size];
318     //memset(mismatches, 0, sizeof(mismatches));
319    
320     const char* buf = bwt_buf;
321     char* name = get_token((char**)&buf,"\t");
322     char* orientation_str = get_token((char**)&buf,"\t");
323     char* text_name_str = get_token((char**)&buf,"\t");
324    
325     strcpy(text_name, text_name_str);
326    
327     char* text_offset_str = get_token((char**)&buf,"\t");
328     char* seq_str = get_token((char**)&buf,"\t");
329     if (seq)
330     strcpy(seq, seq_str);
331    
332     const char* qual_str = get_token((char**)&buf,"\t");
333     if (qual)
334     strcpy(qual, qual_str);
335    
336     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
337     mismatches[0] = 0;
338     char* mismatches_str = get_token((char**)&buf, "\t");
339     if (mismatches_str)
340     strcpy(mismatches, mismatches_str);
341    
342     orientation = orientation_str[0];
343     text_offset = atoi(text_offset_str);
344    
345     bool end = true;
346     unsigned int seg_offset = 0;
347     unsigned int seg_num = 0;
348     unsigned int num_segs = 0;
349    
350     // Copy the tag out of the name field before we might wipe it out
351     char* pipe = strrchr(name, '|');
352     if (pipe)
353     {
354     if (name_tags)
355     strcpy(name_tags, pipe);
356    
357     char* tag_buf = pipe + 1;
358     if (strchr(tag_buf, ':'))
359     {
360     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
361     if (seg_num + 1 == num_segs)
362     end = true;
363     else
364     end = false;
365     }
366    
367     *pipe = 0;
368     }
369     // Stripping the slash and number following it gives the insert name
370     char* slash = strrchr(name, '/');
371     if (strip_slash && slash)
372     *slash = 0;
373    
374     size_t read_len = strlen(seq_str);
375    
376     // Add this alignment to the table of hits for this half of the
377     // Bowtie map
378    
379     //vector<string> mismatch_toks;
380     char* pch = strtok (mismatches,",");
381     unsigned char num_mismatches = 0;
382     while (pch != NULL)
383     {
384     char* colon = strchr(pch, ':');
385     if (colon)
386     {
387     num_mismatches++;
388     }
389     //mismatch_toks.push_back(pch);
390     pch = strtok (NULL, ",");
391     }
392    
393     bh = create_hit(name,
394     text_name,
395     text_offset,
396     (int)read_len,
397     orientation == '-',
398     num_mismatches,
399     end);
400    
401     return true;
402     }
403    
404     int anchor_mismatch = 0;
405    
406 gpertea 70
407     void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
408     bool &end, unsigned int &seg_offset, unsigned int& seg_num,
409     unsigned int & num_segs) {
410     char* pipe = strrchr(name, '|');
411     if (pipe)
412     {
413     if (name_tags)
414     strcpy(name_tags, pipe);
415    
416     char* tag_buf = pipe + 1;
417     if (strchr(tag_buf, ':'))
418     {
419     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
420     if (seg_num + 1 == num_segs)
421     end = true;
422     else
423     end = false;
424     }
425    
426     *pipe = 0;
427     }
428    
429     // Stripping the slash and number following it gives the insert name
430     char* slash = strrchr(name, '/');
431     if (strip_slash && slash)
432     *slash = 0;
433     }
434    
435    
436 gpertea 29 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
437     BowtieHit& bh,
438     bool strip_slash,
439     char* name_out,
440     char* name_tags,
441     char* seq,
442     char* qual)
443     {
444     if (!orig_bwt_buf || !*orig_bwt_buf)
445     return false;
446    
447     static const int buf_size = 2048;
448    
449     char bwt_buf[buf_size];
450     strcpy(bwt_buf, orig_bwt_buf);
451     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
452    
453     char orientation;
454     char text_name[buf_size];
455     unsigned int text_offset;
456     char mismatches[buf_size];
457     //memset(mismatches, 0, sizeof(mismatches));
458    
459     char* buf = bwt_buf;
460     char* name = get_token((char**)&buf,"\t");
461     char* orientation_str = get_token((char**)&buf,"\t");
462     char* text_name_str = get_token((char**)&buf,"\t");
463     strcpy(text_name, text_name_str);
464    
465     char* text_offset_str = get_token((char**)&buf,"\t");
466     char* seq_str = get_token((char**)&buf,"\t");
467     if (seq)
468     strcpy(seq, seq_str);
469    
470     const char* qual_str = get_token((char**)&buf,"\t");
471     if (qual)
472     strcpy(qual, qual_str);
473    
474     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
475     mismatches[0] = 0;
476     char* mismatches_str = get_token((char**)&buf, "\t");
477     if (mismatches_str)
478     strcpy(mismatches, mismatches_str);
479    
480     orientation = orientation_str[0];
481     text_offset = atoi(text_offset_str);
482    
483     bool end = true;
484     unsigned int seg_offset = 0;
485     unsigned int seg_num = 0;
486     unsigned int num_segs = 0;
487    
488     // Copy the tag out of the name field before we might wipe it out
489 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
490 gpertea 29
491     // Add this alignment to the table of hits for this half of the
492     // Bowtie map
493    
494     // Parse the text_name field to recover the splice coords
495     vector<string> toks;
496    
497     tokenize_strict(text_name, "|", toks);
498    
499     int num_extra_toks = (int)toks.size() - 6;
500    
501     if (num_extra_toks >= 0)
502     {
503     static const uint8_t left_window_edge_field = 1;
504     static const uint8_t splice_field = 2;
505     //static const uint8_t right_window_edge_field = 3;
506     static const uint8_t junction_type_field = 4;
507     static const uint8_t strand_field = 5;
508    
509     string contig = toks[0];
510     for (int t = 1; t <= num_extra_toks; ++t)
511     {
512     contig += "|";
513     contig += toks[t];
514     }
515    
516     vector<string> splice_toks;
517     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
518     if (splice_toks.size() != 2)
519     {
520     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
521     //fprintf(stderr, "%s (token: %s)\n", text_name,
522     // toks[num_extra_toks + splice_field].c_str());
523     return false;
524     }
525    
526     //
527     // check for an insertion hit
528     //
529     if(toks[num_extra_toks + junction_type_field] == "ins")
530     {
531     int8_t spliced_read_len = strlen(seq_str);
532     /*
533     * The 0-based position of the left edge of the alignment. Note that this
534     * value may need to be futher corrected to account for the presence of
535     * of the insertion.
536     */
537     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
538     uint32_t right = left + spliced_read_len - 1;
539    
540 gpertea 137
541 gpertea 29 /*
542     * The 0-based position of the last genomic sequence before the insertion
543     */
544     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
545    
546     string insertedSequence = splice_toks[1];
547     /*
548     * The 0-based position of the first genomic sequence after teh insertion
549     */
550     uint32_t right_splice_pos = left_splice_pos + 1;
551     if(left > left_splice_pos){
552     /*
553     * The genomic position of the left edge of the alignment needs to be corrected
554     * If the alignment does not extend into the insertion, simply subtract the length
555     * of the inserted sequence, otherwise, just set it equal to the right edge
556     */
557     left = left - insertedSequence.length();
558     if(left < right_splice_pos){
559     left = right_splice_pos;
560     }
561     }
562     if(right > left_splice_pos){
563     right = right - insertedSequence.length();
564     if(right < left_splice_pos){
565     right = left_splice_pos;
566     }
567     }
568     /*
569     * Now, right and left should be properly transformed into genomic coordinates
570     * We should be able to deduce how much the alignment matches the insertion
571     * simply based on the length of the read
572     */
573     int left_match_length = 0;
574     if(left <= left_splice_pos){
575     left_match_length = left_splice_pos - left + 1;
576     }
577     int right_match_length = 0;
578     if(right >= right_splice_pos){
579     right_match_length = right - right_splice_pos + 1;
580     }
581     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
582    
583     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
584     return false;
585    
586     string junction_strand = toks[num_extra_toks + strand_field];
587     if(junction_strand != "rev" && junction_strand != "fwd"){
588     fprintf(stderr, "Malformed insertion record\n");
589     return false;
590     }
591    
592     char* pch = strtok( mismatches, ",");
593     unsigned char num_mismatches = 0;
594    
595     /*
596     * remember that text_offset holds the left end of the
597     * alignment, relative to the start of the contig
598     */
599    
600     /*
601     * The 0-based relative position of the left-most character
602     * before the insertion in the contig
603     */
604     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
605     while (pch != NULL)
606     {
607     char* colon = strchr(pch, ':');
608     if (colon)
609     {
610     *colon = 0;
611     int mismatch_pos = atoi(pch);
612    
613     /*
614     * for reversely mapped reads,
615     * find the correct mismatched position.
616     */
617     if(orientation == '-'){
618     mismatch_pos = spliced_read_len - mismatch_pos - 1;
619     }
620    
621     /*
622     * Only count mismatches outside of the insertion region
623     * If there is a mismatch within the insertion,
624     * disallow this hit
625     */
626     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
627     num_mismatches++;
628     }else{
629     return false;
630     }
631     }
632     pch = strtok (NULL, ",");
633     }
634    
635    
636     vector<CigarOp> cigar;
637     cigar.push_back(CigarOp(MATCH, left_match_length));
638     cigar.push_back(CigarOp(INS, insertion_match_length));
639     cigar.push_back(CigarOp(MATCH, right_match_length));
640    
641     /*
642     * For now, disallow hits that don't span
643     * the insertion. If we allow these types of hits,
644     * then long_spanning.cpp needs to be updated
645     * in order to intelligently merge these kinds
646     * of reads back together
647     *
648     * Following code has been changed to allow segment that end
649     * in an insertion
650     */
651     bh = create_hit(name,
652     contig,
653 gpertea 154 "",
654 gpertea 29 left,
655     cigar,
656     orientation == '-',
657     junction_strand == "rev",
658     num_mismatches,
659     0,
660     end);
661     return true;
662     }
663    
664     else
665     {
666 gpertea 154 const string& junction_type = toks[num_extra_toks + junction_type_field];
667     string junction_strand = toks[num_extra_toks + strand_field];
668    
669 gpertea 29 int spliced_read_len = strlen(seq_str);
670 gpertea 154 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str());
671     int8_t left_splice_pos = atoi(splice_toks[0].c_str());
672     if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
673     {
674     left += text_offset;
675     left_splice_pos = left_splice_pos - left + 1;
676     }
677     else
678     {
679     left -= text_offset;
680     left_splice_pos = left - left_splice_pos + 1;
681     }
682    
683 gpertea 29 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
684     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
685 gpertea 154
686     int gap_len = 0;
687     if (junction_type == "fus")
688     gap_len = atoi(splice_toks[1].c_str());
689     else
690     gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
691    
692 gpertea 29 if (right_splice_pos <= 0 || left_splice_pos <= 0)
693     return false;
694    
695     if (orientation == '+')
696     {
697     if (left_splice_pos + seg_offset < _anchor_length){
698     return false;
699     }
700     }
701     else
702     {
703     if (right_splice_pos + seg_offset < _anchor_length)
704     return false;
705     }
706 gpertea 154
707     if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")||
708 gpertea 29 !(orientation == '-' || orientation == '+'))
709     {
710     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
711     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
712     // junction_strand.c_str(), orientation);
713     return false;
714     }
715 gpertea 154
716 gpertea 29 char* pch = strtok (mismatches,",");
717     int mismatches_in_anchor = 0;
718     unsigned char num_mismatches = 0;
719     while (pch != NULL)
720     {
721     char* colon = strchr(pch, ':');
722     if (colon)
723     {
724     *colon = 0;
725     num_mismatches++;
726     int mismatch_pos = atoi(pch);
727     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
728     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
729     mismatches_in_anchor++;
730     }
731     pch = strtok (NULL, ",");
732     }
733    
734     // FIXME: we probably should exclude these hits somewhere, but this
735     // isn't the right place
736     vector<CigarOp> cigar;
737 gpertea 154 if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
738     cigar.push_back(CigarOp(MATCH, left_splice_pos));
739     else
740     cigar.push_back(CigarOp(mATCH, left_splice_pos));
741    
742     if(junction_type == "del")
743 gpertea 29 cigar.push_back(CigarOp(DEL, gap_len));
744 gpertea 154 else if(junction_type == "fus")
745     {
746     if (junction_strand == "ff")
747     cigar.push_back(CigarOp(FUSION_FF, gap_len));
748     else if (junction_strand == "fr")
749     cigar.push_back(CigarOp(FUSION_FR, gap_len));
750     else if (junction_strand == "rf")
751     cigar.push_back(CigarOp(FUSION_RF, gap_len));
752     else
753     cigar.push_back(CigarOp(FUSION_RR, gap_len));
754     }
755     else
756 gpertea 29 cigar.push_back(CigarOp(REF_SKIP, gap_len));
757 gpertea 154
758     if (junction_type != "fus" || (junction_strand != "fr" && junction_strand != "rr"))
759     cigar.push_back(CigarOp(MATCH, right_splice_pos));
760     else
761     cigar.push_back(CigarOp(mATCH, right_splice_pos));
762    
763     string contig2 = "";
764     if (junction_type == "fus")
765     {
766     vector<string> contigs;
767     tokenize(contig, "-", contigs);
768     if (contigs.size() != 2)
769     return false;
770    
771     contig = contigs[0];
772     contig2 = contigs[1];
773    
774     if (junction_strand == "rf" || junction_strand == "rr")
775     orientation = (orientation == '+' ? '-' : '+');
776     }
777    
778 gpertea 29 bh = create_hit(name,
779     contig,
780 gpertea 154 contig2,
781     left,
782 gpertea 29 cigar,
783     orientation == '-',
784     junction_strand == "rev",
785     num_mismatches,
786     mismatches_in_anchor,
787     end);
788     return true;
789     }
790     }
791     else
792     {
793     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
794     //fprintf(stderr, "%s\n", orig_bwt_buf);
795     // continue;
796     return false;
797     }
798    
799     return false;
800     }
801    
802 gpertea 70
803 gpertea 137 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
804 gpertea 70 bool &spliced_alignment) {
805     const char* p_cig = cigar_str;
806 gpertea 137 int refspan=0; //alignment span on reference sequence
807 gpertea 154
808 gpertea 70 while (*p_cig)
809     {
810     char* t;
811 gpertea 137 int op_len = (int)strtol(p_cig, &t, 10);
812     if (op_len <= 0)
813 gpertea 70 {
814 gpertea 137 fprintf (stderr, "Error: CIGAR op has zero length\n");
815     return 0;
816 gpertea 70 }
817     char op_char = toupper(*t);
818     CigarOpCode opcode;
819 gpertea 137 switch (op_char) {
820     case '=':
821     case 'X':
822     case 'M': opcode = MATCH;
823     refspan+=op_len;
824     break;
825     case 'I': opcode = INS;
826     break;
827     case 'D': opcode = DEL;
828     refspan+=op_len;
829     break;
830     case 'N': if (op_len > max_report_intron_length)
831     return 0;
832     opcode = REF_SKIP;
833     spliced_alignment = true;
834     refspan+=op_len;
835     break;
836     case 'S': opcode = SOFT_CLIP;
837     break;
838     case 'H': opcode = HARD_CLIP;
839     break;
840     case 'P': opcode = PAD;
841     break;
842     default: fprintf (stderr, "Error: invalid CIGAR operation\n");
843     return 0;
844     }
845 gpertea 70 p_cig = t + 1;
846 gpertea 137 cigar.push_back(CigarOp(opcode, op_len));
847     } //while cigar codes
848 gpertea 70 if (*p_cig) {
849 gpertea 137 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
850     return 0;
851 gpertea 70 }
852 gpertea 137 return refspan;
853 gpertea 70 }
854    
855 gpertea 154 int getBAMmismatches(const bam1_t* buf, vector<CigarOp>& cigar,
856     vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
857     int gspan=0;//genomic span of the alignment
858     sam_nm=0;
859     int num_mismatches=0;
860    
861     uint8_t* ptr = bam_aux_get(buf, "XS");
862     if (ptr) {
863     char src_strand_char = bam_aux2A(ptr);
864     if (src_strand_char == '-')
865     antisense_splice = true;
866     }
867    
868     ptr = bam_aux_get(buf, "MD");
869     if (ptr) {
870     const char* p = bam_aux2Z(ptr);
871     int bi=0; //base offset position in the read
872     while (*p != 0) {
873     if (isdigit(*p)) {
874     int v=atoi(p);
875     do { p++; } while (isdigit(*p));
876     bi+=v;
877     }
878     while (isalpha(*p)) {
879     p++;
880     num_mismatches++;
881     //mismatches.push_back(bi);
882     mismatches[bi]=true;
883     bi++;
884     }
885     if (*p=='^') { //reference deletion
886     p++;
887     while (isalpha(*p)) { //insert read bases
888     p++; bi++;
889     }
890     }
891     }
892     }
893    
894     /* By convention,the NM field of the SAM record
895     * counts an insertion or deletion. I dont' think
896     * we want the mismatch count in the BowtieHit
897     * record to reflect this. Therefore, subtract out
898     * the mismatches due to in/dels
899     */
900     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
901     switch (itr->opcode)
902     {
903     case MATCH:
904     case REF_SKIP:
905     case PAD:
906     gspan += itr->length;
907     break;
908     case DEL:
909     gspan += itr->length;
910     sam_nm -= itr->length;
911     break;
912     case INS:
913     sam_nm -= itr->length;
914     break;
915     default:
916     break;
917     }
918     }
919     return num_mismatches;
920     }
921    
922 gpertea 137 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
923 gpertea 139 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
924 gpertea 137 int gspan=0;//genomic span of the alignment
925     const char* tag_buf = buf;
926 gpertea 70 sam_nm=0;
927 gpertea 139 int num_mismatches=0;
928 gpertea 70 while((tag_buf = get_token((char**)&buf,"\t")))
929     {
930     vector<string> tuple_fields;
931     tokenize(tag_buf,":", tuple_fields);
932     if (tuple_fields.size() == 3)
933     {
934     if (tuple_fields[0] == "XS")
935     {
936     if (tuple_fields[2] == "-")
937     antisense_splice = true;
938     }
939     else if (tuple_fields[0] == "NM")
940     {
941     sam_nm = atoi(tuple_fields[2].c_str());
942     }
943     else if (tuple_fields[0] == "NS")
944     {
945     //ignored for now
946     }
947     else if (tuple_fields[0] == "MD")
948     {
949     const char* p=tuple_fields[2].c_str();
950     int bi=0; //base offset position in the read
951     while (*p != 0) {
952     if (isdigit(*p)) {
953     int v=atoi(p);
954     do { p++; } while (isdigit(*p));
955     bi+=v;
956     }
957 gpertea 137 while (isalpha(*p)) {
958 gpertea 70 p++;
959 gpertea 139 num_mismatches++;
960     //mismatches.push_back(bi);
961     mismatches[bi]=true;
962 gpertea 70 bi++;
963     }
964     if (*p=='^') { //reference deletion
965     p++;
966 gpertea 137 while (isalpha(*p)) { //insert read bases
967 gpertea 70 p++; bi++;
968 gpertea 137 }
969 gpertea 70 }
970     }
971     }
972 gpertea 139 //else
973     //{
974 gpertea 70 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
975     //return false;
976 gpertea 139 //}
977 gpertea 70 }
978     }
979     /* By convention,the NM field of the SAM record
980     * counts an insertion or deletion. I dont' think
981     * we want the mismatch count in the BowtieHit
982     * record to reflect this. Therefore, subtract out
983     * the mismatches due to in/dels
984     */
985     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
986 gpertea 137 switch (itr->opcode)
987     {
988     case MATCH:
989     case REF_SKIP:
990     case PAD:
991     gspan += itr->length;
992     break;
993     case DEL:
994     gspan += itr->length;
995     sam_nm -= itr->length;
996     break;
997     case INS:
998     sam_nm -= itr->length;
999     break;
1000 gpertea 154 default:
1001     break;
1002 gpertea 70 }
1003 gpertea 137 }
1004 gpertea 139 return num_mismatches;
1005 gpertea 70 }
1006    
1007 gpertea 29 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1008     BowtieHit& bh,
1009     bool strip_slash,
1010     char* name_out,
1011     char* name_tags,
1012     char* seq,
1013     char* qual)
1014     {
1015     if (!orig_bwt_buf || !*orig_bwt_buf)
1016     return false;
1017     char bwt_buf[2048];
1018     strcpy(bwt_buf, orig_bwt_buf);
1019     // Are we still in the header region?
1020     if (bwt_buf[0] == '@')
1021     return false;
1022 gpertea 154
1023 gpertea 29 char* buf = bwt_buf;
1024     char* name = get_token((char**)&buf,"\t");
1025     char* sam_flag_str = get_token((char**)&buf,"\t");
1026     char* text_name = get_token((char**)&buf,"\t");
1027     char* text_offset_str = get_token((char**)&buf,"\t");
1028     const char* map_qual_str = get_token((char**)&buf,"\t");
1029     char* cigar_str = get_token((char**)&buf,"\t");
1030     const char* mate_ref_str = get_token((char**)&buf,"\t");
1031     const char* mate_pos_str = get_token((char**)&buf,"\t");
1032     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1033    
1034     const char* seq_str = get_token((char**)&buf,"\t");
1035     if (seq)
1036     strcpy(seq, seq_str);
1037     const char* qual_str = get_token((char**)&buf,"\t");
1038     if (qual)
1039     strcpy(qual, qual_str);
1040    
1041     if (!name ||
1042     !sam_flag_str ||
1043     !text_name ||
1044     !text_offset_str ||
1045     !map_qual_str ||
1046     !cigar_str ||
1047     !mate_ref_str ||
1048     !mate_pos_str ||
1049     !inferred_insert_sz_str ||
1050     !seq_str ||
1051     !qual_str)
1052     {
1053     // truncated or malformed SAM record
1054     return false;
1055 gpertea 154 }
1056    
1057 gpertea 29 int sam_flag = atoi(sam_flag_str);
1058 gpertea 154 string ref_name = text_name, ref_name2 = "";
1059 gpertea 29 int text_offset = atoi(text_offset_str);
1060    
1061     bool end = true;
1062     unsigned int seg_offset = 0;
1063     unsigned int seg_num = 0;
1064     unsigned int num_segs = 0;
1065    
1066     // Copy the tag out of the name field before we might wipe it out
1067 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1068 gpertea 154
1069 gpertea 29 vector<CigarOp> cigar;
1070     bool spliced_alignment = false;
1071 gpertea 154
1072 gpertea 137 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
1073     if (refspan==0)
1074 gpertea 70 return false;
1075 gpertea 29 //vector<string> attributes;
1076     //tokenize(tag_buf, " \t",attributes);
1077 gpertea 70
1078 gpertea 29 bool antisense_splice = false;
1079 gpertea 70 int sam_nm = 0; //the value of the NM tag (edit distance)
1080 gpertea 137 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
1081 gpertea 139 vector<bool> mismatches;
1082     mismatches.resize(strlen(seq_str), false);
1083     int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
1084 gpertea 154
1085 gpertea 29 if (spliced_alignment)
1086     {
1087     bh = create_hit(name,
1088 gpertea 154 ref_name,
1089     ref_name2,
1090 gpertea 29 text_offset - 1,
1091     cigar,
1092     sam_flag & 0x0010,
1093     antisense_splice,
1094 gpertea 139 num_mismatches,
1095 gpertea 70 0,
1096 gpertea 29 end);
1097     }
1098     else
1099     {
1100     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1101     bh = create_hit(name,
1102 gpertea 154 ref_name,
1103     ref_name2,
1104 gpertea 29 text_offset - 1, // SAM files are 1-indexed
1105     cigar,
1106     sam_flag & 0x0010,
1107     false,
1108 gpertea 139 num_mismatches,
1109 gpertea 29 0,
1110     end);
1111     }
1112 gpertea 70 return true;
1113 gpertea 29 }
1114    
1115 gpertea 137 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
1116     if (op.length<=0) return;
1117     if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
1118     cigar.back().length+=op.length;
1119     }
1120     cigar.push_back(op);
1121     }
1122 gpertea 70
1123 gpertea 159 bool spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches,
1124     int &left, int spl_start, int spl_len, CigarOpCode spl_code, int& spl_mismatches) {
1125     //merge the original 'cigar' with the new insert/gap operation
1126     //at position spl_start and place the result into splcigar;
1127     //TODO: ideally this should also get and rebuild the MD string (alignment mismatches)
1128 gpertea 139
1129 gpertea 159 //return value: mismatches in the insert region for INS case,
1130     //or number of mismatches in the anchor region
1131     //return -1 if somehow the hit seems bad
1132    
1133     //these offsets are relative to the beginning of alignment on reference
1134     int spl_ofs=spl_start-left; //relative position of splice op
1135     if (spl_code == FUSION_FF || spl_code == FUSION_FR || spl_code == FUSION_RF || spl_code == FUSION_RR)
1136     spl_ofs = abs(spl_ofs);
1137     int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
1138     CigarOp gapop(spl_code, spl_len); //for DEL, REF_SKIP, FUSIONS
1139     if (spl_code==INS)
1140     spl_ofs_end += spl_len;
1141    
1142     int ref_ofs=0; //working offset on reference
1143     int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
1144     bool xfound=false;
1145     //if (left<=spl_start+spl_len) {
1146     if (spl_ofs_end>0) {
1147     int prev_opcode=0;
1148     int prev_oplen=0;
1149     for (size_t c = 0 ; c < cigar.size(); ++c)
1150 gpertea 137 {
1151     int prev_read_ofs=read_ofs;
1152     int cur_op_ofs=ref_ofs;
1153     int cur_opcode=cigar[c].opcode;
1154     int cur_oplen=cigar[c].length;
1155 gpertea 154
1156 gpertea 137 switch (cur_opcode) {
1157     case MATCH:
1158     ref_ofs+=cur_oplen;
1159     read_ofs+=cur_oplen;
1160 gpertea 154 if (spl_code==REF_SKIP || spl_code==DEL ||
1161     spl_code==FUSION_FF || spl_code==FUSION_FR ||
1162     spl_code==FUSION_RF || spl_code==FUSION_RR) {
1163     for (int o=cur_op_ofs;o<ref_ofs;o++) {
1164     int rofs=prev_read_ofs+(o-cur_op_ofs);
1165     if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs])
1166     spl_mismatches++;
1167     }
1168     }
1169     else if (spl_code==INS) {
1170     for (int o=cur_op_ofs;o<ref_ofs;o++) {
1171     int rofs=prev_read_ofs+(o-cur_op_ofs);
1172     if (o>=spl_ofs && o<spl_ofs_end && mismatches[rofs])
1173     spl_mismatches++;
1174     }
1175     }
1176 gpertea 137 break;
1177     case DEL:
1178     case REF_SKIP:
1179     case PAD:
1180     ref_ofs+=cur_oplen;
1181     break;
1182     case SOFT_CLIP:
1183     case INS:
1184     read_ofs+=cur_oplen;
1185     break;
1186     //case HARD_CLIP:
1187     }
1188 gpertea 139
1189 gpertea 137 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1190 gpertea 159 if (cur_op_ofs==spl_ofs_end) {
1191     if (spl_code!=INS) {
1192     if (cur_opcode!=INS) {
1193     xfound=true;
1194     //we have to insert the gap here first
1195     cigar_add(splcigar, gapop);
1196     //also, check
1197     }
1198     }
1199     }
1200    
1201     CigarOp op(cigar[c]);
1202     if (xfound)
1203     {
1204     if (spl_code == FUSION_FR || spl_code == FUSION_RR)
1205     {
1206     if (op.opcode == MATCH)
1207     op.opcode = mATCH;
1208     else if (op.opcode == INS)
1209     op.opcode = iNS;
1210     else if (op.opcode == DEL)
1211     op.opcode = dEL;
1212     else if (op.opcode == REF_SKIP)
1213     op.opcode = rEF_SKIP;
1214     }
1215     }
1216     else
1217     {
1218     if (spl_code == FUSION_RF || spl_code == FUSION_RR)
1219     {
1220     if (op.opcode == MATCH)
1221     op.opcode = mATCH;
1222     else if (op.opcode == INS)
1223     op.opcode = iNS;
1224     else if (op.opcode == DEL)
1225     op.opcode = dEL;
1226     else if (op.opcode == REF_SKIP)
1227     op.opcode = rEF_SKIP;
1228     }
1229     }
1230    
1231     cigar_add(splcigar, op);
1232     }
1233 gpertea 137 else //if (ref_ofs>spl_ofs) {
1234     { //op intersection
1235     xfound=true;
1236     if (spl_code==INS) {
1237     //we have to shorten cur_opcode
1238     // find the overlap between current range
1239     int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1240     int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1241 gpertea 154
1242     CigarOp op(cigar[c]);
1243     op.length=spl_ofs-cur_op_ofs;
1244     if (spl_ofs>cur_op_ofs)
1245     cigar_add(splcigar, op);
1246     if (spl_ofs<0)
1247     {
1248     CigarOp temp = gapop;
1249     temp.length += spl_ofs;
1250     if (temp.length>0)
1251     cigar_add(splcigar, temp);
1252     }
1253     else
1254     cigar_add(splcigar, gapop);
1255     op.length=ref_ofs-spl_ofs_end;
1256     if (ref_ofs>spl_ofs_end)
1257     cigar_add(splcigar,op);
1258 gpertea 137 }
1259 gpertea 154 else {//DEL or REF_SKIP or FUSION_[FR][FR]
1260 gpertea 137 //spl_ofs == spl_ofs_end
1261     //we have to split cur_opcode
1262 gpertea 139 //look for mismatches within min_anchor_len distance from splice point
1263 gpertea 137 CigarOp op(cigar[c]);
1264 gpertea 154 CigarOpCode opcode = op.opcode;
1265 gpertea 137 op.length=spl_ofs-cur_op_ofs;
1266 gpertea 159 if (spl_code == FUSION_RF || spl_code == FUSION_RR)
1267 gpertea 154 {
1268     if (opcode == MATCH)
1269     op.opcode = mATCH;
1270     else if (opcode == INS)
1271     op.opcode = iNS;
1272     else if (opcode == DEL)
1273     op.opcode = dEL;
1274     else if (opcode == REF_SKIP)
1275     op.opcode = rEF_SKIP;
1276     }
1277 gpertea 137 cigar_add(splcigar, op);
1278 gpertea 154
1279     cigar_add(splcigar, gapop);
1280    
1281     op.opcode = opcode;
1282 gpertea 159 if (spl_code == FUSION_FR || spl_code == FUSION_RR)
1283 gpertea 154 {
1284     if (opcode == MATCH)
1285     op.opcode = mATCH;
1286     else if (opcode == INS)
1287     op.opcode = iNS;
1288     else if (opcode == DEL)
1289     op.opcode = dEL;
1290     else if (opcode == REF_SKIP)
1291     op.opcode = rEF_SKIP;
1292     }
1293 gpertea 137 op.length=ref_ofs-spl_ofs;
1294     cigar_add(splcigar,op);
1295     }
1296     } //op intersection
1297     prev_opcode=cur_opcode;
1298     prev_oplen=cur_oplen;
1299     } //for each cigar opcode
1300     } //intersection possible
1301 gpertea 139
1302     //if (!xfound) {//no intersection found between splice event and alignment
1303 gpertea 137 if (spl_ofs_end<=0) {
1304     //alignment starts after the splice event
1305     if (spl_code==INS) left-=spl_len;
1306     else left+=spl_len;
1307 gpertea 154
1308     splcigar = cigar;
1309 gpertea 137 }
1310     //else {
1311     //alignment ends before the splice event
1312     //nothing to do
1313     // }
1314 gpertea 139 //return spl_mismatches;
1315     // }
1316 gpertea 97
1317 gpertea 159 return xfound;
1318 gpertea 70 }
1319    
1320 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1321     BowtieHit& bh,
1322     bool strip_slash,
1323     char* name_out,
1324     char* name_tags,
1325     char* seq,
1326     char* qual)
1327     {
1328     if (!orig_bwt_buf || !*orig_bwt_buf)
1329     return false;
1330 gpertea 29
1331 gpertea 69 char bwt_buf[2048];
1332     strcpy(bwt_buf, orig_bwt_buf);
1333     // Are we still in the header region?
1334     if (bwt_buf[0] == '@')
1335     return false;
1336 gpertea 29
1337 gpertea 69 char* buf = bwt_buf;
1338     char* name = get_token((char**)&buf,"\t");
1339     char* sam_flag_str = get_token((char**)&buf,"\t");
1340     char* text_name = get_token((char**)&buf,"\t");
1341     char* text_offset_str = get_token((char**)&buf,"\t");
1342     const char* map_qual_str = get_token((char**)&buf,"\t");
1343     char* cigar_str = get_token((char**)&buf,"\t");
1344     const char* mate_ref_str = get_token((char**)&buf,"\t");
1345     const char* mate_pos_str = get_token((char**)&buf,"\t");
1346     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1347 gpertea 137 //int num_mismatches=0;
1348     //int mismatches[1024]; //list of 0-based mismatch positions in this read
1349 gpertea 70 //parsed from SAM's MD:Z: tag
1350 gpertea 69 const char* seq_str = get_token((char**)&buf,"\t");
1351     if (seq)
1352     strcpy(seq, seq_str);
1353    
1354     const char* qual_str = get_token((char**)&buf,"\t");
1355     if (qual)
1356     strcpy(qual, qual_str);
1357    
1358     if (!name ||
1359     !sam_flag_str ||
1360     !text_name ||
1361     !text_offset_str ||
1362     !map_qual_str ||
1363     !cigar_str ||
1364     !mate_ref_str ||
1365     !mate_pos_str ||
1366     !inferred_insert_sz_str ||
1367     !seq_str ||
1368     !qual_str)
1369     {
1370     // truncated or malformed SAM record
1371     return false;
1372     }
1373    
1374     int sam_flag = atoi(sam_flag_str);
1375     int text_offset = atoi(text_offset_str);
1376 gpertea 137 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1377 gpertea 69 bool end = true;
1378     unsigned int seg_offset = 0;
1379     unsigned int seg_num = 0;
1380     unsigned int num_segs = 0;
1381    
1382     // Copy the tag out of the name field before we might wipe it out
1383 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1384 gpertea 69
1385 gpertea 70 vector<CigarOp> samcigar;
1386     bool spliced_alignment = false;
1387 gpertea 137 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1388 gpertea 154
1389 gpertea 137 if (refspan==0)
1390     return false;
1391 gpertea 70 bool antisense_splice = false;
1392     int sam_nm = 0;
1393 gpertea 139 vector<bool> mismatches;
1394     mismatches.resize(strlen(seq_str), false);
1395 gpertea 69
1396 gpertea 139 int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1397    
1398 gpertea 70 //##############################################
1399 gpertea 69
1400     // Add this alignment to the table of hits for this half of the
1401     // Bowtie map
1402    
1403     // Parse the text_name field to recover the splice coords
1404     vector<string> toks;
1405     tokenize_strict(text_name, "|", toks);
1406    
1407     int num_extra_toks = (int)toks.size() - 6;
1408    
1409     if (num_extra_toks >= 0)
1410     {
1411     static const uint8_t left_window_edge_field = 1;
1412     static const uint8_t splice_field = 2;
1413     //static const uint8_t right_window_edge_field = 3;
1414     static const uint8_t junction_type_field = 4;
1415     static const uint8_t strand_field = 5;
1416    
1417     string contig = toks[0];
1418     for (int t = 1; t <= num_extra_toks; ++t)
1419     {
1420     contig += "|";
1421     contig += toks[t];
1422     }
1423    
1424     vector<string> splice_toks;
1425     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1426     if (splice_toks.size() != 2)
1427     {
1428     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1429 gpertea 70 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1430 gpertea 69 // toks[num_extra_toks + splice_field].c_str());
1431     return false;
1432     }
1433 gpertea 137
1434     string junction_strand = toks[num_extra_toks + strand_field];
1435     if(junction_strand != "rev" && junction_strand != "fwd"){
1436     fprintf(stderr, "Malformed insertion record\n");
1437     return false;
1438     }
1439    
1440 gpertea 154 //
1441     // check for an insertion hit
1442     //
1443     if(toks[num_extra_toks + junction_type_field] == "ins")
1444     {
1445     //int8_t spliced_read_len = strlen(seq_str);
1446     //TODO FIXME: use the CIGAR instead of seq length!
1447     // The 0-based position of the left edge of the alignment. Note that this
1448     // value may need to be further corrected to account for the presence of
1449     // of the insertion.
1450     int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1451    
1452     // The 0-based position of the last genomic sequence before the insertion
1453     int left_splice_pos = atoi(splice_toks[0].c_str());
1454    
1455     string insertedSequence = splice_toks[1];
1456     // The 0-based position of the first genomic sequence after the insertion
1457 gpertea 69
1458 gpertea 154 vector<CigarOp> splcigar;
1459     //this also updates left to the adjusted genomic coordinates
1460 gpertea 159 int spl_num_mismatches=0;
1461     bool overlapped = spliceCigar(splcigar, samcigar, mismatches,
1462     left, left_splice_pos+1, insertedSequence.length(),
1463     INS, spl_num_mismatches);
1464    
1465     if (!overlapped)
1466     return false;
1467 gpertea 69
1468 gpertea 154 if (spl_num_mismatches<0) return false;
1469     num_mismatches-=spl_num_mismatches;
1470     /*
1471     uint32_t right_splice_pos = left_splice_pos + 1;
1472    
1473     //uint32_t right = left + spliced_read_len - 1;
1474     int right = left + refspan - 1;
1475    
1476     if(left > left_splice_pos){
1477     //The genomic position of the left edge of the alignment needs to be corrected
1478     //If the alignment does not extend into the insertion, simply subtract the length
1479     //of the inserted sequence, otherwise, just set it equal to the right edge
1480     left = left - insertedSequence.length();
1481     if(left < right_splice_pos){
1482     left = right_splice_pos;
1483     }
1484     }
1485     if(right > left_splice_pos){
1486     right = right - insertedSequence.length();
1487     if(right < left_splice_pos){
1488     right = left_splice_pos;
1489     }
1490     }
1491     // Now, right and left should be properly transformed into genomic coordinates
1492     // We should be able to deduce how much the alignment matches the insertion
1493     // simply based on the length of the read
1494     int left_match_length = 0;
1495     if(left <= left_splice_pos){
1496     left_match_length = left_splice_pos - left + 1;
1497     }
1498     int right_match_length = 0;
1499     if(right >= right_splice_pos){
1500     right_match_length = right - right_splice_pos + 1;
1501     }
1502     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1503    
1504     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1505     return false;
1506    
1507     //char* pch = strtok( mismatches, ",");
1508     //unsigned char num_mismatches = 0;
1509     //text_offset holds the left end of the alignment,
1510     //RELATIVE TO the start of the contig
1511    
1512     //The 0-based relative position of the left-most character
1513     //before the insertion in the contig
1514     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1515     for (size_t i=0;i<mismatches.size();++i) {
1516     int mismatch_pos = mismatches[i];
1517     // for reversely mapped reads,
1518     //find the correct mismatched position.
1519     if (sam_flag & 0x0010){
1520     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1521     }
1522    
1523     //Only count mismatches outside of the insertion region
1524     // If there is a mismatch within the insertion,
1525     // disallow this hit
1526     if(mismatch_pos + text_offset <= relative_splice_pos ||
1527     mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1528     spl_num_mismatches++;
1529     }else{
1530     return false;
1531     }
1532     }
1533     */
1534     //vector<CigarOp> splcigar;
1535     //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1536     //splcigar.push_back(CigarOp(MATCH, left_match_length));
1537     //splcigar.push_back(CigarOp(INS, insertion_match_length));
1538     //splcigar.push_back(CigarOp(MATCH, right_match_length));
1539    
1540     bh = create_hit(name,
1541     contig,
1542     "",
1543     left,
1544     //splcigar,
1545     splcigar,
1546     sam_flag & 0x0010,
1547     junction_strand == "rev",
1548     num_mismatches,
1549     0,
1550     end);
1551 gpertea 70
1552 gpertea 154 return true;
1553     } //"ins"
1554     else //"del" or intron
1555     {
1556     // The 0-based position of the left edge of the alignment.
1557     int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1558    
1559     // The 0-based position of the last genomic sequence before the deletion
1560     int left_splice_pos = atoi(splice_toks[0].c_str());
1561    
1562 gpertea 139 int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
1563     /*
1564 gpertea 154 if ((sam_flag & 0x0010) == 0) //######
1565     {
1566     if (left_splice_ofs + seg_offset < _anchor_length)
1567     return false;
1568 gpertea 135 }
1569 gpertea 154 else
1570 gpertea 135 {
1571 gpertea 154 if (right_splice_ofs + seg_offset < _anchor_length)
1572     return false;
1573 gpertea 135 }
1574 gpertea 154 */
1575     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1576     //atoi(toks[right_window_edge_field].c_str());
1577    
1578 gpertea 139 /*
1579     //offset of deletion point, relative to the beginning of the alignment
1580     int left_splice_ofs = left_splice_pos-left+1;
1581 gpertea 154
1582     int mismatches_in_anchor = 0;
1583     for (size_t i=0;i<mismatches.size();++i) {
1584     spl_num_mismatches++;
1585     int mismatch_pos = mismatches[i];
1586     if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) ||
1587     ((sam_flag & 0x0010) != 0 &&
1588     abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len)
1589     mismatches_in_anchor++;
1590     }
1591     */
1592     vector<CigarOp> splcigar;
1593 gpertea 139
1594 gpertea 154 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP;
1595    
1596 gpertea 159 int spl_num_mismatches=0;
1597     bool overlapped = spliceCigar(splcigar, samcigar, mismatches, left,
1598     left_splice_pos+1, gap_len, opcode, spl_num_mismatches);
1599 gpertea 140
1600 gpertea 159 if (!overlapped)
1601     return false;
1602    
1603 gpertea 154 if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
1604     return false;
1605 gpertea 139 /*
1606 gpertea 154 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1607     if(toks[num_extra_toks + junction_type_field] == "del"){
1608     splcigar.push_back(CigarOp(DEL, gap_len));
1609     }else{
1610     splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1611     }
1612     splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1613 gpertea 139 */
1614 gpertea 154 bh = create_hit(name,
1615     contig,
1616     "",
1617     left,
1618     splcigar,
1619     (sam_flag & 0x0010),
1620     junction_strand == "rev",
1621     num_mismatches,
1622     spl_num_mismatches,
1623     end);
1624    
1625     return true;
1626     }
1627 gpertea 70 } //parse splice data
1628 gpertea 69 else
1629 gpertea 154 {
1630     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1631     //fprintf(stderr, "%s\n", orig_bwt_buf);
1632     // continue;
1633     return false;
1634     }
1635    
1636 gpertea 69 return false;
1637     }
1638    
1639    
1640 gpertea 135
1641 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1642 gpertea 159 BowtieHit& bh, bool strip_slash,
1643     char* name_out, char* name_tags,
1644     char* seq, char* qual) {
1645 gpertea 154 if (_sam_header==NULL)
1646     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1647     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1648    
1649     uint32_t sam_flag = hit_buf->core.flag;
1650    
1651     int text_offset = hit_buf->core.pos;
1652     int text_mate_pos = hit_buf->core.mpos;
1653     int target_id = hit_buf->core.tid;
1654     int mate_target_id = hit_buf->core.mtid;
1655 gpertea 29
1656 gpertea 154 vector<CigarOp> cigar;
1657     bool spliced_alignment = false;
1658     int num_hits = 1;
1659    
1660     double mapQ = hit_buf->core.qual;
1661     long double error_prob;
1662     if (mapQ > 0)
1663     {
1664     long double p = (-1.0 * mapQ) / 10.0;
1665     error_prob = pow(10.0L, p);
1666     }
1667     else
1668     {
1669     error_prob = 1.0;
1670     }
1671    
1672     bool end = true;
1673     unsigned int seg_offset = 0;
1674     unsigned int seg_num = 0;
1675     unsigned int num_segs = 0;
1676     // Copy the tag out of the name field before we might wipe it out
1677     char* qname = bam1_qname(hit_buf);
1678     char* pipe = strrchr(qname, '|');
1679     if (pipe)
1680     {
1681     if (name_tags)
1682     strcpy(name_tags, pipe);
1683    
1684     char* tag_buf = pipe + 1;
1685     if (strchr(tag_buf, ':'))
1686 gpertea 29 {
1687 gpertea 154 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1688     if (seg_num + 1 == num_segs)
1689     end = true;
1690     else
1691     end = false;
1692 gpertea 29 }
1693 gpertea 154
1694     *pipe = 0;
1695     }
1696 gpertea 29
1697 gpertea 154 if (target_id < 0) {
1698     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1699     bh = create_hit(qname,
1700     "*", //ref_name
1701     0, //left coord
1702     0, //read_len
1703     false, //antisense_aln
1704     0, //edit_dist
1705     end);
1706     return true;
1707     }
1708 gpertea 29
1709 gpertea 154 if (seq!=NULL) {
1710     char *bseq = (char*)bam1_seq(hit_buf);
1711     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1712     char v = bam1_seqi(bseq,i);
1713     seq[i]=bam_nt16_rev_table[v];
1714 gpertea 29 }
1715 gpertea 154 seq[hit_buf->core.l_qseq]=0;
1716     }
1717     if (qual!=NULL) {
1718     char *bq = (char*)bam1_qual(hit_buf);
1719     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1720     qual[i]=bq[i]+33;
1721     }
1722     qual[hit_buf->core.l_qseq]=0;
1723     }
1724 gpertea 29
1725 gpertea 154 bool antisense_splice=false;
1726     unsigned char num_mismatches = 0;
1727     unsigned char num_splice_anchor_mismatches = 0;
1728    
1729     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1730     if (ptr) {
1731     char src_strand_char = bam_aux2A(ptr);
1732     if (src_strand_char == '-')
1733     antisense_splice = true;
1734     }
1735    
1736     ptr = bam_aux_get(hit_buf, "NM");
1737     if (ptr) {
1738     num_mismatches = bam_aux2i(ptr);
1739     }
1740    
1741     ptr = bam_aux_get(hit_buf, "NH");
1742     if (ptr) {
1743     num_hits = bam_aux2i(ptr);
1744     }
1745 gpertea 29
1746 gpertea 154 string text_name = _sam_header->target_name[target_id];
1747     string text_name2 = "";
1748    
1749     bool fusion_alignment = false;
1750     string fusion_cigar_str;
1751     ptr = bam_aux_get(hit_buf, "XF");
1752     if (ptr) {
1753     fusion_alignment = true;
1754     char* xf = bam_aux2Z(ptr);
1755    
1756     // ignore the second part of a fusion alignment
1757     if (xf[0] == '2')
1758     return false;
1759    
1760     vector<string> fields;
1761     tokenize(xf, " ", fields);
1762    
1763     vector<string> contigs;
1764     tokenize(fields[1], "-", contigs);
1765     if (contigs.size() >= 2)
1766     {
1767     text_name = contigs[0];
1768     text_name2 = contigs[1];
1769     }
1770    
1771     text_offset = atoi(fields[2].c_str()) - 1;
1772     fusion_cigar_str = fields[3].c_str();
1773    
1774     if (seq)
1775     strcpy(seq, fields[4].c_str());
1776     if (qual)
1777     strcpy(qual, fields[5].c_str());
1778     }
1779    
1780     if (fusion_alignment) {
1781     const char* p_cig = fusion_cigar_str.c_str();
1782     while (*p_cig) {
1783     char* t;
1784     int length = (int)strtol(p_cig, &t, 10);
1785     if (length <= 0)
1786     {
1787     //fprintf (stderr, "CIGAR op has zero length\n");
1788     return false;
1789 gpertea 29 }
1790 gpertea 154 char op_char = *t;
1791     CigarOpCode opcode;
1792     if (op_char == 'M') opcode = MATCH;
1793     else if(op_char == 'm') opcode = mATCH;
1794     else if (op_char == 'I') opcode = INS;
1795     else if (op_char == 'i') opcode = iNS;
1796     else if (op_char == 'D') opcode = DEL;
1797     else if (op_char == 'd') opcode = dEL;
1798     else if (op_char == 'N' || op_char == 'n')
1799 gpertea 29 {
1800 gpertea 154 if (length > max_report_intron_length)
1801     return false;
1802    
1803     if (op_char == 'N')
1804     opcode = REF_SKIP;
1805     else
1806     opcode = rEF_SKIP;
1807     spliced_alignment = true;
1808     }
1809     else if (op_char == 'F')
1810     {
1811     opcode = FUSION_FF;
1812     length = length - 1;
1813     }
1814     else if (op_char == 'S') opcode = SOFT_CLIP;
1815     else if (op_char == 'H') opcode = HARD_CLIP;
1816     else if (op_char == 'P') opcode = PAD;
1817     else
1818     {
1819     fprintf (stderr, "(%d-%d) invalid CIGAR operation\n", length, (int)op_char);
1820     return false;
1821     }
1822     p_cig = t + 1;
1823     cigar.push_back(CigarOp(opcode, length));
1824 gpertea 29
1825 gpertea 154 if(opcode == INS) {
1826     num_mismatches -= length;
1827     }
1828     else if(opcode == DEL) {
1829     num_mismatches -= length;
1830     }
1831    
1832     /*
1833     * update fusion direction.
1834     */
1835     size_t cigar_size = cigar.size();
1836     if (cigar_size >= 3 && cigar[cigar_size - 2].opcode == FUSION_FF)
1837     {
1838     CigarOpCode prev = cigar[cigar_size - 3].opcode;
1839     CigarOpCode next = cigar[cigar_size - 1].opcode;
1840    
1841     bool increase1 = false, increase2 = false;
1842     if (prev == MATCH || prev == DEL || prev == INS || prev == REF_SKIP)
1843     increase1 = true;
1844     if (next == MATCH || next == DEL || next == INS || next == REF_SKIP)
1845     increase2 = true;
1846    
1847     if (increase1 && !increase2)
1848     cigar[cigar_size - 2].opcode = FUSION_FR;
1849     else if (!increase1 && increase2)
1850     cigar[cigar_size - 2].opcode = FUSION_RF;
1851     else if (!increase1 && !increase2)
1852     cigar[cigar_size - 2].opcode = FUSION_RR;
1853 gpertea 29 }
1854 gpertea 154 }
1855     }
1856     else {
1857     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1858     {
1859     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1860     if (length <= 0)
1861     {
1862 gpertea 159 fprintf (stderr, "insert_id: %s - BAM error: CIGAR op has zero length\n", qname);
1863    
1864     // daehwan - remove this
1865     exit(1);
1866    
1867 gpertea 154 return false;
1868     }
1869 gpertea 29
1870 gpertea 154 CigarOpCode opcode;
1871     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1872     {
1873     case BAM_CMATCH: opcode = MATCH; break;
1874     case BAM_CINS: opcode = INS; break;
1875     case BAM_CDEL: opcode = DEL; break;
1876     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1877     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1878     case BAM_CPAD: opcode = PAD; break;
1879     case BAM_CREF_SKIP:
1880     opcode = REF_SKIP;
1881     spliced_alignment = true;
1882     if (length > (int)max_report_intron_length)
1883     {
1884     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1885     return false;
1886     }
1887     break;
1888     default:
1889     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1890     return false;
1891     }
1892    
1893     if (opcode != HARD_CLIP)
1894     cigar.push_back(CigarOp(opcode, length));
1895    
1896     /*
1897     * By convention,the NM field of the SAM record
1898     * counts an insertion or deletion. I dont' think
1899     * we want the mismatch count in the BowtieHit
1900     * record to reflect this. Therefore, subtract out
1901     * the mismatches due to in/dels
1902     */
1903     if(opcode == INS){
1904     num_mismatches -= length;
1905     }
1906     else if(opcode == DEL){
1907     num_mismatches -= length;
1908     }
1909     }
1910     }
1911 gpertea 29
1912 gpertea 154
1913     string mrnm;
1914     if (mate_target_id >= 0) {
1915     if (mate_target_id == target_id) {
1916     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1917     mrnm = _sam_header->target_name[mate_target_id];
1918     }
1919     else {
1920     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1921     return false;
1922     }
1923     }
1924     else {
1925     text_mate_pos = 0;
1926     }
1927 gpertea 29
1928 gpertea 154 if (spliced_alignment) {
1929     bh = create_hit(qname,
1930     text_name,
1931     text_name2,
1932     text_offset, // BAM files are 0-indexed
1933     cigar,
1934     sam_flag & 0x0010,
1935     antisense_splice,
1936     num_mismatches,
1937     num_splice_anchor_mismatches,
1938     end);
1939 gpertea 29
1940 gpertea 154 }
1941     else {
1942     bh = create_hit(qname,
1943     text_name,
1944     text_name2,
1945     text_offset, // BAM files are 0-indexed
1946     cigar,
1947     sam_flag & 0x0010,
1948     false,
1949     num_mismatches,
1950     0,
1951     end);
1952     }
1953    
1954     return true;
1955 gpertea 29 }
1956    
1957     bool BAMHitFactory::inspect_header(HitStream& hs)
1958     {
1959     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1960    
1961     if (header == NULL) {
1962     fprintf(stderr, "Warning: No BAM header\n");
1963     return false;
1964     }
1965     if (header->l_text == 0) {
1966     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1967     return false;
1968     }
1969     return true;
1970     }
1971    
1972 gpertea 154 bool SplicedBAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1973     BowtieHit& bh, bool strip_slash,
1974     char* name_out, char* name_tags,
1975     char* seq, char* qual) {
1976     if (_sam_header==NULL)
1977     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1978    
1979     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1980     uint32_t sam_flag = hit_buf->core.flag;
1981    
1982     int text_offset = hit_buf->core.pos;
1983     int text_mate_pos = hit_buf->core.mpos;
1984     int target_id = hit_buf->core.tid;
1985     int mate_target_id = hit_buf->core.mtid;
1986    
1987     vector<CigarOp> samcigar;
1988     bool spliced_alignment = false;
1989     int num_hits = 1;
1990    
1991     double mapQ = hit_buf->core.qual;
1992     long double error_prob;
1993     if (mapQ > 0)
1994     {
1995     long double p = (-1.0 * mapQ) / 10.0;
1996     error_prob = pow(10.0L, p);
1997     }
1998     else
1999     {
2000     error_prob = 1.0;
2001     }
2002    
2003     if (seq!=NULL) {
2004     char *bseq = (char*)bam1_seq(hit_buf);
2005     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
2006     char v = bam1_seqi(bseq,i);
2007     seq[i]=bam_nt16_rev_table[v];
2008     }
2009     seq[hit_buf->core.l_qseq]=0;
2010     }
2011     if (qual!=NULL) {
2012     char *bq = (char*)bam1_qual(hit_buf);
2013     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
2014     qual[i]=bq[i]+33;
2015     }
2016     qual[hit_buf->core.l_qseq]=0;
2017     }
2018 gpertea 29
2019 gpertea 154 bool end = true;
2020     unsigned int seg_offset = 0;
2021     unsigned int seg_num = 0;
2022     unsigned int num_segs = 0;
2023     // Copy the tag out of the name field before we might wipe it out
2024     char* name = bam1_qname(hit_buf);
2025     parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
2026    
2027     if (target_id < 0) {
2028     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
2029     bh = create_hit(name,
2030     "*", //ref_name
2031     0, //left coord
2032     0, //read_len
2033     false, //antisense_aln
2034     0, //edit_dist
2035     end);
2036     return true;
2037     }
2038    
2039     string text_name = _sam_header->target_name[target_id];
2040     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
2041     {
2042     //char* t;
2043    
2044     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
2045     if (length <= 0)
2046     {
2047     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
2048     return false;
2049     }
2050    
2051     CigarOpCode opcode;
2052     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
2053     {
2054     case BAM_CMATCH: opcode = MATCH; break;
2055     case BAM_CINS: opcode = INS; break;
2056     case BAM_CDEL: opcode = DEL; break;
2057     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
2058     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
2059     case BAM_CPAD: opcode = PAD; break;
2060     case BAM_CREF_SKIP:
2061     opcode = REF_SKIP;
2062     spliced_alignment = true;
2063     if (length > (int)max_report_intron_length)
2064     {
2065     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
2066     return false;
2067     }
2068     break;
2069     default:
2070     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
2071     return false;
2072     }
2073     if (opcode != HARD_CLIP)
2074     samcigar.push_back(CigarOp(opcode, length));
2075     }
2076    
2077     string mrnm;
2078     if (mate_target_id >= 0) {
2079     if (mate_target_id == target_id) {
2080     mrnm = _sam_header->target_name[mate_target_id];
2081     }
2082     else {
2083     return false;
2084     }
2085     }
2086     else {
2087     text_mate_pos = 0;
2088     }
2089    
2090     bool antisense_splice = false;
2091     int sam_nm = 0;
2092     vector<bool> mismatches;
2093     mismatches.resize(strlen(seq), false);
2094     int num_mismatches=getBAMmismatches(hit_buf, samcigar, mismatches, sam_nm, antisense_splice);
2095     unsigned char num_splice_anchor_mismatches = 0;
2096    
2097     //##############################################
2098    
2099     // Add this alignment to the table of hits for this half of the
2100     // Bowtie map
2101    
2102     // Parse the text_name field to recover the splice coords
2103     vector<string> toks;
2104     tokenize_strict(text_name.c_str(), "|", toks);
2105     int num_extra_toks = (int)toks.size() - 6;
2106     if (num_extra_toks >= 0)
2107     {
2108     static const uint8_t left_window_edge_field = 1;
2109     static const uint8_t splice_field = 2;
2110     //static const uint8_t right_window_edge_field = 3;
2111     static const uint8_t junction_type_field = 4;
2112     static const uint8_t strand_field = 5;
2113    
2114     string contig = toks[0];
2115     for (int t = 1; t <= num_extra_toks; ++t)
2116     {
2117     contig += "|";
2118     contig += toks[t];
2119     }
2120    
2121     vector<string> splice_toks;
2122     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
2123     if (splice_toks.size() != 2)
2124     {
2125     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
2126     //fprintf(stderr, "\t%s (token: %s)\n", text_name,
2127     // toks[num_extra_toks + splice_field].c_str());
2128     return false;
2129     }
2130    
2131     const string& junction_type = toks[num_extra_toks + junction_type_field];
2132     const string junction_strand = toks[num_extra_toks + strand_field];
2133    
2134     //
2135     // check for an insertion hit
2136     //
2137     if(junction_type == "ins")
2138     {
2139     //int8_t spliced_read_len = strlen(seq_str);
2140     //TODO FIXME: use the CIGAR instead of seq length!
2141     // The 0-based position of the left edge of the alignment. Note that this
2142     // value may need to be further corrected to account for the presence of
2143     // of the insertion.
2144     int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
2145    
2146     // The 0-based position of the last genomic sequence before the insertion
2147     int left_splice_pos = atoi(splice_toks[0].c_str());
2148    
2149     string insertedSequence = splice_toks[1];
2150     // The 0-based position of the first genomic sequence after the insertion
2151    
2152     vector<CigarOp> splcigar;
2153     //this also updates left to the adjusted genomic coordinates
2154 gpertea 159 int spl_num_mismatches=0;
2155     bool overlapped=spliceCigar(splcigar, samcigar, mismatches,
2156     left, left_splice_pos+1, insertedSequence.length(), INS, spl_num_mismatches);
2157 gpertea 154
2158 gpertea 159 if (!overlapped)
2159     return false;
2160    
2161 gpertea 154 if (spl_num_mismatches<0) return false;
2162     num_mismatches-=spl_num_mismatches;
2163     bh = create_hit(name,
2164     contig,
2165     "",
2166     left,
2167     splcigar,
2168     sam_flag & 0x0010,
2169     junction_strand == "rev",
2170     num_mismatches,
2171     0,
2172     end);
2173 gpertea 159
2174     // daehwan - remove this
2175     if (samcigar.size() > 1 && false)
2176     {
2177     cout << text_name << "\t" << left << endl
2178     << print_cigar(samcigar) << endl
2179     << print_cigar(splcigar) << endl;
2180     }
2181 gpertea 154
2182     return true;
2183     } //"ins"
2184     else //"del", "intron", or "fusion"
2185     {
2186     char orientation = (sam_flag & 0x0010 ? '-' : '+');
2187     if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")||
2188     !(orientation == '-' || orientation == '+'))
2189     {
2190     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
2191     return false;
2192     }
2193    
2194     // The 0-based position of the left edge of the alignment.
2195     int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str());
2196     if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
2197     left += text_offset;
2198     else
2199     left -= text_offset;
2200    
2201     vector<CigarOp> splcigar;
2202     CigarOpCode opcode;
2203     if(junction_type == "del")
2204     opcode = DEL;
2205     else if(junction_type == "fus")
2206     {
2207     if (junction_strand == "ff")
2208     opcode = FUSION_FF;
2209     else if (junction_strand == "fr")
2210     opcode = FUSION_FR;
2211     else if (junction_strand == "rf")
2212     opcode = FUSION_RF;
2213     else
2214     opcode = FUSION_RR;
2215     }
2216     else
2217     opcode = REF_SKIP;
2218    
2219     int left_splice_pos = atoi(splice_toks[0].c_str());
2220    
2221     // The 0-based position of the last genomic sequence before the deletion
2222     int gap_len = 0;
2223     if (junction_type == "fus")
2224     gap_len = atoi(splice_toks[1].c_str());
2225     else
2226     gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
2227    
2228     if (opcode == FUSION_RF || opcode == FUSION_RR)
2229     left_splice_pos -= 1;
2230     else
2231     left_splice_pos += 1;
2232 gpertea 159
2233     int spl_num_mismatches=0;
2234     bool overlapped=spliceCigar(splcigar, samcigar, mismatches, left,
2235     left_splice_pos, gap_len, opcode, spl_num_mismatches);
2236    
2237     if (!overlapped)
2238     return false;
2239 gpertea 154
2240     if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
2241     return false;
2242    
2243     string contig2 = "";
2244     if (junction_type == "fus")
2245     {
2246     vector<string> contigs;
2247     tokenize(contig, "-", contigs);
2248     if (contigs.size() != 2)
2249     return false;
2250    
2251     contig = contigs[0];
2252     contig2 = contigs[1];
2253    
2254     if (junction_strand == "rf" || junction_strand == "rr")
2255     orientation = (orientation == '+' ? '-' : '+');
2256     }
2257    
2258     bh = create_hit(name,
2259     contig,
2260     contig2,
2261     left,
2262     splcigar,
2263     orientation == '-',
2264     junction_strand == "rev",
2265     num_mismatches,
2266     spl_num_mismatches,
2267     end);
2268    
2269 gpertea 159 // daehwan - remove this
2270     if (samcigar.size() > 1 && false)
2271     {
2272     cout << text_name << "\t" << left << endl
2273     << print_cigar(samcigar) << endl
2274     << print_cigar(splcigar) << endl;
2275     }
2276    
2277 gpertea 154 return true;
2278     }
2279     } //parse splice data
2280     else
2281     {
2282     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
2283     //fprintf(stderr, "%s\n", orig_bwt_buf);
2284     // continue;
2285     return false;
2286     }
2287    
2288     return false;
2289     }
2290    
2291 gpertea 29 void get_mapped_reads(FILE* bwtf,
2292     HitTable& hits,
2293     HitFactory& hit_factory,
2294     bool strip_slash,
2295     bool verbose)
2296     {
2297    
2298    
2299     char bwt_buf[2048];
2300     uint32_t reads_extracted = 0;
2301    
2302     while (fgets(bwt_buf, 2048, bwtf))
2303     {
2304     // Chomp the newline
2305     char* nl = strrchr(bwt_buf, '\n');
2306     if (nl) *nl = 0;
2307     if (*bwt_buf == 0)
2308     continue;
2309     // Get a new record from the tab-delimited Bowtie map
2310     BowtieHit bh;
2311     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
2312     {
2313     // Only check uniqueness if these hits are spliced
2314     hits.add_hit(bh, true);
2315     }
2316     reads_extracted++;
2317     }
2318    
2319     // This will sort the map by insert id.
2320     hits.finalize();
2321     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
2322     }
2323    
2324    
2325     /*
2326     AlignStatus status(const BowtieHit* align)
2327     {
2328     if (!align)
2329     return UNALIGNED;
2330     if (align->contiguous())
2331     return CONTIGUOUS;
2332     return SPLICED;
2333     }
2334     */
2335    
2336     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
2337     {
2338     int max_hit_pos = -1;
2339     for (size_t i = 0; i < hits.size(); ++i)
2340     {
2341     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
2342     }
2343    
2344     if ((int)DoC.size() < max_hit_pos)
2345     DoC.resize(max_hit_pos);
2346    
2347     for (size_t i = 0; i < hits.size(); ++i)
2348     {
2349     const BowtieHit& bh = hits[i];
2350    
2351     // split up the coverage contibution for this reads
2352     size_t j = bh.left();
2353     const vector<CigarOp>& cigar = bh.cigar();
2354    
2355     for (size_t c = 0 ; c < cigar.size(); ++c)
2356     {
2357     switch(cigar[c].opcode)
2358     {
2359     case MATCH:
2360     for (size_t m = 0; m < cigar[c].length; ++m)
2361     {
2362     if (DoC[j + m] < 0xFFFF)
2363     DoC[j + m]++;
2364     }
2365     //fall through this case to REF_SKIP is intentional
2366     case REF_SKIP:
2367     j += cigar[c].length;
2368     break;
2369     default:
2370     break;
2371     }
2372    
2373     }
2374     }
2375     }
2376    
2377     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
2378     {
2379     if ((int)DoC.size() < bh.right())
2380     DoC.resize(bh.right());
2381    
2382     // split up the coverage contibution for this reads
2383     size_t j = bh.left();
2384     const vector<CigarOp>& cigar = bh.cigar();
2385    
2386     for (size_t c = 0 ; c < cigar.size(); ++c)
2387     {
2388     switch(cigar[c].opcode)
2389     {
2390     case MATCH:
2391     for (size_t m = 0; m < cigar[c].length; ++m)
2392     {
2393 gpertea 135 if (DoC[j + m] < VMAXINT32)
2394 gpertea 29 DoC[j + m]++;
2395     }
2396     //fall through this case to REF_SKIP is intentional
2397     case REF_SKIP:
2398     j += cigar[c].length;
2399     break;
2400     default:
2401     break;
2402     }
2403    
2404     }
2405     }
2406    
2407     void print_bamhit(GBamWriter& wbam,
2408 gpertea 154 const char* read_name,
2409     const BowtieHit& bh,
2410     const char* ref_name,
2411     const char* ref_name2,
2412     const char* sequence,
2413     const char* qualities,
2414 gpertea 159 bool from_bowtie,
2415     const vector<string>* extra_fields)
2416 gpertea 29 {
2417 gpertea 154 string seq;
2418     string quals;
2419     if (sequence) {
2420     seq = sequence;
2421     quals = qualities;
2422     seq.resize(bh.read_len());
2423     quals.resize(bh.read_len());
2424     }
2425     else {
2426     seq = "*";
2427     }
2428     if (qualities) {
2429     quals = qualities;
2430     quals.resize(bh.read_len());
2431     }
2432     else {
2433     quals = "*";
2434     }
2435    
2436     uint32_t sam_flag = 0;
2437     if (bh.antisense_align())
2438     {
2439     sam_flag |= 0x0010; // BAM_FREVERSE
2440     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
2441     {
2442     reverse_complement(seq);
2443     reverse(quals.begin(), quals.end());
2444     }
2445 gpertea 29 }
2446 gpertea 154
2447     uint32_t sam_pos = bh.left() + 1;
2448     uint32_t map_quality = 255;
2449     char cigar[256];
2450     cigar[0] = 0;
2451     string mate_ref_name = "*";
2452     uint32_t mate_pos = 0;
2453     uint32_t insert_size = 0;
2454     //string qualities = "*";
2455    
2456     const vector<CigarOp>& bh_cigar = bh.cigar();
2457     /*
2458     * In addition to calculating the cigar string,
2459     * we need to figure out how many in/dels are in the
2460     * sequence, so that we can give the correct
2461     * value for the NM tag
2462     */
2463     int indel_distance = 0;
2464     CigarOpCode fusion_dir = FUSION_NOTHING;
2465     for (size_t c = 0; c < bh_cigar.size(); ++c)
2466 gpertea 29 {
2467 gpertea 154 const CigarOp& op = bh_cigar[c];
2468    
2469     char ibuf[64];
2470     sprintf(ibuf, "%d", op.length);
2471     switch(op.opcode)
2472     {
2473     case MATCH:
2474     case mATCH:
2475     strcat(cigar, ibuf);
2476     if (bh_cigar[c].opcode == MATCH)
2477     strcat(cigar, "M");
2478     else
2479     strcat(cigar, "m");
2480     break;
2481     case INS:
2482     case iNS:
2483     strcat(cigar, ibuf);
2484     if (bh_cigar[c].opcode == INS)
2485     strcat(cigar, "I");
2486     else
2487     strcat(cigar, "i");
2488     indel_distance += bh_cigar[c].length;
2489     break;
2490     case DEL:
2491     case dEL:
2492     strcat(cigar, ibuf);
2493     if (bh_cigar[c].opcode == DEL)
2494     strcat(cigar, "D");
2495     else
2496     strcat(cigar, "d");
2497     indel_distance += bh_cigar[c].length;
2498     break;
2499     case REF_SKIP:
2500     case rEF_SKIP:
2501     strcat(cigar, ibuf);
2502     if (bh_cigar[c].opcode == REF_SKIP)
2503     strcat(cigar, "N");
2504     else
2505     strcat(cigar, "n");
2506     break;
2507     case FUSION_FF:
2508     case FUSION_FR:
2509     case FUSION_RF:
2510     case FUSION_RR:
2511     fusion_dir = op.opcode;
2512     sprintf(ibuf, "%d", bh_cigar[c].length + 1);
2513     strcat(cigar, ibuf);
2514     strcat(cigar, "F");
2515     break;
2516     default:
2517     break;
2518     }
2519 gpertea 29 }
2520    
2521 gpertea 154 char cigar1[256] = {0}, cigar2[256] = {0};
2522     string left_seq, right_seq, left_qual, right_qual;
2523     int left1 = -1, left2 = -1;
2524     extract_partial_hits(bh, seq, quals,
2525     cigar1, cigar2, left_seq, right_seq,
2526     left_qual, right_qual, left1, left2);
2527    
2528     bool containsSplice = false;
2529     for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
2530 gpertea 29 {
2531 gpertea 154 if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP)
2532     {
2533     containsSplice = true;
2534     break;
2535     }
2536 gpertea 29 }
2537    
2538 gpertea 154 vector<string> auxdata;
2539 gpertea 159 if (extra_fields)
2540     auxdata.insert(auxdata.end(), extra_fields->begin(), extra_fields->end());
2541    
2542 gpertea 154 if (!sam_readgroup_id.empty())
2543 gpertea 29 {
2544 gpertea 154 string nm("RG:Z:");
2545     nm += sam_readgroup_id;
2546     auxdata.push_back(nm);
2547 gpertea 29 }
2548 gpertea 154
2549     string nm("NM:i:");
2550     str_appendInt(nm, bh.edit_dist() + indel_distance);
2551     auxdata.push_back(nm);
2552    
2553     if (containsSplice) {
2554 gpertea 159 // do not add more than once
2555     bool XS_found = false;
2556     for (size_t i = 0; i < auxdata.size(); ++i)
2557     {
2558     if (auxdata[i].substr(0, 2) == "XS")
2559     {
2560     XS_found = true;
2561     break;
2562     }
2563     }
2564    
2565     if (!XS_found)
2566     {
2567     nm="XS:A:";
2568     nm+=(char)(bh.antisense_splice() ? '-' : '+');
2569     auxdata.push_back(nm);
2570     }
2571 gpertea 154 }
2572    
2573     if (fusion_dir != FUSION_NOTHING)
2574     {
2575     char XF[2048] = {0};
2576     sprintf(XF,
2577     "XF:Z:1 %s-%s %u %s %s %s",
2578     ref_name,
2579     ref_name2,
2580     sam_pos,
2581     cigar,
2582     seq.c_str(),
2583     quals.c_str());
2584     auxdata.push_back(XF);
2585    
2586     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, left1 + 1, map_quality,
2587     cigar1, mate_ref_name.c_str(), mate_pos,
2588     insert_size, left_seq.c_str(), left_qual.c_str(), &auxdata);
2589    
2590     wbam.write(brec);
2591     delete brec;
2592 gpertea 29
2593 gpertea 154 sprintf(XF,
2594     "XF:Z:2 %s-%s %u %s %s %s",
2595     ref_name,
2596     ref_name2,
2597     sam_pos,
2598     cigar,
2599     seq.c_str(),
2600     quals.c_str());
2601     auxdata.back() = XF;
2602    
2603 gpertea 159 brec = wbam.new_record(read_name, sam_flag, ref_name2, left2 + 1, map_quality,
2604 gpertea 154 cigar2, mate_ref_name.c_str(), mate_pos,
2605     insert_size, right_seq.c_str(), right_qual.c_str(), &auxdata);
2606    
2607     wbam.write(brec);
2608     delete brec;
2609     }
2610     else
2611 gpertea 29 {
2612 gpertea 154 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
2613     cigar, mate_ref_name.c_str(), mate_pos,
2614     insert_size, seq.c_str(), quals.c_str(), &auxdata);
2615    
2616     wbam.write(brec);
2617     delete brec;
2618 gpertea 29 }
2619     }
2620    
2621     /**
2622     * Print a vector of cigar operations to a file.
2623     * @param bh_cigar A vector of CigarOps
2624     * @return a string representation of the cigar string
2625     */
2626 gpertea 154 std::string print_cigar(const vector<CigarOp>& bh_cigar){
2627 gpertea 29 char cigar[256];
2628     cigar[0] = 0;
2629     for (size_t c = 0; c < bh_cigar.size(); ++c)
2630     {
2631     char ibuf[64];
2632     sprintf(ibuf, "%d", bh_cigar[c].length);
2633 gpertea 154 strcat(cigar, ibuf);
2634 gpertea 29 switch(bh_cigar[c].opcode)
2635     {
2636 gpertea 154 case MATCH:
2637     strcat(cigar, "M");
2638     break;
2639     case mATCH:
2640     strcat(cigar, "m");
2641     break;
2642     case INS:
2643     strcat(cigar, "I");
2644     break;
2645     case iNS:
2646     strcat(cigar, "i");
2647     break;
2648     case DEL:
2649     strcat(cigar, "D");
2650     break;
2651     case dEL:
2652     strcat(cigar, "d");
2653     break;
2654     case REF_SKIP:
2655     strcat(cigar, "N");
2656     break;
2657     case rEF_SKIP:
2658     strcat(cigar, "n");
2659     break;
2660     case FUSION_FF:
2661     case FUSION_FR:
2662     case FUSION_RF:
2663     case FUSION_RR:
2664     strcat(cigar, "F");
2665     break;
2666     default:
2667     break;
2668 gpertea 29 }
2669     }
2670     string result(cigar);
2671     return result;
2672     }
2673    
2674 gpertea 154 void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual,
2675     char* cigar1, char* cigar2, string& seq1, string& seq2,
2676     string& qual1, string& qual2, int& left1, int& left2)
2677     {
2678     const int left = bh.left();
2679     int right = left;
2680     int fusion_left = -1, fusion_right = -1;
2681    
2682     const vector<CigarOp>& bh_cigar = bh.cigar();
2683    
2684     CigarOpCode fusion_dir = FUSION_NOTHING;
2685     size_t fusion_idx = 0;
2686     size_t left_part_len = 0;
2687     for (size_t c = 0; c < bh_cigar.size(); ++c)
2688     {
2689     const CigarOp& op = bh_cigar[c];
2690     switch(op.opcode)
2691     {
2692     case MATCH:
2693     case REF_SKIP:
2694     case DEL:
2695     right += op.length;
2696     break;
2697     case mATCH:
2698     case rEF_SKIP:
2699     case dEL:
2700     right -= op.length;
2701     break;
2702     case FUSION_FF:
2703     case FUSION_FR:
2704     case FUSION_RF:
2705     case FUSION_RR:
2706     {
2707     fusion_dir = op.opcode;
2708     fusion_idx = c;
2709     if (op.opcode == FUSION_FF || op.opcode == FUSION_FR)
2710     fusion_left = right - 1;
2711     else
2712     fusion_left = right + 1;
2713     fusion_right = right = op.length;
2714     }
2715     break;
2716     default:
2717     break;
2718     }
2719    
2720     if (fusion_dir == FUSION_NOTHING)
2721     {
2722     if (op.opcode == MATCH || op.opcode == mATCH || op.opcode == INS || op.opcode == iNS)
2723     {
2724     left_part_len += op.length;
2725     }
2726     }
2727     }
2728    
2729     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_FR)
2730     {
2731     for (size_t c = 0; c < fusion_idx; ++c)
2732     {
2733     const CigarOp& op = bh_cigar[c];
2734     char ibuf[64];
2735     sprintf(ibuf, "%d", op.length);
2736     strcat(cigar1, ibuf);
2737    
2738     switch (op.opcode)
2739     {
2740     case MATCH:
2741     strcat(cigar1, "M");
2742     break;
2743     case INS:
2744     strcat(cigar1, "I");
2745     break;
2746     case DEL:
2747     strcat(cigar1, "D");
2748     break;
2749     case REF_SKIP:
2750     strcat(cigar1, "N");
2751     break;
2752     default:
2753     assert (0);
2754     break;
2755     }
2756     }
2757     }
2758     else if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2759     {
2760     assert (fusion_idx > 0);
2761     for (int c = fusion_idx - 1; c >=0; --c)
2762     {
2763     const CigarOp& op = bh_cigar[c];
2764     char ibuf[64];
2765     sprintf(ibuf, "%d", op.length);
2766     strcat(cigar1, ibuf);
2767    
2768     switch (op.opcode)
2769     {
2770     case mATCH:
2771     strcat(cigar1, "M");
2772     break;
2773     case iNS:
2774     strcat(cigar1, "I");
2775     break;
2776     case dEL:
2777     strcat(cigar1, "D");
2778     break;
2779     case rEF_SKIP:
2780     strcat(cigar1, "N");
2781     break;
2782     default:
2783     assert (0);
2784     break;
2785     }
2786     }
2787     }
2788    
2789     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RF)
2790     {
2791     for (size_t c = fusion_idx + 1; c < bh_cigar.size(); ++c)
2792     {
2793     const CigarOp& op = bh_cigar[c];
2794     char ibuf[64];
2795     sprintf(ibuf, "%d", op.length);
2796     strcat(cigar2, ibuf);
2797    
2798     switch (op.opcode)
2799     {
2800     case MATCH:
2801     strcat(cigar2, "M");
2802     break;
2803     case INS:
2804     strcat(cigar2, "I");
2805     break;
2806     case DEL:
2807     strcat(cigar2, "D");
2808     break;
2809     case REF_SKIP:
2810     strcat(cigar2, "N");
2811     break;
2812     default:
2813     assert (0);
2814     break;
2815     }
2816     }
2817     }
2818     else if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR)
2819     {
2820     assert (bh_cigar.size() > 0);
2821     for (size_t c = bh_cigar.size() - 1; c > fusion_idx; --c)
2822     {
2823     const CigarOp& op = bh_cigar[c];
2824     char ibuf[64];
2825     sprintf(ibuf, "%d", op.length);
2826     strcat(cigar2, ibuf);
2827    
2828     switch (op.opcode)
2829     {
2830     case mATCH:
2831     strcat(cigar2, "M");
2832     break;
2833     case iNS:
2834     strcat(cigar2, "I");
2835     break;
2836     case dEL:
2837     strcat(cigar2, "D");
2838     break;
2839     case rEF_SKIP:
2840     strcat(cigar2, "N");
2841     break;
2842     default:
2843     assert (0);
2844     break;
2845     }
2846     }
2847     }
2848    
2849     if (fusion_dir != FUSION_NOTHING)
2850     {
2851     seq1 = seq.substr(0, left_part_len);
2852     qual1 = qual.substr(0, left_part_len);
2853    
2854     if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2855     {
2856     reverse_complement(seq1);
2857     reverse(qual1.begin(), qual1.end());
2858     }
2859    
2860     seq2 = seq.substr(left_part_len);
2861     qual2 = qual.substr(left_part_len);
2862    
2863     if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR)
2864     {
2865     reverse_complement(seq2);
2866     reverse(qual2.begin(), qual2.end());
2867     }
2868    
2869     left1 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_FR) ? left : fusion_left);
2870     left2 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_RF) ? fusion_right : right + 1);
2871     }
2872     }
2873    
2874    
2875 gpertea 159 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt, bool bDebug)
2876 gpertea 29 {
2877 gpertea 154 RefSequenceTable::Sequence* ref_str1 = rt.get_seq(_ref_id);
2878     RefSequenceTable::Sequence* ref_str2 = rt.get_seq(_ref_id2);
2879    
2880     if (!ref_str1 || !ref_str2)
2881 gpertea 29 return false;
2882 gpertea 159
2883     if (bDebug)
2884     {
2885     cout << "check_editdist_consistency" << endl
2886     << "insert id: " << _insert_id << endl;
2887     }
2888 gpertea 29
2889 gpertea 154 RefSequenceTable::Sequence* ref_str = ref_str1;
2890 gpertea 29
2891     size_t pos_seq = 0;
2892 gpertea 154 size_t pos_ref = _left;
2893 gpertea 29 size_t mismatch = 0;
2894 gpertea 154 bool bSawFusion = false;
2895 gpertea 29 for (size_t i = 0; i < _cigar.size(); ++i)
2896     {
2897     CigarOp cigar = _cigar[i];
2898     switch(cigar.opcode)
2899     {
2900     case MATCH:
2901     {
2902 gpertea 154 seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length);
2903 gpertea 29 for (size_t j = 0; j < cigar.length; ++j)
2904     {
2905 gpertea 159 seqan::Dna5 ref_nt = _seq[pos_seq];
2906     if (ref_nt != ref_seq[j])
2907 gpertea 29 ++mismatch;
2908 gpertea 159
2909     if (bDebug)
2910     cout << pos_seq << "\t" << ref_nt << " vs. " << ref_seq[j] << "\t" << mismatch << endl;
2911    
2912 gpertea 154 ++pos_seq;
2913 gpertea 29 }
2914 gpertea 154
2915     pos_ref += cigar.length;
2916 gpertea 29 }
2917     break;
2918 gpertea 154 case mATCH:
2919     {
2920     seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1);
2921     seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>());
2922     seqan::reverseInPlace(ref_seq);
2923    
2924     for (size_t j = 0; j < cigar.length; ++j)
2925     {
2926 gpertea 159 seqan::Dna5 ref_nt = _seq[pos_seq];
2927     if (ref_nt != ref_seq[j])
2928 gpertea 154 ++mismatch;
2929 gpertea 159
2930     if (bDebug)
2931     cout << pos_seq << "\t" << ref_nt << " vs. " << ref_seq[j] << "\t" << mismatch << endl;
2932    
2933 gpertea 154 ++pos_seq;
2934     }
2935    
2936     pos_ref -= cigar.length;
2937     }
2938     break;
2939 gpertea 29 case INS:
2940 gpertea 154 case iNS:
2941 gpertea 29 {
2942     pos_seq += cigar.length;
2943     }
2944     break;
2945    
2946     case DEL:
2947     case REF_SKIP:
2948     {
2949     pos_ref += cigar.length;
2950     }
2951     break;
2952    
2953 gpertea 154 case dEL:
2954     case rEF_SKIP:
2955     {
2956     pos_ref -= cigar.length;
2957     }
2958     break;
2959    
2960     case FUSION_FF:
2961     case FUSION_FR:
2962     case FUSION_RF:
2963     case FUSION_RR:
2964     {
2965     // We don't allow a read spans more than two chromosomes.
2966     if (bSawFusion)
2967     return false;
2968    
2969     ref_str = ref_str2;
2970     pos_ref = cigar.length;
2971    
2972     bSawFusion = true;
2973     }
2974     break;
2975    
2976 gpertea 29 default:
2977     break;
2978     }
2979     }
2980 gpertea 154
2981 gpertea 159 if (bDebug)
2982     cout << "mismatch (real) vs. (calculated):" << mismatch << " vs. " << (int)_edit_dist << endl;
2983    
2984 gpertea 29 return mismatch == _edit_dist;
2985     }
2986 gpertea 159
2987     void bowtie_sam_extra(const BowtieHit& bh, const RefSequenceTable& rt, vector<string>& fields)
2988     {
2989     RefSequenceTable::Sequence* ref_str1 = rt.get_seq(bh.ref_id());
2990     RefSequenceTable::Sequence* ref_str2 = rt.get_seq(bh.ref_id2());
2991    
2992     if (!ref_str1 || !ref_str2)
2993     return;
2994    
2995     RefSequenceTable::Sequence* ref_str = ref_str1;
2996    
2997     size_t pos_seq = 0;
2998     size_t pos_mismatch = 0;
2999     size_t pos_ref = bh.left();
3000     size_t mismatch = 0;
3001     size_t num_gap_opens = 0;
3002     size_t num_gap_conts = 0;
3003     bool bSawFusion = false;
3004    
3005     int AS_score = 0;
3006    
3007     const vector<CigarOp>& cigars = bh.cigar();
3008     const string& seq = bh.seq();
3009     const string& qual = bh.qual();
3010    
3011     string AS = "AS:i:";
3012     string MD = "MD:Z:";
3013    
3014     for (size_t i = 0; i < cigars.size(); ++i)
3015     {
3016     CigarOp cigar = cigars[i];
3017     switch(cigar.opcode)
3018     {
3019     case MATCH:
3020     case mATCH:
3021     {
3022     seqan::Dna5String ref_seq;
3023     if (cigar.opcode == MATCH)
3024     {
3025     ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length);
3026     pos_ref += cigar.length;
3027     }
3028     else
3029     {
3030     ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1);
3031     seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>());
3032     seqan::reverseInPlace(ref_seq);
3033     pos_ref -= cigar.length;
3034     }
3035    
3036     for (size_t j = 0; j < cigar.length; ++j)
3037     {
3038     seqan::Dna5 ref_nt = ref_seq[j];
3039     if (seq[pos_seq] != ref_nt)
3040     {
3041     ++mismatch;
3042    
3043     if (pos_seq < qual.length())
3044     {
3045     float penalty = (bowtie2_max_penalty - bowtie2_min_penalty) * min((int)(qual[pos_seq] - '!'), 40) / 40.0;
3046     AS_score -= (int)penalty;
3047     }
3048    
3049     str_appendInt(MD, (int)pos_mismatch);
3050     MD.push_back((char)ref_nt);
3051     pos_mismatch = 0;
3052     }
3053     else
3054     ++pos_mismatch;
3055    
3056     ++pos_seq;
3057     }
3058     }
3059     break;
3060    
3061     case INS:
3062     case iNS:
3063     {
3064     pos_seq += cigar.length;
3065    
3066     AS_score -= bowtie2_read_gap_open;
3067     AS_score -= (int)(bowtie2_read_gap_cont * cigar.length);
3068    
3069     num_gap_opens += 1;
3070     num_gap_conts += cigar.length;
3071     }
3072     break;
3073    
3074     case DEL:
3075     case dEL:
3076     {
3077     AS_score -= bowtie2_ref_gap_open;
3078     AS_score -= (int)(bowtie2_ref_gap_cont * cigar.length);
3079    
3080     num_gap_opens += 1;
3081     num_gap_conts += cigar.length;
3082    
3083     seqan::Dna5String ref_seq;
3084     if (cigar.opcode == DEL)
3085     {
3086     ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length);
3087     pos_ref += cigar.length;
3088     }
3089     else
3090     {
3091     ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1);
3092     seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>());
3093     seqan::reverseInPlace(ref_seq);
3094     pos_ref -= cigar.length;
3095     }
3096    
3097     str_appendInt(MD, (int)pos_mismatch);
3098     MD.push_back('^');
3099     for (size_t k = 0; k < length(ref_seq); ++k)
3100     MD.push_back((char)ref_seq[k]);
3101    
3102     pos_mismatch = 0;
3103     }
3104     break;
3105    
3106     case REF_SKIP:
3107     case rEF_SKIP:
3108     {
3109     if (cigar.opcode == REF_SKIP)
3110     pos_ref += cigar.length;
3111     else
3112     pos_ref -= cigar.length;
3113     }
3114     break;
3115    
3116     case FUSION_FF:
3117     case FUSION_FR:
3118     case FUSION_RF:
3119     case FUSION_RR:
3120     {
3121     // We don't allow a read spans more than two chromosomes.
3122     if (bSawFusion)
3123     return;
3124    
3125     ref_str = ref_str2;
3126     pos_ref = cigar.length;
3127    
3128     bSawFusion = true;
3129     }
3130     break;
3131    
3132     default:
3133     break;
3134     }
3135     }
3136    
3137     str_appendInt(AS, AS_score);
3138     fields.push_back(AS);
3139    
3140     string XM = "XM:i:";
3141     str_appendInt(XM, (int)mismatch);
3142     fields.push_back(XM);
3143    
3144     string XO = "XO:i:";
3145     str_appendInt(XO, (int)num_gap_opens);
3146     fields.push_back(XO);
3147    
3148     string XG = "XG:i:";
3149     str_appendInt(XG, (int)num_gap_conts);
3150     fields.push_back(XG);
3151    
3152     str_appendInt(MD, (int)pos_mismatch);
3153     fields.push_back(MD);
3154     }