ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 137
Committed: Wed Dec 14 22:58:22 2011 UTC (12 years, 8 months ago) by gpertea
File size: 56203 byte(s)
Log Message:
wip spliceCigar() in bwt_map.cpp needs testing

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21     #include "common.h"
22     #include "bwt_map.h"
23     #include "tokenize.h"
24     #include "reads.h"
25    
26     using namespace std;
27    
28     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29     {
30     uint32_t reference_id = bh.ref_id();
31    
32     pair<RefHits::iterator, bool> ret =
33     _hits_for_ref.insert(make_pair(reference_id, HitList()));
34     HitList& hl = ret.first->second;
35    
36     if (check_uniqueness)
37     {
38     // Check uniqueness, in case we are adding spliced hits from
39     // several spliced alignment sources (e.g. de novo hashing + Bowtie
40     // against a user-supplied index). We don't want to count the same
41     // alignment twice if it happened to be found by more than one method
42     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44     for (; lb != ub && lb != hl.end(); ++lb)
45     {
46     if (*lb == bh)
47     {
48     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49     return;
50     }
51    
52     if (lb->insert_id() == bh.insert_id() &&
53     lb->ref_id() == bh.ref_id() &&
54     lb->antisense_align() == bh.antisense_align())
55     {
56     // If we get here, we may be looking at the same alignment
57     // However, spanning_reads may report a shorter, trimmed alignment
58     // so not all fields will be equal. If they just disagree on the
59     // ends, and don't indicate a different junction coord, the
60     // alignments are the same.
61    
62     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63     (bh.left() <= lb->left() && bh.right() >= lb->right()))
64     {
65     vector<pair<int,int> > lb_gaps, bh_gaps;
66     lb->gaps(lb_gaps);
67     bh.gaps(bh_gaps);
68     if (lb_gaps == bh_gaps)
69     {
70     // One alignment is contained in the other, they agree on
71     // where the gaps, if any, are, and they share an id
72     // => this is a redundant aligment, so toss it
73     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74     return;
75     }
76     }
77     }
78     }
79     }
80     _total_hits++;
81     hl.push_back(bh);
82     }
83    
84     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85     {
86     return h1.insert_id() < h2.insert_id();
87     }
88    
89     void LineHitFactory::openStream(HitStream& hs)
90     {
91     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92     //open the file for HitStream here
93     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94     if (hs._hit_file==NULL)
95     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96     return;
97     }
98     if (hs._fzpipe!=NULL) {
99     hs._hit_file=hs._fzpipe->file;
100     }
101     }
102    
103     void LineHitFactory::rewind(HitStream& hs)
104     {
105     if (hs._fzpipe!=NULL) {
106     hs._fzpipe->rewind();
107     hs._hit_file=hs._fzpipe->file;
108     }
109     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110     }
111    
112     bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113     FILE* f=(FILE *)(hs._hit_file);
114     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115     if (!new_rec || feof(f)) {
116     hs._eof=true;
117     return false;
118     }
119     ++_line_num;
120     char* nl = strrchr(_hit_buf, '\n');
121     if (nl) *nl = 0;
122     buf = _hit_buf;
123     buf_size = _hit_buf_max_sz - 1;
124     return true;
125     }
126    
127     void LineHitFactory::closeStream(HitStream& hs) {
128     if (hs._fzpipe!=NULL) {
129     hs._fzpipe->close();
130     return;
131     }
132     if (hs._hit_file!=NULL) {
133     fclose((FILE*)(hs._hit_file));
134     hs._hit_file=NULL;
135     }
136     }
137     void BAMHitFactory::openStream(HitStream& hs) {
138     if (hs._hit_file==NULL) {
139     if (hs._hit_file_name.empty())
140     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141     return; //invalid stream, could be just a place holder
142     //open the file here if not already open
143     string fext=getFext(hs._hit_file_name);
144     if (fext=="sam")
145     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146     else
147     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148    
149     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150    
151     if (sam_file == NULL)
152     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153     if (sam_file->header == NULL)
154     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155     memset(&_next_hit, 0, sizeof(_next_hit));
156     //_beginning = bgzf_tell(sam_file->x.bam);
157     _sam_header=sam_file->header;
158     if (inspect_header(hs) == false)
159     err_die("Error: invalid SAM header for file %s\n",
160     hs._hit_file_name.c_str());
161     }
162     }
163    
164     void BAMHitFactory::closeStream(HitStream& hs) {
165     if (hs._hit_file) {
166     samclose((samfile_t*)(hs._hit_file));
167     }
168     hs._hit_file=NULL;
169     _sam_header=NULL;
170     }
171    
172     void BAMHitFactory::rewind(HitStream& hs)
173     {
174     /*
175     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176     {
177     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178     _eof = false;
179     }
180     */
181     this->closeStream(hs);
182     this->openStream(hs);
183     }
184    
185     string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186     const bam1_t* bamrec=(const bam1_t*)hit_buf;
187     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188     string sam_line(tamline);
189     free(tamline);
190     return sam_line;
191     }
192    
193     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194     if (_next_hit.data) {
195     free(_next_hit.data);
196     _next_hit.data = NULL;
197     }
198     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199     if (hs.eof() || !hs.ready()) return false;
200    
201     //mark_curr_pos();
202    
203     memset(&_next_hit, 0, sizeof(_next_hit));
204    
205     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206     if (bytes_read <= 0) {
207     hs._eof = true;
208     return false;
209     }
210     buf = (const char*)&_next_hit;
211     buf_size = bytes_read;
212     return true;
213     }
214    
215    
216     BowtieHit HitFactory::create_hit(const string& insert_name,
217     const string& ref_name,
218     int left,
219     const vector<CigarOp>& cigar,
220     bool antisense_aln,
221     bool antisense_splice,
222     unsigned char edit_dist,
223     unsigned char splice_mms,
224     bool end)
225     {
226     uint64_t insert_id = _insert_table.get_id(insert_name);
227     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228    
229     return BowtieHit(reference_id,
230     insert_id,
231     left,
232     cigar,
233     antisense_aln,
234     antisense_splice,
235     edit_dist,
236     splice_mms,
237     end);
238     }
239    
240     BowtieHit HitFactory::create_hit(const string& insert_name,
241     const string& ref_name,
242     uint32_t left,
243     uint32_t read_len,
244     bool antisense_aln,
245     unsigned char edit_dist,
246     bool end)
247     {
248     uint64_t insert_id = _insert_table.get_id(insert_name);
249     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250    
251     return BowtieHit(reference_id,
252     insert_id,
253     left,
254     read_len,
255     antisense_aln,
256     edit_dist,
257     end);
258     }
259    
260     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261     BowtieHit& bh,
262     bool strip_slash,
263     char* name_out,
264     char* name_tags,
265     char* seq,
266     char* qual)
267     {
268     if (!orig_bwt_buf || !*orig_bwt_buf)
269     return false;
270    
271     static const int buf_size = 2048;
272    
273     char bwt_buf[buf_size];
274     strcpy(bwt_buf, orig_bwt_buf);
275     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276    
277     char orientation;
278    
279     //int bwtf_ret = 0;
280     //uint32_t seqid = 0;
281    
282     char text_name[buf_size];
283     unsigned int text_offset;
284    
285    
286     //unsigned int other_occs;
287     char mismatches[buf_size];
288     //memset(mismatches, 0, sizeof(mismatches));
289    
290     const char* buf = bwt_buf;
291     char* name = get_token((char**)&buf,"\t");
292     char* orientation_str = get_token((char**)&buf,"\t");
293     char* text_name_str = get_token((char**)&buf,"\t");
294    
295     strcpy(text_name, text_name_str);
296    
297     char* text_offset_str = get_token((char**)&buf,"\t");
298     char* seq_str = get_token((char**)&buf,"\t");
299     if (seq)
300     strcpy(seq, seq_str);
301    
302     const char* qual_str = get_token((char**)&buf,"\t");
303     if (qual)
304     strcpy(qual, qual_str);
305    
306     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307     mismatches[0] = 0;
308     char* mismatches_str = get_token((char**)&buf, "\t");
309     if (mismatches_str)
310     strcpy(mismatches, mismatches_str);
311    
312     orientation = orientation_str[0];
313     text_offset = atoi(text_offset_str);
314    
315     bool end = true;
316     unsigned int seg_offset = 0;
317     unsigned int seg_num = 0;
318     unsigned int num_segs = 0;
319    
320     // Copy the tag out of the name field before we might wipe it out
321     char* pipe = strrchr(name, '|');
322     if (pipe)
323     {
324     if (name_tags)
325     strcpy(name_tags, pipe);
326    
327     char* tag_buf = pipe + 1;
328     if (strchr(tag_buf, ':'))
329     {
330     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331     if (seg_num + 1 == num_segs)
332     end = true;
333     else
334     end = false;
335     }
336    
337     *pipe = 0;
338     }
339     // Stripping the slash and number following it gives the insert name
340     char* slash = strrchr(name, '/');
341     if (strip_slash && slash)
342     *slash = 0;
343    
344     size_t read_len = strlen(seq_str);
345    
346     // Add this alignment to the table of hits for this half of the
347     // Bowtie map
348    
349     //vector<string> mismatch_toks;
350     char* pch = strtok (mismatches,",");
351     unsigned char num_mismatches = 0;
352     while (pch != NULL)
353     {
354     char* colon = strchr(pch, ':');
355     if (colon)
356     {
357     num_mismatches++;
358     }
359     //mismatch_toks.push_back(pch);
360     pch = strtok (NULL, ",");
361     }
362    
363     bh = create_hit(name,
364     text_name,
365     text_offset,
366     (int)read_len,
367     orientation == '-',
368     num_mismatches,
369     end);
370    
371     return true;
372     }
373    
374     int anchor_mismatch = 0;
375    
376 gpertea 70
377     void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378     bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379     unsigned int & num_segs) {
380     char* pipe = strrchr(name, '|');
381     if (pipe)
382     {
383     if (name_tags)
384     strcpy(name_tags, pipe);
385    
386     char* tag_buf = pipe + 1;
387     if (strchr(tag_buf, ':'))
388     {
389     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390     if (seg_num + 1 == num_segs)
391     end = true;
392     else
393     end = false;
394     }
395    
396     *pipe = 0;
397     }
398    
399     // Stripping the slash and number following it gives the insert name
400     char* slash = strrchr(name, '/');
401     if (strip_slash && slash)
402     *slash = 0;
403     }
404    
405    
406 gpertea 29 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407     BowtieHit& bh,
408     bool strip_slash,
409     char* name_out,
410     char* name_tags,
411     char* seq,
412     char* qual)
413     {
414     if (!orig_bwt_buf || !*orig_bwt_buf)
415     return false;
416    
417     static const int buf_size = 2048;
418    
419     char bwt_buf[buf_size];
420     strcpy(bwt_buf, orig_bwt_buf);
421     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422    
423     char orientation;
424     char text_name[buf_size];
425     unsigned int text_offset;
426     char mismatches[buf_size];
427     //memset(mismatches, 0, sizeof(mismatches));
428    
429     char* buf = bwt_buf;
430     char* name = get_token((char**)&buf,"\t");
431     char* orientation_str = get_token((char**)&buf,"\t");
432     char* text_name_str = get_token((char**)&buf,"\t");
433     strcpy(text_name, text_name_str);
434    
435     char* text_offset_str = get_token((char**)&buf,"\t");
436     char* seq_str = get_token((char**)&buf,"\t");
437     if (seq)
438     strcpy(seq, seq_str);
439    
440     const char* qual_str = get_token((char**)&buf,"\t");
441     if (qual)
442     strcpy(qual, qual_str);
443    
444     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445     mismatches[0] = 0;
446     char* mismatches_str = get_token((char**)&buf, "\t");
447     if (mismatches_str)
448     strcpy(mismatches, mismatches_str);
449    
450     orientation = orientation_str[0];
451     text_offset = atoi(text_offset_str);
452    
453     bool end = true;
454     unsigned int seg_offset = 0;
455     unsigned int seg_num = 0;
456     unsigned int num_segs = 0;
457    
458     // Copy the tag out of the name field before we might wipe it out
459 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460 gpertea 29
461     // Add this alignment to the table of hits for this half of the
462     // Bowtie map
463    
464     // Parse the text_name field to recover the splice coords
465     vector<string> toks;
466    
467     tokenize_strict(text_name, "|", toks);
468    
469     int num_extra_toks = (int)toks.size() - 6;
470    
471     if (num_extra_toks >= 0)
472     {
473     static const uint8_t left_window_edge_field = 1;
474     static const uint8_t splice_field = 2;
475     //static const uint8_t right_window_edge_field = 3;
476     static const uint8_t junction_type_field = 4;
477     static const uint8_t strand_field = 5;
478    
479     string contig = toks[0];
480     for (int t = 1; t <= num_extra_toks; ++t)
481     {
482     contig += "|";
483     contig += toks[t];
484     }
485    
486     vector<string> splice_toks;
487     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488     if (splice_toks.size() != 2)
489     {
490     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491     //fprintf(stderr, "%s (token: %s)\n", text_name,
492     // toks[num_extra_toks + splice_field].c_str());
493     return false;
494     }
495    
496     //
497     // check for an insertion hit
498     //
499     if(toks[num_extra_toks + junction_type_field] == "ins")
500     {
501     int8_t spliced_read_len = strlen(seq_str);
502     /*
503     * The 0-based position of the left edge of the alignment. Note that this
504     * value may need to be futher corrected to account for the presence of
505     * of the insertion.
506     */
507     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508     uint32_t right = left + spliced_read_len - 1;
509    
510 gpertea 137
511 gpertea 29 /*
512     * The 0-based position of the last genomic sequence before the insertion
513     */
514     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
515    
516     string insertedSequence = splice_toks[1];
517     /*
518     * The 0-based position of the first genomic sequence after teh insertion
519     */
520     uint32_t right_splice_pos = left_splice_pos + 1;
521     if(left > left_splice_pos){
522     /*
523     * The genomic position of the left edge of the alignment needs to be corrected
524     * If the alignment does not extend into the insertion, simply subtract the length
525     * of the inserted sequence, otherwise, just set it equal to the right edge
526     */
527     left = left - insertedSequence.length();
528     if(left < right_splice_pos){
529     left = right_splice_pos;
530     }
531     }
532     if(right > left_splice_pos){
533     right = right - insertedSequence.length();
534     if(right < left_splice_pos){
535     right = left_splice_pos;
536     }
537     }
538     /*
539     * Now, right and left should be properly transformed into genomic coordinates
540     * We should be able to deduce how much the alignment matches the insertion
541     * simply based on the length of the read
542     */
543     int left_match_length = 0;
544     if(left <= left_splice_pos){
545     left_match_length = left_splice_pos - left + 1;
546     }
547     int right_match_length = 0;
548     if(right >= right_splice_pos){
549     right_match_length = right - right_splice_pos + 1;
550     }
551     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
552    
553     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
554     return false;
555    
556     string junction_strand = toks[num_extra_toks + strand_field];
557     if(junction_strand != "rev" && junction_strand != "fwd"){
558     fprintf(stderr, "Malformed insertion record\n");
559     return false;
560     }
561    
562     char* pch = strtok( mismatches, ",");
563     unsigned char num_mismatches = 0;
564    
565     /*
566     * remember that text_offset holds the left end of the
567     * alignment, relative to the start of the contig
568     */
569    
570     /*
571     * The 0-based relative position of the left-most character
572     * before the insertion in the contig
573     */
574     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
575     while (pch != NULL)
576     {
577     char* colon = strchr(pch, ':');
578     if (colon)
579     {
580     *colon = 0;
581     int mismatch_pos = atoi(pch);
582    
583     /*
584     * for reversely mapped reads,
585     * find the correct mismatched position.
586     */
587     if(orientation == '-'){
588     mismatch_pos = spliced_read_len - mismatch_pos - 1;
589     }
590    
591     /*
592     * Only count mismatches outside of the insertion region
593     * If there is a mismatch within the insertion,
594     * disallow this hit
595     */
596     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
597     num_mismatches++;
598     }else{
599     return false;
600     }
601     }
602     pch = strtok (NULL, ",");
603     }
604    
605    
606     vector<CigarOp> cigar;
607     cigar.push_back(CigarOp(MATCH, left_match_length));
608     cigar.push_back(CigarOp(INS, insertion_match_length));
609     cigar.push_back(CigarOp(MATCH, right_match_length));
610    
611     /*
612     * For now, disallow hits that don't span
613     * the insertion. If we allow these types of hits,
614     * then long_spanning.cpp needs to be updated
615     * in order to intelligently merge these kinds
616     * of reads back together
617     *
618     * Following code has been changed to allow segment that end
619     * in an insertion
620     */
621     bh = create_hit(name,
622     contig,
623     left,
624     cigar,
625     orientation == '-',
626     junction_strand == "rev",
627     num_mismatches,
628     0,
629     end);
630     return true;
631     }
632    
633     else
634     {
635     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
636     int spliced_read_len = strlen(seq_str);
637     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
638     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
639     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
640    
641     if (right_splice_pos <= 0 || left_splice_pos <= 0)
642     return false;
643    
644     if (orientation == '+')
645     {
646     if (left_splice_pos + seg_offset < _anchor_length){
647     return false;
648     }
649     }
650     else
651     {
652     if (right_splice_pos + seg_offset < _anchor_length)
653     return false;
654     }
655     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
656     //atoi(toks[right_window_edge_field].c_str());
657     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
658    
659     string junction_strand = toks[num_extra_toks + strand_field];
660     if (!(junction_strand == "rev" || junction_strand == "fwd")||
661     !(orientation == '-' || orientation == '+'))
662     {
663     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
664     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
665     // junction_strand.c_str(), orientation);
666     return false;
667     }
668    
669     //vector<string> mismatch_toks;
670     char* pch = strtok (mismatches,",");
671     int mismatches_in_anchor = 0;
672     unsigned char num_mismatches = 0;
673     while (pch != NULL)
674     {
675     char* colon = strchr(pch, ':');
676     if (colon)
677     {
678     *colon = 0;
679     num_mismatches++;
680     int mismatch_pos = atoi(pch);
681     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
682     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
683     mismatches_in_anchor++;
684     }
685     //mismatch_toks.push_back(pch);
686     pch = strtok (NULL, ",");
687     }
688    
689     // FIXME: we probably should exclude these hits somewhere, but this
690     // isn't the right place
691     vector<CigarOp> cigar;
692     cigar.push_back(CigarOp(MATCH, left_splice_pos));
693     if(toks[num_extra_toks + junction_type_field] == "del"){
694     cigar.push_back(CigarOp(DEL, gap_len));
695     }else{
696     cigar.push_back(CigarOp(REF_SKIP, gap_len));
697     }
698     cigar.push_back(CigarOp(MATCH, right_splice_pos));
699    
700     bh = create_hit(name,
701     contig,
702     left,
703     cigar,
704     orientation == '-',
705     junction_strand == "rev",
706     num_mismatches,
707     mismatches_in_anchor,
708     end);
709     return true;
710     }
711     }
712     else
713     {
714     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
715     //fprintf(stderr, "%s\n", orig_bwt_buf);
716     // continue;
717     return false;
718     }
719    
720     return false;
721     }
722    
723 gpertea 70
724 gpertea 137 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
725 gpertea 70 bool &spliced_alignment) {
726     const char* p_cig = cigar_str;
727 gpertea 137 int refspan=0; //alignment span on reference sequence
728 gpertea 70 while (*p_cig)
729     {
730     char* t;
731 gpertea 137 int op_len = (int)strtol(p_cig, &t, 10);
732     if (op_len <= 0)
733 gpertea 70 {
734 gpertea 137 fprintf (stderr, "Error: CIGAR op has zero length\n");
735     return 0;
736 gpertea 70 }
737     char op_char = toupper(*t);
738     CigarOpCode opcode;
739 gpertea 137 switch (op_char) {
740     case '=':
741     case 'X':
742     case 'M': opcode = MATCH;
743     refspan+=op_len;
744     break;
745     case 'I': opcode = INS;
746     break;
747     case 'D': opcode = DEL;
748     refspan+=op_len;
749     break;
750     case 'N': if (op_len > max_report_intron_length)
751     return 0;
752     opcode = REF_SKIP;
753     spliced_alignment = true;
754     refspan+=op_len;
755     break;
756     case 'S': opcode = SOFT_CLIP;
757     break;
758     case 'H': opcode = HARD_CLIP;
759     break;
760     case 'P': opcode = PAD;
761     break;
762     default: fprintf (stderr, "Error: invalid CIGAR operation\n");
763     return 0;
764     }
765 gpertea 70 p_cig = t + 1;
766 gpertea 137 cigar.push_back(CigarOp(opcode, op_len));
767     } //while cigar codes
768 gpertea 70 if (*p_cig) {
769 gpertea 137 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
770     return 0;
771 gpertea 70 }
772 gpertea 137 return refspan;
773 gpertea 70 }
774    
775 gpertea 137 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
776     vector<int>& mismatches, int& sam_nm, bool& antisense_splice) {
777     int gspan=0;//genomic span of the alignment
778     const char* tag_buf = buf;
779 gpertea 70 sam_nm=0;
780 gpertea 137 //num_mismatches=0;
781 gpertea 70 while((tag_buf = get_token((char**)&buf,"\t")))
782     {
783     vector<string> tuple_fields;
784     tokenize(tag_buf,":", tuple_fields);
785     if (tuple_fields.size() == 3)
786     {
787     if (tuple_fields[0] == "XS")
788     {
789     if (tuple_fields[2] == "-")
790     antisense_splice = true;
791     }
792     else if (tuple_fields[0] == "NM")
793     {
794     sam_nm = atoi(tuple_fields[2].c_str());
795     }
796     else if (tuple_fields[0] == "NS")
797     {
798     //ignored for now
799     }
800     else if (tuple_fields[0] == "MD")
801     {
802     const char* p=tuple_fields[2].c_str();
803     int bi=0; //base offset position in the read
804     while (*p != 0) {
805     if (isdigit(*p)) {
806     int v=atoi(p);
807     do { p++; } while (isdigit(*p));
808     bi+=v;
809     }
810 gpertea 137 while (isalpha(*p)) {
811 gpertea 70 p++;
812 gpertea 137 mismatches.push_back(bi);
813 gpertea 70 bi++;
814     }
815     if (*p=='^') { //reference deletion
816     p++;
817 gpertea 137 while (isalpha(*p)) { //insert read bases
818 gpertea 70 p++; bi++;
819 gpertea 137 }
820 gpertea 70 }
821     }
822     }
823     else
824     {
825     //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
826     //return false;
827     }
828     }
829     }
830     /* By convention,the NM field of the SAM record
831     * counts an insertion or deletion. I dont' think
832     * we want the mismatch count in the BowtieHit
833     * record to reflect this. Therefore, subtract out
834     * the mismatches due to in/dels
835     */
836     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
837 gpertea 137 switch (itr->opcode)
838     {
839     case MATCH:
840     case REF_SKIP:
841     case PAD:
842     gspan += itr->length;
843     break;
844     case DEL:
845     gspan += itr->length;
846     sam_nm -= itr->length;
847     break;
848     case INS:
849     sam_nm -= itr->length;
850     break;
851 gpertea 70 }
852 gpertea 137 }
853     return gspan;
854 gpertea 70 }
855    
856 gpertea 29 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
857     BowtieHit& bh,
858     bool strip_slash,
859     char* name_out,
860     char* name_tags,
861     char* seq,
862     char* qual)
863     {
864     if (!orig_bwt_buf || !*orig_bwt_buf)
865     return false;
866     char bwt_buf[2048];
867     strcpy(bwt_buf, orig_bwt_buf);
868     // Are we still in the header region?
869     if (bwt_buf[0] == '@')
870     return false;
871    
872     char* buf = bwt_buf;
873     char* name = get_token((char**)&buf,"\t");
874     char* sam_flag_str = get_token((char**)&buf,"\t");
875     char* text_name = get_token((char**)&buf,"\t");
876     char* text_offset_str = get_token((char**)&buf,"\t");
877     const char* map_qual_str = get_token((char**)&buf,"\t");
878     char* cigar_str = get_token((char**)&buf,"\t");
879     const char* mate_ref_str = get_token((char**)&buf,"\t");
880     const char* mate_pos_str = get_token((char**)&buf,"\t");
881     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
882    
883     const char* seq_str = get_token((char**)&buf,"\t");
884     if (seq)
885     strcpy(seq, seq_str);
886    
887     const char* qual_str = get_token((char**)&buf,"\t");
888     if (qual)
889     strcpy(qual, qual_str);
890    
891     if (!name ||
892     !sam_flag_str ||
893     !text_name ||
894     !text_offset_str ||
895     !map_qual_str ||
896     !cigar_str ||
897     !mate_ref_str ||
898     !mate_pos_str ||
899     !inferred_insert_sz_str ||
900     !seq_str ||
901     !qual_str)
902     {
903     // truncated or malformed SAM record
904     return false;
905     }
906    
907    
908     int sam_flag = atoi(sam_flag_str);
909     int text_offset = atoi(text_offset_str);
910    
911     bool end = true;
912     unsigned int seg_offset = 0;
913     unsigned int seg_num = 0;
914     unsigned int num_segs = 0;
915    
916     // Copy the tag out of the name field before we might wipe it out
917 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
918 gpertea 29
919     vector<CigarOp> cigar;
920     bool spliced_alignment = false;
921 gpertea 137 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
922     if (refspan==0)
923 gpertea 70 return false;
924 gpertea 29 //vector<string> attributes;
925     //tokenize(tag_buf, " \t",attributes);
926 gpertea 70
927 gpertea 29 bool antisense_splice = false;
928 gpertea 70 int sam_nm = 0; //the value of the NM tag (edit distance)
929 gpertea 137 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
930     vector<int> mismatches;
931     getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
932 gpertea 29
933     if (spliced_alignment)
934     {
935     bh = create_hit(name,
936     text_name,
937     text_offset - 1,
938     cigar,
939     sam_flag & 0x0010,
940     antisense_splice,
941 gpertea 137 mismatches.size(),
942 gpertea 70 0,
943 gpertea 29 end);
944     }
945     else
946     {
947     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
948     bh = create_hit(name,
949     text_name,
950     text_offset - 1, // SAM files are 1-indexed
951     cigar,
952     sam_flag & 0x0010,
953     false,
954 gpertea 137 mismatches.size(),
955 gpertea 29 0,
956     end);
957     }
958 gpertea 70 return true;
959 gpertea 29 }
960    
961 gpertea 137 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
962     if (op.length<=0) return;
963     if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
964     cigar.back().length+=op.length;
965     }
966     cigar.push_back(op);
967     }
968 gpertea 70
969 gpertea 137 int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<int> mismatches,
970     int &left, int spl_start, int spl_len, CigarOpCode spl_code) {
971 gpertea 70 //merge the original 'cigar' with the new insert/gap operation
972 gpertea 137 //at position spl_start and place the result into splcigar;
973     int num_mismatches=mismatches.size();
974     //these offsets are relative to the beginning of alignment on reference
975     int spl_ofs=spl_start-left; //relative position of splice op
976     int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
977     CigarOp gapop(spl_code, spl_len); //for DEL or REF_SKIP
978     if (spl_code==INS)
979     spl_ofs_end += spl_len;
980     int ref_ofs=0; //working offset on reference
981     int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
982     bool xfound=false;
983     //if (left<=spl_start+spl_len) {
984     if (spl_ofs_end>0) {
985     int prev_opcode=0;
986     int prev_oplen=0;
987     for (size_t c = 0 ; c < cigar.size(); ++c)
988     {
989     int prev_read_ofs=read_ofs;
990     int cur_op_ofs=ref_ofs;
991     int cur_opcode=cigar[c].opcode;
992     int cur_oplen=cigar[c].length;
993     switch (cur_opcode) {
994     case MATCH:
995     ref_ofs+=cur_oplen;
996     read_ofs+=cur_oplen;
997     break;
998     case DEL:
999     case REF_SKIP:
1000     case PAD:
1001     ref_ofs+=cur_oplen;
1002     break;
1003     case SOFT_CLIP:
1004     case INS:
1005     read_ofs+=cur_oplen;
1006     break;
1007     //case HARD_CLIP:
1008     }
1009     if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1010     if (spl_code!=INS && cur_op_ofs==spl_ofs_end)
1011     {
1012     //we have to insert the gap here first
1013     cigar_add(splcigar, gapop);
1014     }
1015     cigar_add(splcigar, cigar[c]);
1016     }
1017     else //if (ref_ofs>spl_ofs) {
1018     { //op intersection
1019     xfound=true;
1020     if (spl_code==INS) {
1021     //we have to shorten cur_opcode
1022     // find the overlap between current range
1023     int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1024     int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1025     CigarOp op(cigar[c]);
1026     int len=op.length;
1027     len-=(ovl_end-ovl_start);
1028     if (len>0) {
1029     op.length=len;
1030     cigar_add(splcigar, op);
1031     }
1032     }
1033     else {//DEL or REF_SKIP
1034     //spl_ofs == spl_ofs_end
1035     //we have to split cur_opcode
1036     CigarOp op(cigar[c]);
1037     op.length=spl_ofs-cur_op_ofs;
1038     cigar_add(splcigar, op);
1039     cigar_add(splcigar, gapop);
1040     op.length=ref_ofs-spl_ofs;
1041     cigar_add(splcigar,op);
1042     }
1043     } //op intersection
1044     prev_opcode=cur_opcode;
1045     prev_oplen=cur_oplen;
1046     } //for each cigar opcode
1047     } //intersection possible
1048     if (!xfound) {//no intersection found between splice event and alignment
1049     if (spl_ofs_end<=0) {
1050     //alignment starts after the splice event
1051     if (spl_code==INS) left-=spl_len;
1052     else left+=spl_len;
1053     }
1054     //else {
1055     //alignment ends before the splice event
1056     //nothing to do
1057     // }
1058     }
1059 gpertea 97
1060 gpertea 137 //TODO: adjust mismatches if opcode==INS and they fall in the removed part
1061     //TODO: what about anchor mismatches for REF_SKIP
1062     return num_mismatches;
1063 gpertea 70 }
1064    
1065 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1066     BowtieHit& bh,
1067     bool strip_slash,
1068     char* name_out,
1069     char* name_tags,
1070     char* seq,
1071     char* qual)
1072     {
1073     if (!orig_bwt_buf || !*orig_bwt_buf)
1074     return false;
1075 gpertea 29
1076 gpertea 69 char bwt_buf[2048];
1077     strcpy(bwt_buf, orig_bwt_buf);
1078     // Are we still in the header region?
1079     if (bwt_buf[0] == '@')
1080     return false;
1081 gpertea 29
1082 gpertea 69 char* buf = bwt_buf;
1083     char* name = get_token((char**)&buf,"\t");
1084     char* sam_flag_str = get_token((char**)&buf,"\t");
1085     char* text_name = get_token((char**)&buf,"\t");
1086     char* text_offset_str = get_token((char**)&buf,"\t");
1087     const char* map_qual_str = get_token((char**)&buf,"\t");
1088     char* cigar_str = get_token((char**)&buf,"\t");
1089     const char* mate_ref_str = get_token((char**)&buf,"\t");
1090     const char* mate_pos_str = get_token((char**)&buf,"\t");
1091     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1092 gpertea 137 //int num_mismatches=0;
1093     //int mismatches[1024]; //list of 0-based mismatch positions in this read
1094 gpertea 70 //parsed from SAM's MD:Z: tag
1095 gpertea 69 const char* seq_str = get_token((char**)&buf,"\t");
1096     if (seq)
1097     strcpy(seq, seq_str);
1098    
1099     const char* qual_str = get_token((char**)&buf,"\t");
1100     if (qual)
1101     strcpy(qual, qual_str);
1102    
1103     if (!name ||
1104     !sam_flag_str ||
1105     !text_name ||
1106     !text_offset_str ||
1107     !map_qual_str ||
1108     !cigar_str ||
1109     !mate_ref_str ||
1110     !mate_pos_str ||
1111     !inferred_insert_sz_str ||
1112     !seq_str ||
1113     !qual_str)
1114     {
1115     // truncated or malformed SAM record
1116     return false;
1117     }
1118    
1119    
1120     int sam_flag = atoi(sam_flag_str);
1121     int text_offset = atoi(text_offset_str);
1122 gpertea 137 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1123 gpertea 69 bool end = true;
1124     unsigned int seg_offset = 0;
1125     unsigned int seg_num = 0;
1126     unsigned int num_segs = 0;
1127    
1128     // Copy the tag out of the name field before we might wipe it out
1129 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1130 gpertea 69
1131 gpertea 70 vector<CigarOp> samcigar;
1132     bool spliced_alignment = false;
1133 gpertea 137 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1134     if (refspan==0)
1135     return false;
1136 gpertea 70 bool antisense_splice = false;
1137     int sam_nm = 0;
1138 gpertea 137 vector<int> mismatches;
1139     getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1140 gpertea 69
1141 gpertea 70 //##############################################
1142 gpertea 69
1143     // Add this alignment to the table of hits for this half of the
1144     // Bowtie map
1145    
1146     // Parse the text_name field to recover the splice coords
1147     vector<string> toks;
1148     tokenize_strict(text_name, "|", toks);
1149    
1150     int num_extra_toks = (int)toks.size() - 6;
1151    
1152     if (num_extra_toks >= 0)
1153     {
1154     static const uint8_t left_window_edge_field = 1;
1155     static const uint8_t splice_field = 2;
1156     //static const uint8_t right_window_edge_field = 3;
1157     static const uint8_t junction_type_field = 4;
1158     static const uint8_t strand_field = 5;
1159    
1160     string contig = toks[0];
1161     for (int t = 1; t <= num_extra_toks; ++t)
1162     {
1163     contig += "|";
1164     contig += toks[t];
1165     }
1166    
1167     vector<string> splice_toks;
1168     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1169     if (splice_toks.size() != 2)
1170     {
1171     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1172 gpertea 70 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1173 gpertea 69 // toks[num_extra_toks + splice_field].c_str());
1174     return false;
1175     }
1176 gpertea 137
1177     string junction_strand = toks[num_extra_toks + strand_field];
1178     if(junction_strand != "rev" && junction_strand != "fwd"){
1179     fprintf(stderr, "Malformed insertion record\n");
1180     return false;
1181     }
1182    
1183     int spl_num_mismatches=0;
1184    
1185 gpertea 69 //
1186     // check for an insertion hit
1187     //
1188     if(toks[num_extra_toks + junction_type_field] == "ins")
1189     {
1190 gpertea 137 //int8_t spliced_read_len = strlen(seq_str);
1191     //TODO FIXME: use the CIGAR instead of seq length!
1192 gpertea 70 // The 0-based position of the left edge of the alignment. Note that this
1193 gpertea 137 // value may need to be further corrected to account for the presence of
1194 gpertea 70 // of the insertion.
1195 gpertea 137 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1196 gpertea 69
1197 gpertea 70 // The 0-based position of the last genomic sequence before the insertion
1198 gpertea 137 int left_splice_pos = atoi(splice_toks[0].c_str());
1199 gpertea 69
1200     string insertedSequence = splice_toks[1];
1201 gpertea 70 // The 0-based position of the first genomic sequence after the insertion
1202    
1203 gpertea 137 vector<CigarOp> splcigar;
1204    
1205     //TODO: implement this
1206     //this also updates left to the adjusted genomic coordinates
1207     spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
1208     left, left_splice_pos, insertedSequence.length(), INS);
1209     /*
1210 gpertea 69 uint32_t right_splice_pos = left_splice_pos + 1;
1211 gpertea 137
1212     //uint32_t right = left + spliced_read_len - 1;
1213     int right = left + refspan - 1;
1214    
1215 gpertea 69 if(left > left_splice_pos){
1216 gpertea 137 //The genomic position of the left edge of the alignment needs to be corrected
1217     //If the alignment does not extend into the insertion, simply subtract the length
1218     //of the inserted sequence, otherwise, just set it equal to the right edge
1219 gpertea 69 left = left - insertedSequence.length();
1220     if(left < right_splice_pos){
1221     left = right_splice_pos;
1222     }
1223     }
1224     if(right > left_splice_pos){
1225     right = right - insertedSequence.length();
1226     if(right < left_splice_pos){
1227     right = left_splice_pos;
1228     }
1229     }
1230 gpertea 137 // Now, right and left should be properly transformed into genomic coordinates
1231     // We should be able to deduce how much the alignment matches the insertion
1232     // simply based on the length of the read
1233 gpertea 69 int left_match_length = 0;
1234     if(left <= left_splice_pos){
1235     left_match_length = left_splice_pos - left + 1;
1236     }
1237     int right_match_length = 0;
1238     if(right >= right_splice_pos){
1239     right_match_length = right - right_splice_pos + 1;
1240     }
1241     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1242    
1243     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1244     return false;
1245    
1246 gpertea 70 //char* pch = strtok( mismatches, ",");
1247     //unsigned char num_mismatches = 0;
1248     //text_offset holds the left end of the alignment,
1249     //RELATIVE TO the start of the contig
1250 gpertea 69
1251 gpertea 70 //The 0-based relative position of the left-most character
1252     //before the insertion in the contig
1253 gpertea 69 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1254 gpertea 137 for (size_t i=0;i<mismatches.size();++i) {
1255 gpertea 70 int mismatch_pos = mismatches[i];
1256     // for reversely mapped reads,
1257     //find the correct mismatched position.
1258     if (sam_flag & 0x0010){
1259     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1260     }
1261 gpertea 69
1262 gpertea 70 //Only count mismatches outside of the insertion region
1263     // If there is a mismatch within the insertion,
1264     // disallow this hit
1265 gpertea 137 if(mismatch_pos + text_offset <= relative_splice_pos ||
1266     mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1267 gpertea 70 spl_num_mismatches++;
1268     }else{
1269     return false;
1270 gpertea 69 }
1271     }
1272 gpertea 137 */
1273     //vector<CigarOp> splcigar;
1274     //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1275     //splcigar.push_back(CigarOp(MATCH, left_match_length));
1276     //splcigar.push_back(CigarOp(INS, insertion_match_length));
1277     //splcigar.push_back(CigarOp(MATCH, right_match_length));
1278 gpertea 69
1279     bh = create_hit(name,
1280     contig,
1281     left,
1282 gpertea 70 //splcigar,
1283     splcigar,
1284     sam_flag & 0x0010,
1285 gpertea 69 junction_strand == "rev",
1286 gpertea 70 spl_num_mismatches,
1287 gpertea 69 0,
1288     end);
1289     return true;
1290 gpertea 70 } //"ins"
1291 gpertea 137 else //"del" or intron
1292 gpertea 69 {
1293 gpertea 137 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1294     int spliced_read_len = strlen(seq_str); //FIXME TODO: use CIGAR instead
1295     int left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1296 gpertea 69 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1297 gpertea 137 int right_splice_pos = spliced_read_len - left_splice_pos;
1298 gpertea 69
1299     if (right_splice_pos <= 0 || left_splice_pos <= 0)
1300     return false;
1301    
1302     if ((sam_flag & 0x0010) == 0) //######
1303     {
1304 gpertea 135 if (left_splice_pos + seg_offset < _anchor_length)
1305     return false;
1306     }
1307     else
1308     {
1309     if (right_splice_pos + seg_offset < _anchor_length)
1310     return false;
1311     }
1312 gpertea 69 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1313     //atoi(toks[right_window_edge_field].c_str());
1314     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1315    
1316     int mismatches_in_anchor = 0;
1317 gpertea 137 for (size_t i=0;i<mismatches.size();++i) {
1318     spl_num_mismatches++;
1319     int mismatch_pos = mismatches[i];
1320     if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1321     ((sam_flag & 0x0010) != 0 &&
1322 gpertea 70 abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1323     mismatches_in_anchor++;
1324 gpertea 137 }
1325 gpertea 69
1326     // FIXME: we probably should exclude these hits somewhere, but this
1327     // isn't the right place
1328 gpertea 70 vector<CigarOp> splcigar;
1329     CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1330 gpertea 137 //TODO: FIXME: use relative offsets for this call instead, like for the INS case
1331     spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left, left_splice_pos, gap_len, opcode);
1332 gpertea 70 /*
1333     splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1334 gpertea 69 if(toks[num_extra_toks + junction_type_field] == "del"){
1335 gpertea 70 splcigar.push_back(CigarOp(DEL, gap_len));
1336 gpertea 69 }else{
1337 gpertea 70 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1338 gpertea 69 }
1339 gpertea 70 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1340     */
1341 gpertea 69 bh = create_hit(name,
1342     contig,
1343     left,
1344 gpertea 70 splcigar,
1345     (sam_flag & 0x0010),
1346 gpertea 69 junction_strand == "rev",
1347 gpertea 70 spl_num_mismatches,
1348 gpertea 69 mismatches_in_anchor,
1349     end);
1350     return true;
1351     }
1352 gpertea 70 } //parse splice data
1353 gpertea 69 else
1354     {
1355     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1356     //fprintf(stderr, "%s\n", orig_bwt_buf);
1357     // continue;
1358     return false;
1359     }
1360    
1361     return false;
1362     }
1363    
1364    
1365 gpertea 135
1366 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1367     BowtieHit& bh, bool strip_slash,
1368     char* name_out, char* name_tags,
1369     char* seq, char* qual) {
1370     if (_sam_header==NULL)
1371     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1372     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1373    
1374    
1375     uint32_t sam_flag = hit_buf->core.flag;
1376    
1377     int text_offset = hit_buf->core.pos;
1378     int text_mate_pos = hit_buf->core.mpos;
1379     int target_id = hit_buf->core.tid;
1380     int mate_target_id = hit_buf->core.mtid;
1381    
1382     vector<CigarOp> cigar;
1383     bool spliced_alignment = false;
1384     int num_hits = 1;
1385    
1386     double mapQ = hit_buf->core.qual;
1387     long double error_prob;
1388     if (mapQ > 0)
1389     {
1390     long double p = (-1.0 * mapQ) / 10.0;
1391     error_prob = pow(10.0L, p);
1392     }
1393     else
1394     {
1395     error_prob = 1.0;
1396     }
1397    
1398     //header->target_name[c->tid]
1399    
1400     bool end = true;
1401     unsigned int seg_offset = 0;
1402     unsigned int seg_num = 0;
1403     unsigned int num_segs = 0;
1404     // Copy the tag out of the name field before we might wipe it out
1405     char* qname = bam1_qname(hit_buf);
1406     char* pipe = strrchr(qname, '|');
1407     if (pipe)
1408     {
1409     if (name_tags)
1410     strcpy(name_tags, pipe);
1411    
1412     char* tag_buf = pipe + 1;
1413     if (strchr(tag_buf, ':'))
1414     {
1415     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1416     if (seg_num + 1 == num_segs)
1417     end = true;
1418     else
1419     end = false;
1420     }
1421    
1422     *pipe = 0;
1423     }
1424    
1425    
1426     if (target_id < 0) {
1427     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1428     bh = create_hit(qname,
1429     "*", //ref_name
1430     0, //left coord
1431     0, //read_len
1432     false, //antisense_aln
1433     0, //edit_dist
1434     end);
1435     return true;
1436     }
1437    
1438     //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1439     string text_name = _sam_header->target_name[target_id];
1440     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1441     {
1442     //char* t;
1443    
1444     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1445     if (length <= 0)
1446     {
1447     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1448     return false;
1449     }
1450    
1451     CigarOpCode opcode;
1452     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1453     {
1454     case BAM_CMATCH: opcode = MATCH; break;
1455     case BAM_CINS: opcode = INS; break;
1456     case BAM_CDEL: opcode = DEL; break;
1457     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1458     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1459     case BAM_CPAD: opcode = PAD; break;
1460     case BAM_CREF_SKIP:
1461     opcode = REF_SKIP;
1462     spliced_alignment = true;
1463     if (length > (int)max_report_intron_length)
1464     {
1465     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1466     return false;
1467     }
1468     break;
1469     default:
1470     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1471     return false;
1472     }
1473     if (opcode != HARD_CLIP)
1474     cigar.push_back(CigarOp(opcode, length));
1475     }
1476    
1477     string mrnm;
1478     if (mate_target_id >= 0) {
1479     if (mate_target_id == target_id) {
1480     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1481     mrnm = _sam_header->target_name[mate_target_id];
1482     }
1483     else {
1484     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1485     return false;
1486     }
1487     }
1488     else {
1489     text_mate_pos = 0;
1490     }
1491     //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1492     bool antisense_splice=false;
1493     unsigned char num_mismatches = 0;
1494     unsigned char num_splice_anchor_mismatches = 0;
1495    
1496     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1497     if (ptr) {
1498     char src_strand_char = bam_aux2A(ptr);
1499     if (src_strand_char == '-')
1500     antisense_splice = true;
1501     // else if (src_strand_char == '+')
1502     // source_strand = CUFF_FWD;
1503     }
1504    
1505     ptr = bam_aux_get(hit_buf, "NM");
1506     if (ptr) {
1507     num_mismatches = bam_aux2i(ptr);
1508     }
1509    
1510     ptr = bam_aux_get(hit_buf, "NH");
1511     if (ptr) {
1512     num_hits = bam_aux2i(ptr);
1513     }
1514    
1515     //bool antisense_aln = bam1_strand(hit_buf);
1516    
1517     //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1518     // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1519     if (spliced_alignment) {
1520     //if (source_strand == CUFF_STRAND_UNKNOWN) {
1521     // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1522     // }
1523     bh = create_hit(qname,
1524     text_name,
1525     text_offset, // BAM files are 0-indexed
1526     cigar,
1527     sam_flag & 0x0010,
1528     antisense_splice,
1529     num_mismatches,
1530     num_splice_anchor_mismatches,
1531     end);
1532    
1533     }
1534     else {
1535     //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1536     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1537     bh = create_hit(qname,
1538     text_name,
1539     text_offset, // BAM files are 0-indexed
1540     cigar,
1541     sam_flag & 0x0010,
1542     false,
1543     num_mismatches,
1544     0,
1545     end);
1546     }
1547     if (seq!=NULL) {
1548     char *bseq = (char*)bam1_seq(hit_buf);
1549     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1550     char v = bam1_seqi(bseq,i);
1551     seq[i]=bam_nt16_rev_table[v];
1552     }
1553     seq[hit_buf->core.l_qseq]=0;
1554     }
1555     if (qual!=NULL) {
1556     char *bq = (char*)bam1_qual(hit_buf);
1557     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1558     qual[i]=bq[i]+33;
1559     }
1560     qual[hit_buf->core.l_qseq]=0;
1561     }
1562     return true;
1563     }
1564    
1565     bool BAMHitFactory::inspect_header(HitStream& hs)
1566     {
1567     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1568    
1569     if (header == NULL) {
1570     fprintf(stderr, "Warning: No BAM header\n");
1571     return false;
1572     }
1573     if (header->l_text == 0) {
1574     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1575     return false;
1576     }
1577     return true;
1578     }
1579    
1580    
1581     void get_mapped_reads(FILE* bwtf,
1582     HitTable& hits,
1583     HitFactory& hit_factory,
1584     bool strip_slash,
1585     bool verbose)
1586     {
1587    
1588    
1589     char bwt_buf[2048];
1590     uint32_t reads_extracted = 0;
1591    
1592     while (fgets(bwt_buf, 2048, bwtf))
1593     {
1594     // Chomp the newline
1595     char* nl = strrchr(bwt_buf, '\n');
1596     if (nl) *nl = 0;
1597     if (*bwt_buf == 0)
1598     continue;
1599     // Get a new record from the tab-delimited Bowtie map
1600     BowtieHit bh;
1601     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1602     {
1603     // Only check uniqueness if these hits are spliced
1604     hits.add_hit(bh, true);
1605     }
1606     reads_extracted++;
1607     }
1608    
1609     // This will sort the map by insert id.
1610     hits.finalize();
1611     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1612     }
1613    
1614    
1615     /*
1616     AlignStatus status(const BowtieHit* align)
1617     {
1618     if (!align)
1619     return UNALIGNED;
1620     if (align->contiguous())
1621     return CONTIGUOUS;
1622     return SPLICED;
1623     }
1624     */
1625    
1626     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1627     {
1628     int max_hit_pos = -1;
1629     for (size_t i = 0; i < hits.size(); ++i)
1630     {
1631     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1632     }
1633    
1634     if ((int)DoC.size() < max_hit_pos)
1635     DoC.resize(max_hit_pos);
1636    
1637     for (size_t i = 0; i < hits.size(); ++i)
1638     {
1639     const BowtieHit& bh = hits[i];
1640    
1641     // split up the coverage contibution for this reads
1642     size_t j = bh.left();
1643     const vector<CigarOp>& cigar = bh.cigar();
1644    
1645     for (size_t c = 0 ; c < cigar.size(); ++c)
1646     {
1647     switch(cigar[c].opcode)
1648     {
1649     case MATCH:
1650     for (size_t m = 0; m < cigar[c].length; ++m)
1651     {
1652     if (DoC[j + m] < 0xFFFF)
1653     DoC[j + m]++;
1654     }
1655     //fall through this case to REF_SKIP is intentional
1656     case REF_SKIP:
1657     j += cigar[c].length;
1658     break;
1659     default:
1660     break;
1661     }
1662    
1663     }
1664     }
1665     }
1666    
1667     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1668     {
1669     if ((int)DoC.size() < bh.right())
1670     DoC.resize(bh.right());
1671    
1672     // split up the coverage contibution for this reads
1673     size_t j = bh.left();
1674     const vector<CigarOp>& cigar = bh.cigar();
1675    
1676     for (size_t c = 0 ; c < cigar.size(); ++c)
1677     {
1678     switch(cigar[c].opcode)
1679     {
1680     case MATCH:
1681     for (size_t m = 0; m < cigar[c].length; ++m)
1682     {
1683 gpertea 135 if (DoC[j + m] < VMAXINT32)
1684 gpertea 29 DoC[j + m]++;
1685     }
1686     //fall through this case to REF_SKIP is intentional
1687     case REF_SKIP:
1688     j += cigar[c].length;
1689     break;
1690     default:
1691     break;
1692     }
1693    
1694     }
1695     }
1696    
1697     void print_hit(FILE* fout,
1698     const char* read_name,
1699     const BowtieHit& bh,
1700     const char* ref_name,
1701     const char* sequence,
1702     const char* qualities,
1703     bool from_bowtie)
1704     {
1705     string seq;
1706     string quals;
1707     if (sequence)
1708     {
1709     seq = sequence;
1710     quals = qualities;
1711     seq.resize(bh.read_len());
1712     quals.resize(bh.read_len());
1713     }
1714     else
1715     {
1716     seq = "*";
1717     }
1718    
1719     if (qualities)
1720     {
1721     quals = qualities;
1722     quals.resize(bh.read_len());
1723     }
1724     else
1725     {
1726     quals = "*";
1727     }
1728    
1729     uint32_t sam_flag = 0;
1730     if (bh.antisense_align())
1731     {
1732     sam_flag |= 0x0010; // BAM_FREVERSE
1733     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1734     {
1735     reverse_complement(seq);
1736     reverse(quals.begin(), quals.end());
1737     }
1738     }
1739    
1740     uint32_t sam_pos = bh.left() + 1;
1741     uint32_t map_quality = 255;
1742     char cigar[256];
1743     cigar[0] = 0;
1744     string mate_ref_name = "*";
1745     uint32_t mate_pos = 0;
1746     uint32_t insert_size = 0;
1747     //string qualities = "*";
1748    
1749     const vector<CigarOp>& bh_cigar = bh.cigar();
1750    
1751     /*
1752     * In addition to calculating the cigar string,
1753     * we need to figure out how many in/dels are in the
1754     * sequence, so that we can give the correct
1755     * value for the NM tag
1756     */
1757     int indel_distance = 0;
1758     for (size_t c = 0; c < bh_cigar.size(); ++c)
1759     {
1760     char ibuf[64];
1761     sprintf(ibuf, "%d", bh_cigar[c].length);
1762     switch(bh_cigar[c].opcode)
1763     {
1764     case MATCH:
1765     strcat(cigar, ibuf);
1766     strcat(cigar, "M");
1767     break;
1768     case INS:
1769     strcat(cigar, ibuf);
1770     strcat(cigar, "I");
1771     indel_distance += bh_cigar[c].length;
1772     break;
1773     case DEL:
1774     strcat(cigar, ibuf);
1775     strcat(cigar, "D");
1776     indel_distance += bh_cigar[c].length;
1777     break;
1778     case REF_SKIP:
1779     strcat(cigar, ibuf);
1780     strcat(cigar, "N");
1781     break;
1782     default:
1783     break;
1784     }
1785     }
1786    
1787     //string q = string(bh.read_len(), '!');
1788     //string s = string(bh.read_len(), 'N');
1789    
1790     fprintf(fout,
1791     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1792     read_name,
1793     sam_flag,
1794     ref_name,
1795     sam_pos,
1796     map_quality,
1797     cigar,
1798     mate_ref_name.c_str(),
1799     mate_pos,
1800     insert_size,
1801     seq.c_str(),
1802     quals.c_str());
1803    
1804     if (!sam_readgroup_id.empty())
1805     fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1806    
1807     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1808    
1809     bool containsSplice = false;
1810     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1811     if(itr->opcode == REF_SKIP){
1812     containsSplice = true;
1813     break;
1814     }
1815     }
1816    
1817     if (containsSplice)
1818     fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1819    
1820     fprintf(fout, "\n");
1821     }
1822    
1823     void print_bamhit(GBamWriter& wbam,
1824     const char* read_name,
1825     const BowtieHit& bh,
1826     const char* ref_name,
1827     const char* sequence,
1828     const char* qualities,
1829     bool from_bowtie)
1830     {
1831     string seq;
1832     string quals;
1833     if (sequence) {
1834     seq = sequence;
1835     quals = qualities;
1836     seq.resize(bh.read_len());
1837     quals.resize(bh.read_len());
1838     }
1839     else {
1840     seq = "*";
1841     }
1842     if (qualities) {
1843     quals = qualities;
1844     quals.resize(bh.read_len());
1845     }
1846     else
1847     {
1848     quals = "*";
1849     }
1850    
1851     uint32_t sam_flag = 0;
1852     if (bh.antisense_align())
1853     {
1854     sam_flag |= 0x0010; // BAM_FREVERSE
1855     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1856     {
1857     reverse_complement(seq);
1858     reverse(quals.begin(), quals.end());
1859     }
1860     }
1861    
1862     uint32_t sam_pos = bh.left() + 1;
1863     uint32_t map_quality = 255;
1864     char cigar[256];
1865     cigar[0] = 0;
1866     string mate_ref_name = "*";
1867     uint32_t mate_pos = 0;
1868     uint32_t insert_size = 0;
1869     //string qualities = "*";
1870    
1871     const vector<CigarOp>& bh_cigar = bh.cigar();
1872     /*
1873     * In addition to calculating the cigar string,
1874     * we need to figure out how many in/dels are in the
1875     * sequence, so that we can give the correct
1876     * value for the NM tag
1877     */
1878     int indel_distance = 0;
1879     for (size_t c = 0; c < bh_cigar.size(); ++c)
1880     {
1881     char ibuf[64];
1882     sprintf(ibuf, "%d", bh_cigar[c].length);
1883     switch(bh_cigar[c].opcode)
1884     {
1885     case MATCH:
1886     strcat(cigar, ibuf);
1887     strcat(cigar, "M");
1888     break;
1889     case INS:
1890     strcat(cigar, ibuf);
1891     strcat(cigar, "I");
1892     indel_distance += bh_cigar[c].length;
1893     break;
1894     case DEL:
1895     strcat(cigar, ibuf);
1896     strcat(cigar, "D");
1897     indel_distance += bh_cigar[c].length;
1898     break;
1899     case REF_SKIP:
1900     strcat(cigar, ibuf);
1901     strcat(cigar, "N");
1902     break;
1903     default:
1904     break;
1905     }
1906     }
1907    
1908     //string q = string(bh.read_len(), '!');
1909     //string s = string(bh.read_len(), 'N');
1910     /*
1911     fprintf(fout,
1912     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1913     read_name,
1914     sam_flag,
1915     ref_name,
1916     sam_pos,
1917     map_quality,
1918     cigar,
1919     mate_ref_name.c_str(),
1920     mate_pos,
1921     insert_size,
1922     seq.c_str(),
1923     quals.c_str());
1924    
1925     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1926     */
1927     vector<string> auxdata;
1928    
1929     if (!sam_readgroup_id.empty())
1930     {
1931     string nm("RG:Z:");
1932     nm += sam_readgroup_id;
1933     auxdata.push_back(nm);
1934     }
1935    
1936     string nm("NM:i:");
1937     str_appendInt(nm, bh.edit_dist() + indel_distance);
1938     auxdata.push_back(nm);
1939     bool containsSplice = false;
1940     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1941     if(itr->opcode == REF_SKIP){
1942     containsSplice = true;
1943     break;
1944     }
1945    
1946     if (containsSplice) {
1947     //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1948     nm="XS:A:";
1949     nm+=(char)(bh.antisense_splice() ? '-' : '+');
1950     auxdata.push_back(nm);
1951     }
1952    
1953     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1954     cigar, mate_ref_name.c_str(), mate_pos,
1955     insert_size, seq.c_str(), quals.c_str(), &auxdata);
1956    
1957    
1958    
1959     wbam.write(brec);
1960     delete brec;
1961     }
1962    
1963     /**
1964     * Print a vector of cigar operations to a file.
1965     * @param bh_cigar A vector of CigarOps
1966     * @return a string representation of the cigar string
1967     */
1968     std::string print_cigar(vector<CigarOp>& bh_cigar){
1969     char cigar[256];
1970     cigar[0] = 0;
1971     for (size_t c = 0; c < bh_cigar.size(); ++c)
1972     {
1973     char ibuf[64];
1974     sprintf(ibuf, "%d", bh_cigar[c].length);
1975     switch(bh_cigar[c].opcode)
1976     {
1977     case MATCH:
1978     strcat(cigar, ibuf);
1979     strcat(cigar, "M");
1980     break;
1981     case INS:
1982     strcat(cigar, ibuf);
1983     strcat(cigar, "I");
1984     break;
1985     case DEL:
1986     strcat(cigar, ibuf);
1987     strcat(cigar, "D");
1988     break;
1989     case REF_SKIP:
1990     strcat(cigar, ibuf);
1991     strcat(cigar, "N");
1992     break;
1993     default:
1994     break;
1995     }
1996     }
1997     string result(cigar);
1998     return result;
1999     }
2000    
2001     bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
2002     {
2003     RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
2004     if (!ref_str)
2005     return false;
2006    
2007     const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
2008    
2009     size_t pos_seq = 0;
2010     size_t pos_ref = 0;
2011     size_t mismatch = 0;
2012     for (size_t i = 0; i < _cigar.size(); ++i)
2013     {
2014     CigarOp cigar = _cigar[i];
2015     switch(cigar.opcode)
2016     {
2017     case MATCH:
2018     {
2019     for (size_t j = 0; j < cigar.length; ++j)
2020     {
2021     if (_seq[pos_seq++] != ref_seq[pos_ref++])
2022     ++mismatch;
2023     }
2024     }
2025     break;
2026     case INS:
2027     {
2028     pos_seq += cigar.length;
2029     }
2030     break;
2031    
2032     case DEL:
2033     case REF_SKIP:
2034     {
2035     pos_ref += cigar.length;
2036     }
2037     break;
2038    
2039     default:
2040     break;
2041     }
2042     }
2043    
2044     return mismatch == _edit_dist;
2045     }