ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (12 years, 7 months ago) by gpertea
File size: 80629 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include <seqan/sequence.h>
22 #include <seqan/find.h>
23 #include <seqan/file.h>
24 #include <seqan/modifier.h>
25
26 #include "common.h"
27 #include "bwt_map.h"
28 #include "tokenize.h"
29 #include "reads.h"
30
31 using namespace std;
32
33 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
34 {
35 uint32_t reference_id = bh.ref_id();
36
37 pair<RefHits::iterator, bool> ret =
38 _hits_for_ref.insert(make_pair(reference_id, HitList()));
39 HitList& hl = ret.first->second;
40
41 if (check_uniqueness)
42 {
43 // Check uniqueness, in case we are adding spliced hits from
44 // several spliced alignment sources (e.g. de novo hashing + Bowtie
45 // against a user-supplied index). We don't want to count the same
46 // alignment twice if it happened to be found by more than one method
47 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
48 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
49 for (; lb != ub && lb != hl.end(); ++lb)
50 {
51 if (*lb == bh)
52 {
53 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
54 return;
55 }
56
57 if (lb->insert_id() == bh.insert_id() &&
58 lb->ref_id() == bh.ref_id() &&
59 lb->antisense_align() == bh.antisense_align())
60 {
61 // If we get here, we may be looking at the same alignment
62 // However, spanning_reads may report a shorter, trimmed alignment
63 // so not all fields will be equal. If they just disagree on the
64 // ends, and don't indicate a different junction coord, the
65 // alignments are the same.
66
67 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
68 (bh.left() <= lb->left() && bh.right() >= lb->right()))
69 {
70 vector<pair<int,int> > lb_gaps, bh_gaps;
71 lb->gaps(lb_gaps);
72 bh.gaps(bh_gaps);
73 if (lb_gaps == bh_gaps)
74 {
75 // One alignment is contained in the other, they agree on
76 // where the gaps, if any, are, and they share an id
77 // => this is a redundant aligment, so toss it
78 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
79 return;
80 }
81 }
82 }
83 }
84 }
85 _total_hits++;
86 hl.push_back(bh);
87 }
88
89 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
90 {
91 return h1.insert_id() < h2.insert_id();
92 }
93
94 void LineHitFactory::openStream(HitStream& hs)
95 {
96 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
97 //open the file for HitStream here
98 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
99 if (hs._hit_file==NULL)
100 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
101 return;
102 }
103 if (hs._fzpipe!=NULL) {
104 hs._hit_file=hs._fzpipe->file;
105 }
106 }
107
108 void LineHitFactory::rewind(HitStream& hs)
109 {
110 if (hs._fzpipe!=NULL) {
111 hs._fzpipe->rewind();
112 hs._hit_file=hs._fzpipe->file;
113 }
114 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
115 }
116
117 void LineHitFactory::seek(HitStream& hs, int64_t offset)
118 {
119 // daehwan - implement this later
120 if (hs._fzpipe != NULL) {
121 hs._fzpipe->seek(offset);
122 hs._hit_file=hs._fzpipe->file;
123 }
124 // else if (hs._hit_file) ::seek((FILE*)(hs._hit_file));
125 }
126
127 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
128 FILE* f=(FILE *)(hs._hit_file);
129 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
130 if (!new_rec || feof(f)) {
131 hs._eof=true;
132 return false;
133 }
134 ++_line_num;
135 char* nl = strrchr(_hit_buf, '\n');
136 if (nl) *nl = 0;
137 buf = _hit_buf;
138 buf_size = _hit_buf_max_sz - 1;
139 return true;
140 }
141
142 void LineHitFactory::closeStream(HitStream& hs) {
143 if (hs._fzpipe!=NULL) {
144 hs._fzpipe->close();
145 return;
146 }
147 if (hs._hit_file!=NULL) {
148 fclose((FILE*)(hs._hit_file));
149 hs._hit_file=NULL;
150 }
151 }
152 void BAMHitFactory::openStream(HitStream& hs) {
153 if (hs._hit_file==NULL) {
154 if (hs._hit_file_name.empty())
155 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
156 return; //invalid stream, could be just a place holder
157 //open the file here if not already open
158 string fext=getFext(hs._hit_file_name);
159 if (fext=="sam")
160 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
161 else
162 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
163
164 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
165
166 if (sam_file == NULL)
167 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
168 if (sam_file->header == NULL)
169 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
170 memset(&_next_hit, 0, sizeof(_next_hit));
171 //_beginning = bgzf_tell(sam_file->x.bam);
172 _sam_header=sam_file->header;
173 if (inspect_header(hs) == false)
174 err_die("Error: invalid SAM header for file %s\n",
175 hs._hit_file_name.c_str());
176 }
177 }
178
179 void BAMHitFactory::closeStream(HitStream& hs) {
180 if (hs._hit_file) {
181 samclose((samfile_t*)(hs._hit_file));
182 }
183 hs._hit_file=NULL;
184 _sam_header=NULL;
185 }
186
187 void BAMHitFactory::rewind(HitStream& hs)
188 {
189 /*
190 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
191 {
192 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
193 _eof = false;
194 }
195 */
196 this->closeStream(hs);
197 this->openStream(hs);
198 }
199
200 void BAMHitFactory::seek(HitStream& hs, int64_t offset)
201 {
202 if (hs._hit_file) {
203 bgzf_seek(((samfile_t*)hs._hit_file)->x.bam, offset, SEEK_SET);
204 }
205 }
206
207 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
208 const bam1_t* bamrec=(const bam1_t*)hit_buf;
209 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
210 string sam_line(tamline);
211 free(tamline);
212 return sam_line;
213 }
214
215 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
216 if (_next_hit.data) {
217 free(_next_hit.data);
218 _next_hit.data = NULL;
219 }
220 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
221 if (hs.eof() || !hs.ready()) return false;
222
223 //mark_curr_pos();
224
225 memset(&_next_hit, 0, sizeof(_next_hit));
226
227 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
228 if (bytes_read <= 0) {
229 hs._eof = true;
230 return false;
231 }
232 buf = (const char*)&_next_hit;
233 buf_size = bytes_read;
234 return true;
235 }
236
237
238 BowtieHit HitFactory::create_hit(const string& insert_name,
239 const string& ref_name,
240 const string& ref_name2,
241 int left,
242 const vector<CigarOp>& cigar,
243 bool antisense_aln,
244 bool antisense_splice,
245 unsigned char edit_dist,
246 unsigned char splice_mms,
247 bool end)
248 {
249 uint64_t insert_id = _insert_table.get_id(insert_name);
250
251 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
252 uint32_t reference_id2 = reference_id;
253
254 if (ref_name2.length() > 0)
255 reference_id2 = _ref_table.get_id(ref_name2, NULL, 0);
256
257 return BowtieHit(reference_id,
258 reference_id2,
259 insert_id,
260 left,
261 cigar,
262 antisense_aln,
263 antisense_splice,
264 edit_dist,
265 splice_mms,
266 end);
267 }
268
269 BowtieHit HitFactory::create_hit(const string& insert_name,
270 const string& ref_name,
271 uint32_t left,
272 uint32_t read_len,
273 bool antisense_aln,
274 unsigned char edit_dist,
275 bool end)
276 {
277 uint64_t insert_id = _insert_table.get_id(insert_name);
278 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
279
280 return BowtieHit(reference_id,
281 reference_id,
282 insert_id,
283 left,
284 read_len,
285 antisense_aln,
286 edit_dist,
287 end);
288 }
289
290 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
291 BowtieHit& bh,
292 bool strip_slash,
293 char* name_out,
294 char* name_tags,
295 char* seq,
296 char* qual)
297 {
298 if (!orig_bwt_buf || !*orig_bwt_buf)
299 return false;
300
301 static const int buf_size = 2048;
302
303 char bwt_buf[buf_size];
304 strcpy(bwt_buf, orig_bwt_buf);
305 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
306
307 char orientation;
308
309 //int bwtf_ret = 0;
310 //uint32_t seqid = 0;
311
312 char text_name[buf_size];
313 unsigned int text_offset;
314
315
316 //unsigned int other_occs;
317 char mismatches[buf_size];
318 //memset(mismatches, 0, sizeof(mismatches));
319
320 const char* buf = bwt_buf;
321 char* name = get_token((char**)&buf,"\t");
322 char* orientation_str = get_token((char**)&buf,"\t");
323 char* text_name_str = get_token((char**)&buf,"\t");
324
325 strcpy(text_name, text_name_str);
326
327 char* text_offset_str = get_token((char**)&buf,"\t");
328 char* seq_str = get_token((char**)&buf,"\t");
329 if (seq)
330 strcpy(seq, seq_str);
331
332 const char* qual_str = get_token((char**)&buf,"\t");
333 if (qual)
334 strcpy(qual, qual_str);
335
336 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
337 mismatches[0] = 0;
338 char* mismatches_str = get_token((char**)&buf, "\t");
339 if (mismatches_str)
340 strcpy(mismatches, mismatches_str);
341
342 orientation = orientation_str[0];
343 text_offset = atoi(text_offset_str);
344
345 bool end = true;
346 unsigned int seg_offset = 0;
347 unsigned int seg_num = 0;
348 unsigned int num_segs = 0;
349
350 // Copy the tag out of the name field before we might wipe it out
351 char* pipe = strrchr(name, '|');
352 if (pipe)
353 {
354 if (name_tags)
355 strcpy(name_tags, pipe);
356
357 char* tag_buf = pipe + 1;
358 if (strchr(tag_buf, ':'))
359 {
360 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
361 if (seg_num + 1 == num_segs)
362 end = true;
363 else
364 end = false;
365 }
366
367 *pipe = 0;
368 }
369 // Stripping the slash and number following it gives the insert name
370 char* slash = strrchr(name, '/');
371 if (strip_slash && slash)
372 *slash = 0;
373
374 size_t read_len = strlen(seq_str);
375
376 // Add this alignment to the table of hits for this half of the
377 // Bowtie map
378
379 //vector<string> mismatch_toks;
380 char* pch = strtok (mismatches,",");
381 unsigned char num_mismatches = 0;
382 while (pch != NULL)
383 {
384 char* colon = strchr(pch, ':');
385 if (colon)
386 {
387 num_mismatches++;
388 }
389 //mismatch_toks.push_back(pch);
390 pch = strtok (NULL, ",");
391 }
392
393 bh = create_hit(name,
394 text_name,
395 text_offset,
396 (int)read_len,
397 orientation == '-',
398 num_mismatches,
399 end);
400
401 return true;
402 }
403
404 int anchor_mismatch = 0;
405
406
407 void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
408 bool &end, unsigned int &seg_offset, unsigned int& seg_num,
409 unsigned int & num_segs) {
410 char* pipe = strrchr(name, '|');
411 if (pipe)
412 {
413 if (name_tags)
414 strcpy(name_tags, pipe);
415
416 char* tag_buf = pipe + 1;
417 if (strchr(tag_buf, ':'))
418 {
419 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
420 if (seg_num + 1 == num_segs)
421 end = true;
422 else
423 end = false;
424 }
425
426 *pipe = 0;
427 }
428
429 // Stripping the slash and number following it gives the insert name
430 char* slash = strrchr(name, '/');
431 if (strip_slash && slash)
432 *slash = 0;
433 }
434
435
436 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
437 BowtieHit& bh,
438 bool strip_slash,
439 char* name_out,
440 char* name_tags,
441 char* seq,
442 char* qual)
443 {
444 if (!orig_bwt_buf || !*orig_bwt_buf)
445 return false;
446
447 static const int buf_size = 2048;
448
449 char bwt_buf[buf_size];
450 strcpy(bwt_buf, orig_bwt_buf);
451 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
452
453 char orientation;
454 char text_name[buf_size];
455 unsigned int text_offset;
456 char mismatches[buf_size];
457 //memset(mismatches, 0, sizeof(mismatches));
458
459 char* buf = bwt_buf;
460 char* name = get_token((char**)&buf,"\t");
461 char* orientation_str = get_token((char**)&buf,"\t");
462 char* text_name_str = get_token((char**)&buf,"\t");
463 strcpy(text_name, text_name_str);
464
465 char* text_offset_str = get_token((char**)&buf,"\t");
466 char* seq_str = get_token((char**)&buf,"\t");
467 if (seq)
468 strcpy(seq, seq_str);
469
470 const char* qual_str = get_token((char**)&buf,"\t");
471 if (qual)
472 strcpy(qual, qual_str);
473
474 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
475 mismatches[0] = 0;
476 char* mismatches_str = get_token((char**)&buf, "\t");
477 if (mismatches_str)
478 strcpy(mismatches, mismatches_str);
479
480 orientation = orientation_str[0];
481 text_offset = atoi(text_offset_str);
482
483 bool end = true;
484 unsigned int seg_offset = 0;
485 unsigned int seg_num = 0;
486 unsigned int num_segs = 0;
487
488 // Copy the tag out of the name field before we might wipe it out
489 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
490
491 // Add this alignment to the table of hits for this half of the
492 // Bowtie map
493
494 // Parse the text_name field to recover the splice coords
495 vector<string> toks;
496
497 tokenize_strict(text_name, "|", toks);
498
499 int num_extra_toks = (int)toks.size() - 6;
500
501 if (num_extra_toks >= 0)
502 {
503 static const uint8_t left_window_edge_field = 1;
504 static const uint8_t splice_field = 2;
505 //static const uint8_t right_window_edge_field = 3;
506 static const uint8_t junction_type_field = 4;
507 static const uint8_t strand_field = 5;
508
509 string contig = toks[0];
510 for (int t = 1; t <= num_extra_toks; ++t)
511 {
512 contig += "|";
513 contig += toks[t];
514 }
515
516 vector<string> splice_toks;
517 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
518 if (splice_toks.size() != 2)
519 {
520 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
521 //fprintf(stderr, "%s (token: %s)\n", text_name,
522 // toks[num_extra_toks + splice_field].c_str());
523 return false;
524 }
525
526 //
527 // check for an insertion hit
528 //
529 if(toks[num_extra_toks + junction_type_field] == "ins")
530 {
531 int8_t spliced_read_len = strlen(seq_str);
532 /*
533 * The 0-based position of the left edge of the alignment. Note that this
534 * value may need to be futher corrected to account for the presence of
535 * of the insertion.
536 */
537 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
538 uint32_t right = left + spliced_read_len - 1;
539
540
541 /*
542 * The 0-based position of the last genomic sequence before the insertion
543 */
544 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
545
546 string insertedSequence = splice_toks[1];
547 /*
548 * The 0-based position of the first genomic sequence after teh insertion
549 */
550 uint32_t right_splice_pos = left_splice_pos + 1;
551 if(left > left_splice_pos){
552 /*
553 * The genomic position of the left edge of the alignment needs to be corrected
554 * If the alignment does not extend into the insertion, simply subtract the length
555 * of the inserted sequence, otherwise, just set it equal to the right edge
556 */
557 left = left - insertedSequence.length();
558 if(left < right_splice_pos){
559 left = right_splice_pos;
560 }
561 }
562 if(right > left_splice_pos){
563 right = right - insertedSequence.length();
564 if(right < left_splice_pos){
565 right = left_splice_pos;
566 }
567 }
568 /*
569 * Now, right and left should be properly transformed into genomic coordinates
570 * We should be able to deduce how much the alignment matches the insertion
571 * simply based on the length of the read
572 */
573 int left_match_length = 0;
574 if(left <= left_splice_pos){
575 left_match_length = left_splice_pos - left + 1;
576 }
577 int right_match_length = 0;
578 if(right >= right_splice_pos){
579 right_match_length = right - right_splice_pos + 1;
580 }
581 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
582
583 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
584 return false;
585
586 string junction_strand = toks[num_extra_toks + strand_field];
587 if(junction_strand != "rev" && junction_strand != "fwd"){
588 fprintf(stderr, "Malformed insertion record\n");
589 return false;
590 }
591
592 char* pch = strtok( mismatches, ",");
593 unsigned char num_mismatches = 0;
594
595 /*
596 * remember that text_offset holds the left end of the
597 * alignment, relative to the start of the contig
598 */
599
600 /*
601 * The 0-based relative position of the left-most character
602 * before the insertion in the contig
603 */
604 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
605 while (pch != NULL)
606 {
607 char* colon = strchr(pch, ':');
608 if (colon)
609 {
610 *colon = 0;
611 int mismatch_pos = atoi(pch);
612
613 /*
614 * for reversely mapped reads,
615 * find the correct mismatched position.
616 */
617 if(orientation == '-'){
618 mismatch_pos = spliced_read_len - mismatch_pos - 1;
619 }
620
621 /*
622 * Only count mismatches outside of the insertion region
623 * If there is a mismatch within the insertion,
624 * disallow this hit
625 */
626 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
627 num_mismatches++;
628 }else{
629 return false;
630 }
631 }
632 pch = strtok (NULL, ",");
633 }
634
635
636 vector<CigarOp> cigar;
637 cigar.push_back(CigarOp(MATCH, left_match_length));
638 cigar.push_back(CigarOp(INS, insertion_match_length));
639 cigar.push_back(CigarOp(MATCH, right_match_length));
640
641 /*
642 * For now, disallow hits that don't span
643 * the insertion. If we allow these types of hits,
644 * then long_spanning.cpp needs to be updated
645 * in order to intelligently merge these kinds
646 * of reads back together
647 *
648 * Following code has been changed to allow segment that end
649 * in an insertion
650 */
651 bh = create_hit(name,
652 contig,
653 "",
654 left,
655 cigar,
656 orientation == '-',
657 junction_strand == "rev",
658 num_mismatches,
659 0,
660 end);
661 return true;
662 }
663
664 else
665 {
666 const string& junction_type = toks[num_extra_toks + junction_type_field];
667 string junction_strand = toks[num_extra_toks + strand_field];
668
669 int spliced_read_len = strlen(seq_str);
670 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str());
671 int8_t left_splice_pos = atoi(splice_toks[0].c_str());
672 if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
673 {
674 left += text_offset;
675 left_splice_pos = left_splice_pos - left + 1;
676 }
677 else
678 {
679 left -= text_offset;
680 left_splice_pos = left - left_splice_pos + 1;
681 }
682
683 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
684 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
685
686 int gap_len = 0;
687 if (junction_type == "fus")
688 gap_len = atoi(splice_toks[1].c_str());
689 else
690 gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
691
692 if (right_splice_pos <= 0 || left_splice_pos <= 0)
693 return false;
694
695 if (orientation == '+')
696 {
697 if (left_splice_pos + seg_offset < _anchor_length){
698 return false;
699 }
700 }
701 else
702 {
703 if (right_splice_pos + seg_offset < _anchor_length)
704 return false;
705 }
706
707 if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")||
708 !(orientation == '-' || orientation == '+'))
709 {
710 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
711 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
712 // junction_strand.c_str(), orientation);
713 return false;
714 }
715
716 char* pch = strtok (mismatches,",");
717 int mismatches_in_anchor = 0;
718 unsigned char num_mismatches = 0;
719 while (pch != NULL)
720 {
721 char* colon = strchr(pch, ':');
722 if (colon)
723 {
724 *colon = 0;
725 num_mismatches++;
726 int mismatch_pos = atoi(pch);
727 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
728 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
729 mismatches_in_anchor++;
730 }
731 pch = strtok (NULL, ",");
732 }
733
734 // FIXME: we probably should exclude these hits somewhere, but this
735 // isn't the right place
736 vector<CigarOp> cigar;
737 if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
738 cigar.push_back(CigarOp(MATCH, left_splice_pos));
739 else
740 cigar.push_back(CigarOp(mATCH, left_splice_pos));
741
742 if(junction_type == "del")
743 cigar.push_back(CigarOp(DEL, gap_len));
744 else if(junction_type == "fus")
745 {
746 if (junction_strand == "ff")
747 cigar.push_back(CigarOp(FUSION_FF, gap_len));
748 else if (junction_strand == "fr")
749 cigar.push_back(CigarOp(FUSION_FR, gap_len));
750 else if (junction_strand == "rf")
751 cigar.push_back(CigarOp(FUSION_RF, gap_len));
752 else
753 cigar.push_back(CigarOp(FUSION_RR, gap_len));
754 }
755 else
756 cigar.push_back(CigarOp(REF_SKIP, gap_len));
757
758 if (junction_type != "fus" || (junction_strand != "fr" && junction_strand != "rr"))
759 cigar.push_back(CigarOp(MATCH, right_splice_pos));
760 else
761 cigar.push_back(CigarOp(mATCH, right_splice_pos));
762
763 string contig2 = "";
764 if (junction_type == "fus")
765 {
766 vector<string> contigs;
767 tokenize(contig, "-", contigs);
768 if (contigs.size() != 2)
769 return false;
770
771 contig = contigs[0];
772 contig2 = contigs[1];
773
774 if (junction_strand == "rf" || junction_strand == "rr")
775 orientation = (orientation == '+' ? '-' : '+');
776 }
777
778 bh = create_hit(name,
779 contig,
780 contig2,
781 left,
782 cigar,
783 orientation == '-',
784 junction_strand == "rev",
785 num_mismatches,
786 mismatches_in_anchor,
787 end);
788 return true;
789 }
790 }
791 else
792 {
793 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
794 //fprintf(stderr, "%s\n", orig_bwt_buf);
795 // continue;
796 return false;
797 }
798
799 return false;
800 }
801
802
803 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
804 bool &spliced_alignment) {
805 const char* p_cig = cigar_str;
806 int refspan=0; //alignment span on reference sequence
807
808 while (*p_cig)
809 {
810 char* t;
811 int op_len = (int)strtol(p_cig, &t, 10);
812 if (op_len <= 0)
813 {
814 fprintf (stderr, "Error: CIGAR op has zero length\n");
815 return 0;
816 }
817 char op_char = toupper(*t);
818 CigarOpCode opcode;
819 switch (op_char) {
820 case '=':
821 case 'X':
822 case 'M': opcode = MATCH;
823 refspan+=op_len;
824 break;
825 case 'I': opcode = INS;
826 break;
827 case 'D': opcode = DEL;
828 refspan+=op_len;
829 break;
830 case 'N': if (op_len > max_report_intron_length)
831 return 0;
832 opcode = REF_SKIP;
833 spliced_alignment = true;
834 refspan+=op_len;
835 break;
836 case 'S': opcode = SOFT_CLIP;
837 break;
838 case 'H': opcode = HARD_CLIP;
839 break;
840 case 'P': opcode = PAD;
841 break;
842 default: fprintf (stderr, "Error: invalid CIGAR operation\n");
843 return 0;
844 }
845 p_cig = t + 1;
846 cigar.push_back(CigarOp(opcode, op_len));
847 } //while cigar codes
848 if (*p_cig) {
849 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
850 return 0;
851 }
852 return refspan;
853 }
854
855 int getBAMmismatches(const bam1_t* buf, vector<CigarOp>& cigar,
856 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
857 int gspan=0;//genomic span of the alignment
858 sam_nm=0;
859 int num_mismatches=0;
860
861 uint8_t* ptr = bam_aux_get(buf, "XS");
862 if (ptr) {
863 char src_strand_char = bam_aux2A(ptr);
864 if (src_strand_char == '-')
865 antisense_splice = true;
866 }
867
868 ptr = bam_aux_get(buf, "MD");
869 if (ptr) {
870 const char* p = bam_aux2Z(ptr);
871 int bi=0; //base offset position in the read
872 while (*p != 0) {
873 if (isdigit(*p)) {
874 int v=atoi(p);
875 do { p++; } while (isdigit(*p));
876 bi+=v;
877 }
878 while (isalpha(*p)) {
879 p++;
880 num_mismatches++;
881 //mismatches.push_back(bi);
882 mismatches[bi]=true;
883 bi++;
884 }
885 if (*p=='^') { //reference deletion
886 p++;
887 while (isalpha(*p)) { //insert read bases
888 p++; bi++;
889 }
890 }
891 }
892 }
893
894 /* By convention,the NM field of the SAM record
895 * counts an insertion or deletion. I dont' think
896 * we want the mismatch count in the BowtieHit
897 * record to reflect this. Therefore, subtract out
898 * the mismatches due to in/dels
899 */
900 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
901 switch (itr->opcode)
902 {
903 case MATCH:
904 case REF_SKIP:
905 case PAD:
906 gspan += itr->length;
907 break;
908 case DEL:
909 gspan += itr->length;
910 sam_nm -= itr->length;
911 break;
912 case INS:
913 sam_nm -= itr->length;
914 break;
915 default:
916 break;
917 }
918 }
919 return num_mismatches;
920 }
921
922 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
923 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
924 int gspan=0;//genomic span of the alignment
925 const char* tag_buf = buf;
926 sam_nm=0;
927 int num_mismatches=0;
928 while((tag_buf = get_token((char**)&buf,"\t")))
929 {
930 vector<string> tuple_fields;
931 tokenize(tag_buf,":", tuple_fields);
932 if (tuple_fields.size() == 3)
933 {
934 if (tuple_fields[0] == "XS")
935 {
936 if (tuple_fields[2] == "-")
937 antisense_splice = true;
938 }
939 else if (tuple_fields[0] == "NM")
940 {
941 sam_nm = atoi(tuple_fields[2].c_str());
942 }
943 else if (tuple_fields[0] == "NS")
944 {
945 //ignored for now
946 }
947 else if (tuple_fields[0] == "MD")
948 {
949 const char* p=tuple_fields[2].c_str();
950 int bi=0; //base offset position in the read
951 while (*p != 0) {
952 if (isdigit(*p)) {
953 int v=atoi(p);
954 do { p++; } while (isdigit(*p));
955 bi+=v;
956 }
957 while (isalpha(*p)) {
958 p++;
959 num_mismatches++;
960 //mismatches.push_back(bi);
961 mismatches[bi]=true;
962 bi++;
963 }
964 if (*p=='^') { //reference deletion
965 p++;
966 while (isalpha(*p)) { //insert read bases
967 p++; bi++;
968 }
969 }
970 }
971 }
972 //else
973 //{
974 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
975 //return false;
976 //}
977 }
978 }
979 /* By convention,the NM field of the SAM record
980 * counts an insertion or deletion. I dont' think
981 * we want the mismatch count in the BowtieHit
982 * record to reflect this. Therefore, subtract out
983 * the mismatches due to in/dels
984 */
985 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
986 switch (itr->opcode)
987 {
988 case MATCH:
989 case REF_SKIP:
990 case PAD:
991 gspan += itr->length;
992 break;
993 case DEL:
994 gspan += itr->length;
995 sam_nm -= itr->length;
996 break;
997 case INS:
998 sam_nm -= itr->length;
999 break;
1000 default:
1001 break;
1002 }
1003 }
1004 return num_mismatches;
1005 }
1006
1007 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1008 BowtieHit& bh,
1009 bool strip_slash,
1010 char* name_out,
1011 char* name_tags,
1012 char* seq,
1013 char* qual)
1014 {
1015 if (!orig_bwt_buf || !*orig_bwt_buf)
1016 return false;
1017 char bwt_buf[2048];
1018 strcpy(bwt_buf, orig_bwt_buf);
1019 // Are we still in the header region?
1020 if (bwt_buf[0] == '@')
1021 return false;
1022
1023 char* buf = bwt_buf;
1024 char* name = get_token((char**)&buf,"\t");
1025 char* sam_flag_str = get_token((char**)&buf,"\t");
1026 char* text_name = get_token((char**)&buf,"\t");
1027 char* text_offset_str = get_token((char**)&buf,"\t");
1028 const char* map_qual_str = get_token((char**)&buf,"\t");
1029 char* cigar_str = get_token((char**)&buf,"\t");
1030 const char* mate_ref_str = get_token((char**)&buf,"\t");
1031 const char* mate_pos_str = get_token((char**)&buf,"\t");
1032 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1033
1034 const char* seq_str = get_token((char**)&buf,"\t");
1035 if (seq)
1036 strcpy(seq, seq_str);
1037 const char* qual_str = get_token((char**)&buf,"\t");
1038 if (qual)
1039 strcpy(qual, qual_str);
1040
1041 if (!name ||
1042 !sam_flag_str ||
1043 !text_name ||
1044 !text_offset_str ||
1045 !map_qual_str ||
1046 !cigar_str ||
1047 !mate_ref_str ||
1048 !mate_pos_str ||
1049 !inferred_insert_sz_str ||
1050 !seq_str ||
1051 !qual_str)
1052 {
1053 // truncated or malformed SAM record
1054 return false;
1055 }
1056
1057 int sam_flag = atoi(sam_flag_str);
1058 string ref_name = text_name, ref_name2 = "";
1059 int text_offset = atoi(text_offset_str);
1060
1061 bool end = true;
1062 unsigned int seg_offset = 0;
1063 unsigned int seg_num = 0;
1064 unsigned int num_segs = 0;
1065
1066 // Copy the tag out of the name field before we might wipe it out
1067 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1068
1069 vector<CigarOp> cigar;
1070 bool spliced_alignment = false;
1071
1072 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
1073 if (refspan==0)
1074 return false;
1075 //vector<string> attributes;
1076 //tokenize(tag_buf, " \t",attributes);
1077
1078 bool antisense_splice = false;
1079 int sam_nm = 0; //the value of the NM tag (edit distance)
1080 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
1081 vector<bool> mismatches;
1082 mismatches.resize(strlen(seq_str), false);
1083 int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
1084
1085 if (spliced_alignment)
1086 {
1087 bh = create_hit(name,
1088 ref_name,
1089 ref_name2,
1090 text_offset - 1,
1091 cigar,
1092 sam_flag & 0x0010,
1093 antisense_splice,
1094 num_mismatches,
1095 0,
1096 end);
1097 }
1098 else
1099 {
1100 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1101 bh = create_hit(name,
1102 ref_name,
1103 ref_name2,
1104 text_offset - 1, // SAM files are 1-indexed
1105 cigar,
1106 sam_flag & 0x0010,
1107 false,
1108 num_mismatches,
1109 0,
1110 end);
1111 }
1112 return true;
1113 }
1114
1115 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
1116 if (op.length<=0) return;
1117 if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
1118 cigar.back().length+=op.length;
1119 }
1120 cigar.push_back(op);
1121 }
1122
1123 int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches,
1124 int &left, int spl_start, int spl_len, CigarOpCode spl_code) {
1125 //merge the original 'cigar' with the new insert/gap operation
1126 //at position spl_start and place the result into splcigar;
1127 //TODO: ideally this should also get and rebuild the MD string (alignment mismatches)
1128 int spl_mismatches=0;
1129 //return value: mismatches in the insert region for INS case,
1130 //or number of mismatches in the anchor region
1131 //return -1 if somehow the hit seems bad
1132
1133 //these offsets are relative to the beginning of alignment on reference
1134 int spl_ofs=abs(spl_start-left); //relative position of splice op
1135 int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
1136 CigarOp gapop(spl_code, spl_len); //for DEL, REF_SKIP, FUSIONS
1137 if (spl_code==INS)
1138 spl_ofs_end += spl_len;
1139 int ref_ofs=0; //working offset on reference
1140 int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
1141 bool xfound=false;
1142 //if (left<=spl_start+spl_len) {
1143 if (spl_ofs_end>0) {
1144 int prev_opcode=0;
1145 int prev_oplen=0;
1146 for (size_t c = 0 ; c < cigar.size(); ++c)
1147 {
1148 int prev_read_ofs=read_ofs;
1149 int cur_op_ofs=ref_ofs;
1150 int cur_opcode=cigar[c].opcode;
1151 int cur_oplen=cigar[c].length;
1152
1153 switch (cur_opcode) {
1154 case MATCH:
1155 ref_ofs+=cur_oplen;
1156 read_ofs+=cur_oplen;
1157 if (spl_code==REF_SKIP || spl_code==DEL ||
1158 spl_code==FUSION_FF || spl_code==FUSION_FR ||
1159 spl_code==FUSION_RF || spl_code==FUSION_RR) {
1160 for (int o=cur_op_ofs;o<ref_ofs;o++) {
1161 int rofs=prev_read_ofs+(o-cur_op_ofs);
1162 if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs])
1163 spl_mismatches++;
1164 }
1165 }
1166 else if (spl_code==INS) {
1167 for (int o=cur_op_ofs;o<ref_ofs;o++) {
1168 int rofs=prev_read_ofs+(o-cur_op_ofs);
1169 if (o>=spl_ofs && o<spl_ofs_end && mismatches[rofs])
1170 spl_mismatches++;
1171 }
1172 }
1173 break;
1174 case DEL:
1175 case REF_SKIP:
1176 case PAD:
1177 ref_ofs+=cur_oplen;
1178 break;
1179 case SOFT_CLIP:
1180 case INS:
1181 read_ofs+=cur_oplen;
1182 break;
1183 //case HARD_CLIP:
1184 }
1185
1186 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1187 if (cur_op_ofs==spl_ofs_end && spl_code!=INS)
1188 {
1189 xfound=true;
1190 //we have to insert the gap here first
1191 cigar_add(splcigar, gapop);
1192 //also, check
1193 }
1194 cigar_add(splcigar, cigar[c]);
1195 }
1196 else //if (ref_ofs>spl_ofs) {
1197 { //op intersection
1198 xfound=true;
1199 if (spl_code==INS) {
1200 //we have to shorten cur_opcode
1201 // find the overlap between current range
1202 int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1203 int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1204
1205 CigarOp op(cigar[c]);
1206 op.length=spl_ofs-cur_op_ofs;
1207 if (spl_ofs>cur_op_ofs)
1208 cigar_add(splcigar, op);
1209 if (spl_ofs<0)
1210 {
1211 CigarOp temp = gapop;
1212 temp.length += spl_ofs;
1213 if (temp.length>0)
1214 cigar_add(splcigar, temp);
1215 }
1216 else
1217 cigar_add(splcigar, gapop);
1218 op.length=ref_ofs-spl_ofs_end;
1219 if (ref_ofs>spl_ofs_end)
1220 cigar_add(splcigar,op);
1221 }
1222 else {//DEL or REF_SKIP or FUSION_[FR][FR]
1223 //spl_ofs == spl_ofs_end
1224 //we have to split cur_opcode
1225 //look for mismatches within min_anchor_len distance from splice point
1226 CigarOp op(cigar[c]);
1227 CigarOpCode opcode = op.opcode;
1228 op.length=spl_ofs-cur_op_ofs;
1229 if (gapop.opcode == FUSION_RF || gapop.opcode == FUSION_RR)
1230 {
1231 if (opcode == MATCH)
1232 op.opcode = mATCH;
1233 else if (opcode == INS)
1234 op.opcode = iNS;
1235 else if (opcode == DEL)
1236 op.opcode = dEL;
1237 else if (opcode == REF_SKIP)
1238 op.opcode = rEF_SKIP;
1239 }
1240 cigar_add(splcigar, op);
1241
1242 cigar_add(splcigar, gapop);
1243
1244 op.opcode = opcode;
1245 if (gapop.opcode == FUSION_FR || gapop.opcode == FUSION_RR)
1246 {
1247 if (opcode == MATCH)
1248 op.opcode = mATCH;
1249 else if (opcode == INS)
1250 op.opcode = iNS;
1251 else if (opcode == DEL)
1252 op.opcode = dEL;
1253 else if (opcode == REF_SKIP)
1254 op.opcode = rEF_SKIP;
1255 }
1256 op.length=ref_ofs-spl_ofs;
1257 cigar_add(splcigar,op);
1258 }
1259 } //op intersection
1260 prev_opcode=cur_opcode;
1261 prev_oplen=cur_oplen;
1262 } //for each cigar opcode
1263 } //intersection possible
1264
1265 //if (!xfound) {//no intersection found between splice event and alignment
1266 if (spl_ofs_end<=0) {
1267 //alignment starts after the splice event
1268 if (spl_code==INS) left-=spl_len;
1269 else left+=spl_len;
1270
1271 splcigar = cigar;
1272 }
1273 //else {
1274 //alignment ends before the splice event
1275 //nothing to do
1276 // }
1277 //return spl_mismatches;
1278 // }
1279
1280 return spl_mismatches;
1281 }
1282
1283 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1284 BowtieHit& bh,
1285 bool strip_slash,
1286 char* name_out,
1287 char* name_tags,
1288 char* seq,
1289 char* qual)
1290 {
1291 if (!orig_bwt_buf || !*orig_bwt_buf)
1292 return false;
1293
1294 char bwt_buf[2048];
1295 strcpy(bwt_buf, orig_bwt_buf);
1296 // Are we still in the header region?
1297 if (bwt_buf[0] == '@')
1298 return false;
1299
1300 char* buf = bwt_buf;
1301 char* name = get_token((char**)&buf,"\t");
1302 char* sam_flag_str = get_token((char**)&buf,"\t");
1303 char* text_name = get_token((char**)&buf,"\t");
1304 char* text_offset_str = get_token((char**)&buf,"\t");
1305 const char* map_qual_str = get_token((char**)&buf,"\t");
1306 char* cigar_str = get_token((char**)&buf,"\t");
1307 const char* mate_ref_str = get_token((char**)&buf,"\t");
1308 const char* mate_pos_str = get_token((char**)&buf,"\t");
1309 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1310 //int num_mismatches=0;
1311 //int mismatches[1024]; //list of 0-based mismatch positions in this read
1312 //parsed from SAM's MD:Z: tag
1313 const char* seq_str = get_token((char**)&buf,"\t");
1314 if (seq)
1315 strcpy(seq, seq_str);
1316
1317 const char* qual_str = get_token((char**)&buf,"\t");
1318 if (qual)
1319 strcpy(qual, qual_str);
1320
1321 if (!name ||
1322 !sam_flag_str ||
1323 !text_name ||
1324 !text_offset_str ||
1325 !map_qual_str ||
1326 !cigar_str ||
1327 !mate_ref_str ||
1328 !mate_pos_str ||
1329 !inferred_insert_sz_str ||
1330 !seq_str ||
1331 !qual_str)
1332 {
1333 // truncated or malformed SAM record
1334 return false;
1335 }
1336
1337 int sam_flag = atoi(sam_flag_str);
1338 int text_offset = atoi(text_offset_str);
1339 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1340 bool end = true;
1341 unsigned int seg_offset = 0;
1342 unsigned int seg_num = 0;
1343 unsigned int num_segs = 0;
1344
1345 // Copy the tag out of the name field before we might wipe it out
1346 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1347
1348 vector<CigarOp> samcigar;
1349 bool spliced_alignment = false;
1350 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1351
1352 if (refspan==0)
1353 return false;
1354 bool antisense_splice = false;
1355 int sam_nm = 0;
1356 vector<bool> mismatches;
1357 mismatches.resize(strlen(seq_str), false);
1358
1359 int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1360
1361 //##############################################
1362
1363 // Add this alignment to the table of hits for this half of the
1364 // Bowtie map
1365
1366 // Parse the text_name field to recover the splice coords
1367 vector<string> toks;
1368 tokenize_strict(text_name, "|", toks);
1369
1370 int num_extra_toks = (int)toks.size() - 6;
1371
1372 if (num_extra_toks >= 0)
1373 {
1374 static const uint8_t left_window_edge_field = 1;
1375 static const uint8_t splice_field = 2;
1376 //static const uint8_t right_window_edge_field = 3;
1377 static const uint8_t junction_type_field = 4;
1378 static const uint8_t strand_field = 5;
1379
1380 string contig = toks[0];
1381 for (int t = 1; t <= num_extra_toks; ++t)
1382 {
1383 contig += "|";
1384 contig += toks[t];
1385 }
1386
1387 vector<string> splice_toks;
1388 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1389 if (splice_toks.size() != 2)
1390 {
1391 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1392 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1393 // toks[num_extra_toks + splice_field].c_str());
1394 return false;
1395 }
1396
1397 string junction_strand = toks[num_extra_toks + strand_field];
1398 if(junction_strand != "rev" && junction_strand != "fwd"){
1399 fprintf(stderr, "Malformed insertion record\n");
1400 return false;
1401 }
1402
1403 //
1404 // check for an insertion hit
1405 //
1406 if(toks[num_extra_toks + junction_type_field] == "ins")
1407 {
1408 //int8_t spliced_read_len = strlen(seq_str);
1409 //TODO FIXME: use the CIGAR instead of seq length!
1410 // The 0-based position of the left edge of the alignment. Note that this
1411 // value may need to be further corrected to account for the presence of
1412 // of the insertion.
1413 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1414
1415 // The 0-based position of the last genomic sequence before the insertion
1416 int left_splice_pos = atoi(splice_toks[0].c_str());
1417
1418 string insertedSequence = splice_toks[1];
1419 // The 0-based position of the first genomic sequence after the insertion
1420
1421 vector<CigarOp> splcigar;
1422 //this also updates left to the adjusted genomic coordinates
1423 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
1424 left, left_splice_pos+1, insertedSequence.length(), INS);
1425
1426 if (spl_num_mismatches<0) return false;
1427 num_mismatches-=spl_num_mismatches;
1428 /*
1429 uint32_t right_splice_pos = left_splice_pos + 1;
1430
1431 //uint32_t right = left + spliced_read_len - 1;
1432 int right = left + refspan - 1;
1433
1434 if(left > left_splice_pos){
1435 //The genomic position of the left edge of the alignment needs to be corrected
1436 //If the alignment does not extend into the insertion, simply subtract the length
1437 //of the inserted sequence, otherwise, just set it equal to the right edge
1438 left = left - insertedSequence.length();
1439 if(left < right_splice_pos){
1440 left = right_splice_pos;
1441 }
1442 }
1443 if(right > left_splice_pos){
1444 right = right - insertedSequence.length();
1445 if(right < left_splice_pos){
1446 right = left_splice_pos;
1447 }
1448 }
1449 // Now, right and left should be properly transformed into genomic coordinates
1450 // We should be able to deduce how much the alignment matches the insertion
1451 // simply based on the length of the read
1452 int left_match_length = 0;
1453 if(left <= left_splice_pos){
1454 left_match_length = left_splice_pos - left + 1;
1455 }
1456 int right_match_length = 0;
1457 if(right >= right_splice_pos){
1458 right_match_length = right - right_splice_pos + 1;
1459 }
1460 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1461
1462 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1463 return false;
1464
1465 //char* pch = strtok( mismatches, ",");
1466 //unsigned char num_mismatches = 0;
1467 //text_offset holds the left end of the alignment,
1468 //RELATIVE TO the start of the contig
1469
1470 //The 0-based relative position of the left-most character
1471 //before the insertion in the contig
1472 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1473 for (size_t i=0;i<mismatches.size();++i) {
1474 int mismatch_pos = mismatches[i];
1475 // for reversely mapped reads,
1476 //find the correct mismatched position.
1477 if (sam_flag & 0x0010){
1478 mismatch_pos = spliced_read_len - mismatch_pos - 1;
1479 }
1480
1481 //Only count mismatches outside of the insertion region
1482 // If there is a mismatch within the insertion,
1483 // disallow this hit
1484 if(mismatch_pos + text_offset <= relative_splice_pos ||
1485 mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1486 spl_num_mismatches++;
1487 }else{
1488 return false;
1489 }
1490 }
1491 */
1492 //vector<CigarOp> splcigar;
1493 //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1494 //splcigar.push_back(CigarOp(MATCH, left_match_length));
1495 //splcigar.push_back(CigarOp(INS, insertion_match_length));
1496 //splcigar.push_back(CigarOp(MATCH, right_match_length));
1497
1498 bh = create_hit(name,
1499 contig,
1500 "",
1501 left,
1502 //splcigar,
1503 splcigar,
1504 sam_flag & 0x0010,
1505 junction_strand == "rev",
1506 num_mismatches,
1507 0,
1508 end);
1509
1510 return true;
1511 } //"ins"
1512 else //"del" or intron
1513 {
1514 // The 0-based position of the left edge of the alignment.
1515 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1516
1517 // The 0-based position of the last genomic sequence before the deletion
1518 int left_splice_pos = atoi(splice_toks[0].c_str());
1519
1520 int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
1521 /*
1522 if ((sam_flag & 0x0010) == 0) //######
1523 {
1524 if (left_splice_ofs + seg_offset < _anchor_length)
1525 return false;
1526 }
1527 else
1528 {
1529 if (right_splice_ofs + seg_offset < _anchor_length)
1530 return false;
1531 }
1532 */
1533 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1534 //atoi(toks[right_window_edge_field].c_str());
1535
1536 /*
1537 //offset of deletion point, relative to the beginning of the alignment
1538 int left_splice_ofs = left_splice_pos-left+1;
1539
1540 int mismatches_in_anchor = 0;
1541 for (size_t i=0;i<mismatches.size();++i) {
1542 spl_num_mismatches++;
1543 int mismatch_pos = mismatches[i];
1544 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) ||
1545 ((sam_flag & 0x0010) != 0 &&
1546 abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len)
1547 mismatches_in_anchor++;
1548 }
1549 */
1550 vector<CigarOp> splcigar;
1551
1552 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP;
1553
1554 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left,
1555 left_splice_pos+1, gap_len, opcode);
1556
1557 if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
1558 return false;
1559 /*
1560 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1561 if(toks[num_extra_toks + junction_type_field] == "del"){
1562 splcigar.push_back(CigarOp(DEL, gap_len));
1563 }else{
1564 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1565 }
1566 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1567 */
1568 bh = create_hit(name,
1569 contig,
1570 "",
1571 left,
1572 splcigar,
1573 (sam_flag & 0x0010),
1574 junction_strand == "rev",
1575 num_mismatches,
1576 spl_num_mismatches,
1577 end);
1578
1579 return true;
1580 }
1581 } //parse splice data
1582 else
1583 {
1584 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1585 //fprintf(stderr, "%s\n", orig_bwt_buf);
1586 // continue;
1587 return false;
1588 }
1589
1590 return false;
1591 }
1592
1593
1594
1595 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1596 BowtieHit& bh, bool strip_slash,
1597 char* name_out, char* name_tags,
1598 char* seq, char* qual) {
1599 if (_sam_header==NULL)
1600 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1601 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1602
1603 uint32_t sam_flag = hit_buf->core.flag;
1604
1605 int text_offset = hit_buf->core.pos;
1606 int text_mate_pos = hit_buf->core.mpos;
1607 int target_id = hit_buf->core.tid;
1608 int mate_target_id = hit_buf->core.mtid;
1609
1610 vector<CigarOp> cigar;
1611 bool spliced_alignment = false;
1612 int num_hits = 1;
1613
1614 double mapQ = hit_buf->core.qual;
1615 long double error_prob;
1616 if (mapQ > 0)
1617 {
1618 long double p = (-1.0 * mapQ) / 10.0;
1619 error_prob = pow(10.0L, p);
1620 }
1621 else
1622 {
1623 error_prob = 1.0;
1624 }
1625
1626 bool end = true;
1627 unsigned int seg_offset = 0;
1628 unsigned int seg_num = 0;
1629 unsigned int num_segs = 0;
1630 // Copy the tag out of the name field before we might wipe it out
1631 char* qname = bam1_qname(hit_buf);
1632 char* pipe = strrchr(qname, '|');
1633 if (pipe)
1634 {
1635 if (name_tags)
1636 strcpy(name_tags, pipe);
1637
1638 char* tag_buf = pipe + 1;
1639 if (strchr(tag_buf, ':'))
1640 {
1641 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1642 if (seg_num + 1 == num_segs)
1643 end = true;
1644 else
1645 end = false;
1646 }
1647
1648 *pipe = 0;
1649 }
1650
1651 if (target_id < 0) {
1652 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1653 bh = create_hit(qname,
1654 "*", //ref_name
1655 0, //left coord
1656 0, //read_len
1657 false, //antisense_aln
1658 0, //edit_dist
1659 end);
1660 return true;
1661 }
1662
1663 if (seq!=NULL) {
1664 char *bseq = (char*)bam1_seq(hit_buf);
1665 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1666 char v = bam1_seqi(bseq,i);
1667 seq[i]=bam_nt16_rev_table[v];
1668 }
1669 seq[hit_buf->core.l_qseq]=0;
1670 }
1671 if (qual!=NULL) {
1672 char *bq = (char*)bam1_qual(hit_buf);
1673 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1674 qual[i]=bq[i]+33;
1675 }
1676 qual[hit_buf->core.l_qseq]=0;
1677 }
1678
1679 bool antisense_splice=false;
1680 unsigned char num_mismatches = 0;
1681 unsigned char num_splice_anchor_mismatches = 0;
1682
1683 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1684 if (ptr) {
1685 char src_strand_char = bam_aux2A(ptr);
1686 if (src_strand_char == '-')
1687 antisense_splice = true;
1688 }
1689
1690 ptr = bam_aux_get(hit_buf, "NM");
1691 if (ptr) {
1692 num_mismatches = bam_aux2i(ptr);
1693 }
1694
1695 ptr = bam_aux_get(hit_buf, "NH");
1696 if (ptr) {
1697 num_hits = bam_aux2i(ptr);
1698 }
1699
1700 string text_name = _sam_header->target_name[target_id];
1701 string text_name2 = "";
1702
1703 bool fusion_alignment = false;
1704 string fusion_cigar_str;
1705 ptr = bam_aux_get(hit_buf, "XF");
1706 if (ptr) {
1707 fusion_alignment = true;
1708 char* xf = bam_aux2Z(ptr);
1709
1710 // ignore the second part of a fusion alignment
1711 if (xf[0] == '2')
1712 return false;
1713
1714 vector<string> fields;
1715 tokenize(xf, " ", fields);
1716
1717 vector<string> contigs;
1718 tokenize(fields[1], "-", contigs);
1719 if (contigs.size() >= 2)
1720 {
1721 text_name = contigs[0];
1722 text_name2 = contigs[1];
1723 }
1724
1725 text_offset = atoi(fields[2].c_str()) - 1;
1726 fusion_cigar_str = fields[3].c_str();
1727
1728 if (seq)
1729 strcpy(seq, fields[4].c_str());
1730 if (qual)
1731 strcpy(qual, fields[5].c_str());
1732 }
1733
1734 if (fusion_alignment) {
1735 const char* p_cig = fusion_cigar_str.c_str();
1736 while (*p_cig) {
1737 char* t;
1738 int length = (int)strtol(p_cig, &t, 10);
1739 if (length <= 0)
1740 {
1741 //fprintf (stderr, "CIGAR op has zero length\n");
1742 return false;
1743 }
1744 char op_char = *t;
1745 CigarOpCode opcode;
1746 if (op_char == 'M') opcode = MATCH;
1747 else if(op_char == 'm') opcode = mATCH;
1748 else if (op_char == 'I') opcode = INS;
1749 else if (op_char == 'i') opcode = iNS;
1750 else if (op_char == 'D') opcode = DEL;
1751 else if (op_char == 'd') opcode = dEL;
1752 else if (op_char == 'N' || op_char == 'n')
1753 {
1754 if (length > max_report_intron_length)
1755 return false;
1756
1757 if (op_char == 'N')
1758 opcode = REF_SKIP;
1759 else
1760 opcode = rEF_SKIP;
1761 spliced_alignment = true;
1762 }
1763 else if (op_char == 'F')
1764 {
1765 opcode = FUSION_FF;
1766 length = length - 1;
1767 }
1768 else if (op_char == 'S') opcode = SOFT_CLIP;
1769 else if (op_char == 'H') opcode = HARD_CLIP;
1770 else if (op_char == 'P') opcode = PAD;
1771 else
1772 {
1773 fprintf (stderr, "(%d-%d) invalid CIGAR operation\n", length, (int)op_char);
1774 return false;
1775 }
1776 p_cig = t + 1;
1777 cigar.push_back(CigarOp(opcode, length));
1778
1779 if(opcode == INS) {
1780 num_mismatches -= length;
1781 }
1782 else if(opcode == DEL) {
1783 num_mismatches -= length;
1784 }
1785
1786 /*
1787 * update fusion direction.
1788 */
1789 size_t cigar_size = cigar.size();
1790 if (cigar_size >= 3 && cigar[cigar_size - 2].opcode == FUSION_FF)
1791 {
1792 CigarOpCode prev = cigar[cigar_size - 3].opcode;
1793 CigarOpCode next = cigar[cigar_size - 1].opcode;
1794
1795 bool increase1 = false, increase2 = false;
1796 if (prev == MATCH || prev == DEL || prev == INS || prev == REF_SKIP)
1797 increase1 = true;
1798 if (next == MATCH || next == DEL || next == INS || next == REF_SKIP)
1799 increase2 = true;
1800
1801 if (increase1 && !increase2)
1802 cigar[cigar_size - 2].opcode = FUSION_FR;
1803 else if (!increase1 && increase2)
1804 cigar[cigar_size - 2].opcode = FUSION_RF;
1805 else if (!increase1 && !increase2)
1806 cigar[cigar_size - 2].opcode = FUSION_RR;
1807 }
1808 }
1809 }
1810 else {
1811 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1812 {
1813 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1814 if (length <= 0)
1815 {
1816 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1817 return false;
1818 }
1819
1820 CigarOpCode opcode;
1821 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1822 {
1823 case BAM_CMATCH: opcode = MATCH; break;
1824 case BAM_CINS: opcode = INS; break;
1825 case BAM_CDEL: opcode = DEL; break;
1826 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1827 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1828 case BAM_CPAD: opcode = PAD; break;
1829 case BAM_CREF_SKIP:
1830 opcode = REF_SKIP;
1831 spliced_alignment = true;
1832 if (length > (int)max_report_intron_length)
1833 {
1834 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1835 return false;
1836 }
1837 break;
1838 default:
1839 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1840 return false;
1841 }
1842
1843 if (opcode != HARD_CLIP)
1844 cigar.push_back(CigarOp(opcode, length));
1845
1846 /*
1847 * By convention,the NM field of the SAM record
1848 * counts an insertion or deletion. I dont' think
1849 * we want the mismatch count in the BowtieHit
1850 * record to reflect this. Therefore, subtract out
1851 * the mismatches due to in/dels
1852 */
1853 if(opcode == INS){
1854 num_mismatches -= length;
1855 }
1856 else if(opcode == DEL){
1857 num_mismatches -= length;
1858 }
1859 }
1860 }
1861
1862
1863 string mrnm;
1864 if (mate_target_id >= 0) {
1865 if (mate_target_id == target_id) {
1866 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1867 mrnm = _sam_header->target_name[mate_target_id];
1868 }
1869 else {
1870 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1871 return false;
1872 }
1873 }
1874 else {
1875 text_mate_pos = 0;
1876 }
1877
1878 if (spliced_alignment) {
1879 bh = create_hit(qname,
1880 text_name,
1881 text_name2,
1882 text_offset, // BAM files are 0-indexed
1883 cigar,
1884 sam_flag & 0x0010,
1885 antisense_splice,
1886 num_mismatches,
1887 num_splice_anchor_mismatches,
1888 end);
1889
1890 }
1891 else {
1892 bh = create_hit(qname,
1893 text_name,
1894 text_name2,
1895 text_offset, // BAM files are 0-indexed
1896 cigar,
1897 sam_flag & 0x0010,
1898 false,
1899 num_mismatches,
1900 0,
1901 end);
1902 }
1903
1904 return true;
1905 }
1906
1907 bool BAMHitFactory::inspect_header(HitStream& hs)
1908 {
1909 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1910
1911 if (header == NULL) {
1912 fprintf(stderr, "Warning: No BAM header\n");
1913 return false;
1914 }
1915 if (header->l_text == 0) {
1916 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1917 return false;
1918 }
1919 return true;
1920 }
1921
1922 bool SplicedBAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1923 BowtieHit& bh, bool strip_slash,
1924 char* name_out, char* name_tags,
1925 char* seq, char* qual) {
1926 if (_sam_header==NULL)
1927 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1928
1929 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1930 uint32_t sam_flag = hit_buf->core.flag;
1931
1932 int text_offset = hit_buf->core.pos;
1933 int text_mate_pos = hit_buf->core.mpos;
1934 int target_id = hit_buf->core.tid;
1935 int mate_target_id = hit_buf->core.mtid;
1936
1937 vector<CigarOp> samcigar;
1938 bool spliced_alignment = false;
1939 int num_hits = 1;
1940
1941 double mapQ = hit_buf->core.qual;
1942 long double error_prob;
1943 if (mapQ > 0)
1944 {
1945 long double p = (-1.0 * mapQ) / 10.0;
1946 error_prob = pow(10.0L, p);
1947 }
1948 else
1949 {
1950 error_prob = 1.0;
1951 }
1952
1953 if (seq!=NULL) {
1954 char *bseq = (char*)bam1_seq(hit_buf);
1955 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1956 char v = bam1_seqi(bseq,i);
1957 seq[i]=bam_nt16_rev_table[v];
1958 }
1959 seq[hit_buf->core.l_qseq]=0;
1960 }
1961 if (qual!=NULL) {
1962 char *bq = (char*)bam1_qual(hit_buf);
1963 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1964 qual[i]=bq[i]+33;
1965 }
1966 qual[hit_buf->core.l_qseq]=0;
1967 }
1968
1969 bool end = true;
1970 unsigned int seg_offset = 0;
1971 unsigned int seg_num = 0;
1972 unsigned int num_segs = 0;
1973 // Copy the tag out of the name field before we might wipe it out
1974 char* name = bam1_qname(hit_buf);
1975 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1976
1977 if (target_id < 0) {
1978 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1979 bh = create_hit(name,
1980 "*", //ref_name
1981 0, //left coord
1982 0, //read_len
1983 false, //antisense_aln
1984 0, //edit_dist
1985 end);
1986 return true;
1987 }
1988
1989 string text_name = _sam_header->target_name[target_id];
1990 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1991 {
1992 //char* t;
1993
1994 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1995 if (length <= 0)
1996 {
1997 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1998 return false;
1999 }
2000
2001 CigarOpCode opcode;
2002 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
2003 {
2004 case BAM_CMATCH: opcode = MATCH; break;
2005 case BAM_CINS: opcode = INS; break;
2006 case BAM_CDEL: opcode = DEL; break;
2007 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
2008 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
2009 case BAM_CPAD: opcode = PAD; break;
2010 case BAM_CREF_SKIP:
2011 opcode = REF_SKIP;
2012 spliced_alignment = true;
2013 if (length > (int)max_report_intron_length)
2014 {
2015 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
2016 return false;
2017 }
2018 break;
2019 default:
2020 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
2021 return false;
2022 }
2023 if (opcode != HARD_CLIP)
2024 samcigar.push_back(CigarOp(opcode, length));
2025 }
2026
2027 string mrnm;
2028 if (mate_target_id >= 0) {
2029 if (mate_target_id == target_id) {
2030 mrnm = _sam_header->target_name[mate_target_id];
2031 }
2032 else {
2033 return false;
2034 }
2035 }
2036 else {
2037 text_mate_pos = 0;
2038 }
2039
2040 bool antisense_splice = false;
2041 int sam_nm = 0;
2042 vector<bool> mismatches;
2043 mismatches.resize(strlen(seq), false);
2044 int num_mismatches=getBAMmismatches(hit_buf, samcigar, mismatches, sam_nm, antisense_splice);
2045 unsigned char num_splice_anchor_mismatches = 0;
2046
2047 //##############################################
2048
2049 // Add this alignment to the table of hits for this half of the
2050 // Bowtie map
2051
2052 // Parse the text_name field to recover the splice coords
2053 vector<string> toks;
2054 tokenize_strict(text_name.c_str(), "|", toks);
2055 int num_extra_toks = (int)toks.size() - 6;
2056 if (num_extra_toks >= 0)
2057 {
2058 static const uint8_t left_window_edge_field = 1;
2059 static const uint8_t splice_field = 2;
2060 //static const uint8_t right_window_edge_field = 3;
2061 static const uint8_t junction_type_field = 4;
2062 static const uint8_t strand_field = 5;
2063
2064 string contig = toks[0];
2065 for (int t = 1; t <= num_extra_toks; ++t)
2066 {
2067 contig += "|";
2068 contig += toks[t];
2069 }
2070
2071 vector<string> splice_toks;
2072 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
2073 if (splice_toks.size() != 2)
2074 {
2075 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
2076 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
2077 // toks[num_extra_toks + splice_field].c_str());
2078 return false;
2079 }
2080
2081 const string& junction_type = toks[num_extra_toks + junction_type_field];
2082 const string junction_strand = toks[num_extra_toks + strand_field];
2083
2084 //
2085 // check for an insertion hit
2086 //
2087 if(junction_type == "ins")
2088 {
2089 //int8_t spliced_read_len = strlen(seq_str);
2090 //TODO FIXME: use the CIGAR instead of seq length!
2091 // The 0-based position of the left edge of the alignment. Note that this
2092 // value may need to be further corrected to account for the presence of
2093 // of the insertion.
2094 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
2095
2096 // The 0-based position of the last genomic sequence before the insertion
2097 int left_splice_pos = atoi(splice_toks[0].c_str());
2098
2099 string insertedSequence = splice_toks[1];
2100 // The 0-based position of the first genomic sequence after the insertion
2101
2102 vector<CigarOp> splcigar;
2103 //this also updates left to the adjusted genomic coordinates
2104 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
2105 left, left_splice_pos+1, insertedSequence.length(), INS);
2106
2107 if (spl_num_mismatches<0) return false;
2108 num_mismatches-=spl_num_mismatches;
2109 bh = create_hit(name,
2110 contig,
2111 "",
2112 left,
2113 splcigar,
2114 sam_flag & 0x0010,
2115 junction_strand == "rev",
2116 num_mismatches,
2117 0,
2118 end);
2119
2120 return true;
2121 } //"ins"
2122 else //"del", "intron", or "fusion"
2123 {
2124 char orientation = (sam_flag & 0x0010 ? '-' : '+');
2125 if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")||
2126 !(orientation == '-' || orientation == '+'))
2127 {
2128 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
2129 return false;
2130 }
2131
2132 // The 0-based position of the left edge of the alignment.
2133 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str());
2134 if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
2135 left += text_offset;
2136 else
2137 left -= text_offset;
2138
2139 vector<CigarOp> splcigar;
2140 CigarOpCode opcode;
2141 if(junction_type == "del")
2142 opcode = DEL;
2143 else if(junction_type == "fus")
2144 {
2145 if (junction_strand == "ff")
2146 opcode = FUSION_FF;
2147 else if (junction_strand == "fr")
2148 opcode = FUSION_FR;
2149 else if (junction_strand == "rf")
2150 opcode = FUSION_RF;
2151 else
2152 opcode = FUSION_RR;
2153 }
2154 else
2155 opcode = REF_SKIP;
2156
2157 int left_splice_pos = atoi(splice_toks[0].c_str());
2158
2159 // The 0-based position of the last genomic sequence before the deletion
2160 int gap_len = 0;
2161 if (junction_type == "fus")
2162 gap_len = atoi(splice_toks[1].c_str());
2163 else
2164 gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
2165
2166 if (opcode == FUSION_RF || opcode == FUSION_RR)
2167 left_splice_pos -= 1;
2168 else
2169 left_splice_pos += 1;
2170
2171 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left,
2172 left_splice_pos, gap_len, opcode);
2173
2174 if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
2175 return false;
2176
2177 string contig2 = "";
2178 if (junction_type == "fus")
2179 {
2180 vector<string> contigs;
2181 tokenize(contig, "-", contigs);
2182 if (contigs.size() != 2)
2183 return false;
2184
2185 contig = contigs[0];
2186 contig2 = contigs[1];
2187
2188 if (junction_strand == "rf" || junction_strand == "rr")
2189 orientation = (orientation == '+' ? '-' : '+');
2190 }
2191
2192 bh = create_hit(name,
2193 contig,
2194 contig2,
2195 left,
2196 splcigar,
2197 orientation == '-',
2198 junction_strand == "rev",
2199 num_mismatches,
2200 spl_num_mismatches,
2201 end);
2202
2203 return true;
2204 }
2205 } //parse splice data
2206 else
2207 {
2208 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
2209 //fprintf(stderr, "%s\n", orig_bwt_buf);
2210 // continue;
2211 return false;
2212 }
2213
2214 return false;
2215 }
2216
2217 void get_mapped_reads(FILE* bwtf,
2218 HitTable& hits,
2219 HitFactory& hit_factory,
2220 bool strip_slash,
2221 bool verbose)
2222 {
2223
2224
2225 char bwt_buf[2048];
2226 uint32_t reads_extracted = 0;
2227
2228 while (fgets(bwt_buf, 2048, bwtf))
2229 {
2230 // Chomp the newline
2231 char* nl = strrchr(bwt_buf, '\n');
2232 if (nl) *nl = 0;
2233 if (*bwt_buf == 0)
2234 continue;
2235 // Get a new record from the tab-delimited Bowtie map
2236 BowtieHit bh;
2237 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
2238 {
2239 // Only check uniqueness if these hits are spliced
2240 hits.add_hit(bh, true);
2241 }
2242 reads_extracted++;
2243 }
2244
2245 // This will sort the map by insert id.
2246 hits.finalize();
2247 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
2248 }
2249
2250
2251 /*
2252 AlignStatus status(const BowtieHit* align)
2253 {
2254 if (!align)
2255 return UNALIGNED;
2256 if (align->contiguous())
2257 return CONTIGUOUS;
2258 return SPLICED;
2259 }
2260 */
2261
2262 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
2263 {
2264 int max_hit_pos = -1;
2265 for (size_t i = 0; i < hits.size(); ++i)
2266 {
2267 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
2268 }
2269
2270 if ((int)DoC.size() < max_hit_pos)
2271 DoC.resize(max_hit_pos);
2272
2273 for (size_t i = 0; i < hits.size(); ++i)
2274 {
2275 const BowtieHit& bh = hits[i];
2276
2277 // split up the coverage contibution for this reads
2278 size_t j = bh.left();
2279 const vector<CigarOp>& cigar = bh.cigar();
2280
2281 for (size_t c = 0 ; c < cigar.size(); ++c)
2282 {
2283 switch(cigar[c].opcode)
2284 {
2285 case MATCH:
2286 for (size_t m = 0; m < cigar[c].length; ++m)
2287 {
2288 if (DoC[j + m] < 0xFFFF)
2289 DoC[j + m]++;
2290 }
2291 //fall through this case to REF_SKIP is intentional
2292 case REF_SKIP:
2293 j += cigar[c].length;
2294 break;
2295 default:
2296 break;
2297 }
2298
2299 }
2300 }
2301 }
2302
2303 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
2304 {
2305 if ((int)DoC.size() < bh.right())
2306 DoC.resize(bh.right());
2307
2308 // split up the coverage contibution for this reads
2309 size_t j = bh.left();
2310 const vector<CigarOp>& cigar = bh.cigar();
2311
2312 for (size_t c = 0 ; c < cigar.size(); ++c)
2313 {
2314 switch(cigar[c].opcode)
2315 {
2316 case MATCH:
2317 for (size_t m = 0; m < cigar[c].length; ++m)
2318 {
2319 if (DoC[j + m] < VMAXINT32)
2320 DoC[j + m]++;
2321 }
2322 //fall through this case to REF_SKIP is intentional
2323 case REF_SKIP:
2324 j += cigar[c].length;
2325 break;
2326 default:
2327 break;
2328 }
2329
2330 }
2331 }
2332
2333 void print_hit(FILE* fout,
2334 const char* read_name,
2335 const BowtieHit& bh,
2336 const char* ref_name,
2337 const char* ref_name2,
2338 const char* sequence,
2339 const char* qualities)
2340 {
2341 string seq;
2342 string quals;
2343 if (sequence)
2344 {
2345 seq = sequence;
2346 quals = qualities;
2347 seq.resize(bh.read_len());
2348 quals.resize(bh.read_len());
2349 }
2350 else
2351 {
2352 seq = "*";
2353 }
2354
2355 if (qualities)
2356 {
2357 quals = qualities;
2358 quals.resize(bh.read_len());
2359 }
2360 else
2361 {
2362 quals = "*";
2363 }
2364
2365 uint32_t sam_flag = 0;
2366 if (bh.antisense_align())
2367 {
2368 sam_flag |= 0x0010; // BAM_FREVERSE
2369 }
2370
2371 int left = bh.left();
2372
2373 uint32_t map_quality = 255;
2374 char cigar[256] = {0};
2375 string mate_ref_name = "*";
2376 uint32_t mate_pos = 0;
2377 uint32_t insert_size = 0;
2378
2379 const vector<CigarOp>& bh_cigar = bh.cigar();
2380
2381 /*
2382 * In addition to calculating the cigar string,
2383 * we need to figure out how many in/dels are in the
2384 * sequence, so that we can give the correct
2385 * value for the NM tag
2386 */
2387 int indel_distance = 0;
2388 CigarOpCode fusion_dir = FUSION_NOTHING;
2389 for (size_t c = 0; c < bh_cigar.size(); ++c)
2390 {
2391 const CigarOp& op = bh_cigar[c];
2392
2393 char ibuf[64];
2394 sprintf(ibuf, "%d", op.length);
2395 switch(op.opcode)
2396 {
2397 case MATCH:
2398 case mATCH:
2399 strcat(cigar, ibuf);
2400 if (bh_cigar[c].opcode == MATCH)
2401 strcat(cigar, "M");
2402 else
2403 strcat(cigar, "m");
2404 break;
2405 case INS:
2406 case iNS:
2407 strcat(cigar, ibuf);
2408 if (bh_cigar[c].opcode == INS)
2409 strcat(cigar, "I");
2410 else
2411 strcat(cigar, "i");
2412 indel_distance += bh_cigar[c].length;
2413 break;
2414 case DEL:
2415 case dEL:
2416 strcat(cigar, ibuf);
2417 if (bh_cigar[c].opcode == DEL)
2418 strcat(cigar, "D");
2419 else
2420 strcat(cigar, "d");
2421 indel_distance += bh_cigar[c].length;
2422 break;
2423 case REF_SKIP:
2424 case rEF_SKIP:
2425 strcat(cigar, ibuf);
2426 if (bh_cigar[c].opcode == REF_SKIP)
2427 strcat(cigar, "N");
2428 else
2429 strcat(cigar, "n");
2430 break;
2431 case FUSION_FF:
2432 case FUSION_FR:
2433 case FUSION_RF:
2434 case FUSION_RR:
2435 fusion_dir = op.opcode;
2436 sprintf(ibuf, "%d", bh_cigar[c].length + 1);
2437 strcat(cigar, ibuf);
2438 strcat(cigar, "F");
2439 break;
2440 default:
2441 break;
2442 }
2443 }
2444
2445 char cigar1[256] = {0}, cigar2[256] = {0};
2446 string left_seq, right_seq, left_qual, right_qual;
2447 int left1 = -1, left2 = -1;
2448 extract_partial_hits(bh, seq, quals,
2449 cigar1, cigar2, left_seq, right_seq,
2450 left_qual, right_qual, left1, left2);
2451
2452
2453 bool containsSplice = false;
2454 for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
2455 {
2456 if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP)
2457 {
2458 containsSplice = true;
2459 break;
2460 }
2461 }
2462
2463 if (fusion_dir != FUSION_NOTHING)
2464 {
2465 fprintf(fout,
2466 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t",
2467 read_name,
2468 sam_flag,
2469 ref_name,
2470 left1 + 1,
2471 map_quality,
2472 cigar1,
2473 mate_ref_name.c_str(),
2474 mate_pos,
2475 insert_size,
2476 left_seq.c_str(),
2477 left_qual.c_str(),
2478 bh.edit_dist() + indel_distance);
2479
2480 if (containsSplice)
2481 {
2482 fprintf(fout,
2483 "XS:A:%c\t",
2484 bh.antisense_splice() ? '-' : '+');
2485 }
2486
2487 fprintf(fout,
2488 "FR:Z:1 %s-%s %d %s %s %s\n",
2489 ref_name,
2490 ref_name2,
2491 left + 1,
2492 cigar,
2493 seq.c_str(),
2494 quals.c_str());
2495
2496 fprintf(fout,
2497 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t",
2498 read_name,
2499 sam_flag,
2500 ref_name2,
2501 left2 + 1,
2502 map_quality,
2503 cigar2,
2504 mate_ref_name.c_str(),
2505 mate_pos,
2506 insert_size,
2507 right_seq.c_str(),
2508 right_qual.c_str(),
2509 bh.edit_dist() + indel_distance);
2510
2511 if (containsSplice)
2512 {
2513 fprintf(fout,
2514 "XS:A:%c\t",
2515 bh.antisense_splice() ? '-' : '+');
2516 }
2517
2518 fprintf(fout,
2519 "FR:Z:2 %s-%s %d %s %s %s\n",
2520 ref_name,
2521 ref_name2,
2522 left + 1,
2523 cigar,
2524 seq.c_str(),
2525 quals.c_str());
2526 }
2527 else
2528 {
2529 fprintf(fout,
2530 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t",
2531 read_name,
2532 sam_flag,
2533 ref_name,
2534 left + 1,
2535 map_quality,
2536 cigar,
2537 mate_ref_name.c_str(),
2538 mate_pos,
2539 insert_size,
2540 seq.c_str(),
2541 quals.c_str(),
2542 bh.edit_dist() + indel_distance);
2543
2544 if (containsSplice)
2545 {
2546 fprintf(fout,
2547 "XS:A:%c\t",
2548 bh.antisense_splice() ? '-' : '+');
2549 }
2550
2551 fprintf(fout, "\n");
2552 }
2553 }
2554
2555 void print_bamhit(GBamWriter& wbam,
2556 const char* read_name,
2557 const BowtieHit& bh,
2558 const char* ref_name,
2559 const char* ref_name2,
2560 const char* sequence,
2561 const char* qualities,
2562 bool from_bowtie)
2563 {
2564 string seq;
2565 string quals;
2566 if (sequence) {
2567 seq = sequence;
2568 quals = qualities;
2569 seq.resize(bh.read_len());
2570 quals.resize(bh.read_len());
2571 }
2572 else {
2573 seq = "*";
2574 }
2575 if (qualities) {
2576 quals = qualities;
2577 quals.resize(bh.read_len());
2578 }
2579 else {
2580 quals = "*";
2581 }
2582
2583 uint32_t sam_flag = 0;
2584 if (bh.antisense_align())
2585 {
2586 sam_flag |= 0x0010; // BAM_FREVERSE
2587 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
2588 {
2589 reverse_complement(seq);
2590 reverse(quals.begin(), quals.end());
2591 }
2592 }
2593
2594 uint32_t sam_pos = bh.left() + 1;
2595 uint32_t map_quality = 255;
2596 char cigar[256];
2597 cigar[0] = 0;
2598 string mate_ref_name = "*";
2599 uint32_t mate_pos = 0;
2600 uint32_t insert_size = 0;
2601 //string qualities = "*";
2602
2603 const vector<CigarOp>& bh_cigar = bh.cigar();
2604 /*
2605 * In addition to calculating the cigar string,
2606 * we need to figure out how many in/dels are in the
2607 * sequence, so that we can give the correct
2608 * value for the NM tag
2609 */
2610 int indel_distance = 0;
2611 CigarOpCode fusion_dir = FUSION_NOTHING;
2612 for (size_t c = 0; c < bh_cigar.size(); ++c)
2613 {
2614 const CigarOp& op = bh_cigar[c];
2615
2616 char ibuf[64];
2617 sprintf(ibuf, "%d", op.length);
2618 switch(op.opcode)
2619 {
2620 case MATCH:
2621 case mATCH:
2622 strcat(cigar, ibuf);
2623 if (bh_cigar[c].opcode == MATCH)
2624 strcat(cigar, "M");
2625 else
2626 strcat(cigar, "m");
2627 break;
2628 case INS:
2629 case iNS:
2630 strcat(cigar, ibuf);
2631 if (bh_cigar[c].opcode == INS)
2632 strcat(cigar, "I");
2633 else
2634 strcat(cigar, "i");
2635 indel_distance += bh_cigar[c].length;
2636 break;
2637 case DEL:
2638 case dEL:
2639 strcat(cigar, ibuf);
2640 if (bh_cigar[c].opcode == DEL)
2641 strcat(cigar, "D");
2642 else
2643 strcat(cigar, "d");
2644 indel_distance += bh_cigar[c].length;
2645 break;
2646 case REF_SKIP:
2647 case rEF_SKIP:
2648 strcat(cigar, ibuf);
2649 if (bh_cigar[c].opcode == REF_SKIP)
2650 strcat(cigar, "N");
2651 else
2652 strcat(cigar, "n");
2653 break;
2654 case FUSION_FF:
2655 case FUSION_FR:
2656 case FUSION_RF:
2657 case FUSION_RR:
2658 fusion_dir = op.opcode;
2659 sprintf(ibuf, "%d", bh_cigar[c].length + 1);
2660 strcat(cigar, ibuf);
2661 strcat(cigar, "F");
2662 break;
2663 default:
2664 break;
2665 }
2666 }
2667
2668 char cigar1[256] = {0}, cigar2[256] = {0};
2669 string left_seq, right_seq, left_qual, right_qual;
2670 int left1 = -1, left2 = -1;
2671 extract_partial_hits(bh, seq, quals,
2672 cigar1, cigar2, left_seq, right_seq,
2673 left_qual, right_qual, left1, left2);
2674
2675 bool containsSplice = false;
2676 for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
2677 {
2678 if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP)
2679 {
2680 containsSplice = true;
2681 break;
2682 }
2683 }
2684
2685 vector<string> auxdata;
2686 if (!sam_readgroup_id.empty())
2687 {
2688 string nm("RG:Z:");
2689 nm += sam_readgroup_id;
2690 auxdata.push_back(nm);
2691 }
2692
2693 string nm("NM:i:");
2694 str_appendInt(nm, bh.edit_dist() + indel_distance);
2695 auxdata.push_back(nm);
2696
2697 if (containsSplice) {
2698 nm="XS:A:";
2699 nm+=(char)(bh.antisense_splice() ? '-' : '+');
2700 auxdata.push_back(nm);
2701 }
2702
2703 if (fusion_dir != FUSION_NOTHING)
2704 {
2705 char XF[2048] = {0};
2706 sprintf(XF,
2707 "XF:Z:1 %s-%s %u %s %s %s",
2708 ref_name,
2709 ref_name2,
2710 sam_pos,
2711 cigar,
2712 seq.c_str(),
2713 quals.c_str());
2714 auxdata.push_back(XF);
2715
2716 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, left1 + 1, map_quality,
2717 cigar1, mate_ref_name.c_str(), mate_pos,
2718 insert_size, left_seq.c_str(), left_qual.c_str(), &auxdata);
2719
2720 wbam.write(brec);
2721 delete brec;
2722
2723 sprintf(XF,
2724 "XF:Z:2 %s-%s %u %s %s %s",
2725 ref_name,
2726 ref_name2,
2727 sam_pos,
2728 cigar,
2729 seq.c_str(),
2730 quals.c_str());
2731 auxdata.back() = XF;
2732
2733 brec = wbam.new_record(read_name, sam_flag, ref_name, left2 + 1, map_quality,
2734 cigar2, mate_ref_name.c_str(), mate_pos,
2735 insert_size, right_seq.c_str(), right_qual.c_str(), &auxdata);
2736
2737 wbam.write(brec);
2738 delete brec;
2739 }
2740 else
2741 {
2742 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
2743 cigar, mate_ref_name.c_str(), mate_pos,
2744 insert_size, seq.c_str(), quals.c_str(), &auxdata);
2745
2746 wbam.write(brec);
2747 delete brec;
2748 }
2749 }
2750
2751 /**
2752 * Print a vector of cigar operations to a file.
2753 * @param bh_cigar A vector of CigarOps
2754 * @return a string representation of the cigar string
2755 */
2756 std::string print_cigar(const vector<CigarOp>& bh_cigar){
2757 char cigar[256];
2758 cigar[0] = 0;
2759 for (size_t c = 0; c < bh_cigar.size(); ++c)
2760 {
2761 char ibuf[64];
2762 sprintf(ibuf, "%d", bh_cigar[c].length);
2763 strcat(cigar, ibuf);
2764 switch(bh_cigar[c].opcode)
2765 {
2766 case MATCH:
2767 strcat(cigar, "M");
2768 break;
2769 case mATCH:
2770 strcat(cigar, "m");
2771 break;
2772 case INS:
2773 strcat(cigar, "I");
2774 break;
2775 case iNS:
2776 strcat(cigar, "i");
2777 break;
2778 case DEL:
2779 strcat(cigar, "D");
2780 break;
2781 case dEL:
2782 strcat(cigar, "d");
2783 break;
2784 case REF_SKIP:
2785 strcat(cigar, "N");
2786 break;
2787 case rEF_SKIP:
2788 strcat(cigar, "n");
2789 break;
2790 case FUSION_FF:
2791 case FUSION_FR:
2792 case FUSION_RF:
2793 case FUSION_RR:
2794 strcat(cigar, "F");
2795 break;
2796 default:
2797 break;
2798 }
2799 }
2800 string result(cigar);
2801 return result;
2802 }
2803
2804 void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual,
2805 char* cigar1, char* cigar2, string& seq1, string& seq2,
2806 string& qual1, string& qual2, int& left1, int& left2)
2807 {
2808 const int left = bh.left();
2809 int right = left;
2810 int fusion_left = -1, fusion_right = -1;
2811
2812 const vector<CigarOp>& bh_cigar = bh.cigar();
2813
2814 CigarOpCode fusion_dir = FUSION_NOTHING;
2815 size_t fusion_idx = 0;
2816 size_t left_part_len = 0;
2817 for (size_t c = 0; c < bh_cigar.size(); ++c)
2818 {
2819 const CigarOp& op = bh_cigar[c];
2820 switch(op.opcode)
2821 {
2822 case MATCH:
2823 case REF_SKIP:
2824 case DEL:
2825 right += op.length;
2826 break;
2827 case mATCH:
2828 case rEF_SKIP:
2829 case dEL:
2830 right -= op.length;
2831 break;
2832 case FUSION_FF:
2833 case FUSION_FR:
2834 case FUSION_RF:
2835 case FUSION_RR:
2836 {
2837 fusion_dir = op.opcode;
2838 fusion_idx = c;
2839 if (op.opcode == FUSION_FF || op.opcode == FUSION_FR)
2840 fusion_left = right - 1;
2841 else
2842 fusion_left = right + 1;
2843 fusion_right = right = op.length;
2844 }
2845 break;
2846 default:
2847 break;
2848 }
2849
2850 if (fusion_dir == FUSION_NOTHING)
2851 {
2852 if (op.opcode == MATCH || op.opcode == mATCH || op.opcode == INS || op.opcode == iNS)
2853 {
2854 left_part_len += op.length;
2855 }
2856 }
2857 }
2858
2859 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_FR)
2860 {
2861 for (size_t c = 0; c < fusion_idx; ++c)
2862 {
2863 const CigarOp& op = bh_cigar[c];
2864 char ibuf[64];
2865 sprintf(ibuf, "%d", op.length);
2866 strcat(cigar1, ibuf);
2867
2868 switch (op.opcode)
2869 {
2870 case MATCH:
2871 strcat(cigar1, "M");
2872 break;
2873 case INS:
2874 strcat(cigar1, "I");
2875 break;
2876 case DEL:
2877 strcat(cigar1, "D");
2878 break;
2879 case REF_SKIP:
2880 strcat(cigar1, "N");
2881 break;
2882 default:
2883 assert (0);
2884 break;
2885 }
2886 }
2887 }
2888 else if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2889 {
2890 assert (fusion_idx > 0);
2891 for (int c = fusion_idx - 1; c >=0; --c)
2892 {
2893 const CigarOp& op = bh_cigar[c];
2894 char ibuf[64];
2895 sprintf(ibuf, "%d", op.length);
2896 strcat(cigar1, ibuf);
2897
2898 switch (op.opcode)
2899 {
2900 case mATCH:
2901 strcat(cigar1, "M");
2902 break;
2903 case iNS:
2904 strcat(cigar1, "I");
2905 break;
2906 case dEL:
2907 strcat(cigar1, "D");
2908 break;
2909 case rEF_SKIP:
2910 strcat(cigar1, "N");
2911 break;
2912 default:
2913 assert (0);
2914 break;
2915 }
2916 }
2917 }
2918
2919 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RF)
2920 {
2921 for (size_t c = fusion_idx + 1; c < bh_cigar.size(); ++c)
2922 {
2923 const CigarOp& op = bh_cigar[c];
2924 char ibuf[64];
2925 sprintf(ibuf, "%d", op.length);
2926 strcat(cigar2, ibuf);
2927
2928 switch (op.opcode)
2929 {
2930 case MATCH:
2931 strcat(cigar2, "M");
2932 break;
2933 case INS:
2934 strcat(cigar2, "I");
2935 break;
2936 case DEL:
2937 strcat(cigar2, "D");
2938 break;
2939 case REF_SKIP:
2940 strcat(cigar2, "N");
2941 break;
2942 default:
2943 assert (0);
2944 break;
2945 }
2946 }
2947 }
2948 else if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR)
2949 {
2950 assert (bh_cigar.size() > 0);
2951 for (size_t c = bh_cigar.size() - 1; c > fusion_idx; --c)
2952 {
2953 const CigarOp& op = bh_cigar[c];
2954 char ibuf[64];
2955 sprintf(ibuf, "%d", op.length);
2956 strcat(cigar2, ibuf);
2957
2958 switch (op.opcode)
2959 {
2960 case mATCH:
2961 strcat(cigar2, "M");
2962 break;
2963 case iNS:
2964 strcat(cigar2, "I");
2965 break;
2966 case dEL:
2967 strcat(cigar2, "D");
2968 break;
2969 case rEF_SKIP:
2970 strcat(cigar2, "N");
2971 break;
2972 default:
2973 assert (0);
2974 break;
2975 }
2976 }
2977 }
2978
2979 if (fusion_dir != FUSION_NOTHING)
2980 {
2981 seq1 = seq.substr(0, left_part_len);
2982 qual1 = qual.substr(0, left_part_len);
2983
2984 if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2985 {
2986 reverse_complement(seq1);
2987 reverse(qual1.begin(), qual1.end());
2988 }
2989
2990 seq2 = seq.substr(left_part_len);
2991 qual2 = qual.substr(left_part_len);
2992
2993 if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR)
2994 {
2995 reverse_complement(seq2);
2996 reverse(qual2.begin(), qual2.end());
2997 }
2998
2999 left1 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_FR) ? left : fusion_left);
3000 left2 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_RF) ? fusion_right : right + 1);
3001 }
3002 }
3003
3004
3005 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
3006 {
3007 RefSequenceTable::Sequence* ref_str1 = rt.get_seq(_ref_id);
3008 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(_ref_id2);
3009
3010 if (!ref_str1 || !ref_str2)
3011 return false;
3012
3013 RefSequenceTable::Sequence* ref_str = ref_str1;
3014
3015 size_t pos_seq = 0;
3016 size_t pos_ref = _left;
3017 size_t mismatch = 0;
3018 bool bSawFusion = false;
3019 for (size_t i = 0; i < _cigar.size(); ++i)
3020 {
3021 CigarOp cigar = _cigar[i];
3022 switch(cigar.opcode)
3023 {
3024 case MATCH:
3025 {
3026 seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length);
3027 for (size_t j = 0; j < cigar.length; ++j)
3028 {
3029 if (_seq[pos_seq] != ref_seq[j])
3030 ++mismatch;
3031 ++pos_seq;
3032 }
3033
3034 pos_ref += cigar.length;
3035 }
3036 break;
3037 case mATCH:
3038 {
3039 seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1);
3040 seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>());
3041 seqan::reverseInPlace(ref_seq);
3042
3043 for (size_t j = 0; j < cigar.length; ++j)
3044 {
3045 if (_seq[pos_seq] != ref_seq[j])
3046 ++mismatch;
3047 ++pos_seq;
3048 }
3049
3050 pos_ref -= cigar.length;
3051 }
3052 break;
3053 case INS:
3054 case iNS:
3055 {
3056 pos_seq += cigar.length;
3057 }
3058 break;
3059
3060 case DEL:
3061 case REF_SKIP:
3062 {
3063 pos_ref += cigar.length;
3064 }
3065 break;
3066
3067 case dEL:
3068 case rEF_SKIP:
3069 {
3070 pos_ref -= cigar.length;
3071 }
3072 break;
3073
3074 case FUSION_FF:
3075 case FUSION_FR:
3076 case FUSION_RF:
3077 case FUSION_RR:
3078 {
3079 // We don't allow a read spans more than two chromosomes.
3080 if (bSawFusion)
3081 return false;
3082
3083 ref_str = ref_str2;
3084 pos_ref = cigar.length;
3085
3086 bSawFusion = true;
3087 }
3088 break;
3089
3090 default:
3091 break;
3092 }
3093 }
3094
3095 return mismatch == _edit_dist;
3096 }