ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 260
Committed: Tue Jul 24 20:10:50 2012 UTC (12 years, 1 month ago) by gpertea
File size: 99909 byte(s)
Log Message:
wip tophat.py - adding resume feature

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * long_spanning_reads.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <cstdlib>
22 #include <cstring>
23 #include <bitset>
24 //#include <stdexcept>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <seqan/modifier.h>
32 #include <getopt.h>
33
34 #include <boost/thread.hpp>
35
36 #include "common.h"
37 #include "utils.h"
38 #include "bwt_map.h"
39 #include "tokenize.h"
40 #include "segments.h"
41 #include "reads.h"
42
43 #include "junctions.h"
44 #include "insertions.h"
45 #include "deletions.h"
46 #include "fusions.h"
47
48 using namespace seqan;
49 using namespace std;
50
51 // daehwan
52 bool bDebug = false;
53
54 void print_usage()
55 {
56 fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
57 }
58
59 bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
60 const pair<uint32_t, HitsForRead>& rhs)
61 {
62 return lhs.first < rhs.first;
63 }
64
65 void get_seqs(istream& ref_stream,
66 RefSequenceTable& rt,
67 bool keep_seqs = true)
68 {
69 while(ref_stream.good() &&
70 !ref_stream.eof())
71 {
72 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
73 string name;
74 readMeta(ref_stream, name, Fasta());
75 string::size_type space_pos = name.find_first_of(" \t\r");
76 if (space_pos != string::npos)
77 {
78 name.resize(space_pos);
79 }
80 seqan::read(ref_stream, *ref_str, Fasta());
81
82 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
83 }
84 }
85
86
87 void look_right_for_hit_group(ReadTable& unmapped_reads,
88 vector<HitStream>& contig_hits,
89 size_t curr_file,
90 vector<HitStream>& spliced_hits,
91 const HitsForRead& targets,
92 vector<HitsForRead>& seg_hits_for_read)
93 {
94 int right_file = curr_file + 1;
95
96 HitStream& right = contig_hits[right_file];
97 uint64_t curr_next_group_id = targets.insert_id;
98 int curr_order = unmapped_reads.observation_order(curr_next_group_id);
99
100 assert (curr_order != -1);
101 while(true)
102 {
103 HitsForRead hit_group;
104 uint64_t right_next_group_id = right.next_group_id();
105 int right_order = unmapped_reads.observation_order(right_next_group_id);
106
107 // If we would have seen the hits by now, bail out.
108 if (curr_order < right_order || right_order == -1)
109 {
110 break;
111 }
112 if (right.next_read_hits(hit_group))
113 {
114 if (hit_group.insert_id == targets.insert_id)
115 {
116 // Some of the targets may be missing, we need to
117 // process them individually
118 seg_hits_for_read[right_file] = hit_group;
119 break;
120 }
121 }
122 }
123
124 HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
125
126 if (right_file < (int)spliced_hits.size() && right_file >= 0)
127 {
128 // Scan forward in the spliced hits file for this hit group
129 HitsForRead spliced_group;
130 HitsForRead curr_spliced_group;
131 while (spliced_hits[right_file].next_group_id() > 0 &&
132 spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
133 {
134 spliced_hits[right_file].next_read_hits(curr_spliced_group);
135
136 if (curr_spliced_group.insert_id == (uint32_t)curr_order)
137 {
138 spliced_group = curr_spliced_group;
139 break;
140 }
141 }
142
143 if (!spliced_group.hits.empty())
144 {
145 curr_seg_hits.insert_id = spliced_group.insert_id;
146 curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
147 spliced_group.hits.begin(),
148 spliced_group.hits.end());
149 }
150 }
151
152 if (curr_seg_hits.hits.empty())
153 return;
154 else if (right_file + 1 < (int)contig_hits.size())
155 {
156 look_right_for_hit_group(unmapped_reads,
157 contig_hits,
158 curr_file + 1,
159 spliced_hits,
160 curr_seg_hits,
161 seg_hits_for_read);
162 }
163 }
164
165 BowtieHit merge_chain_color(RefSequenceTable& rt,
166 const string& read_seq,
167 const string& read_quals,
168 std::set<Junction>& possible_juncs,
169 std::set<Insertion>& possible_insertions,
170 list<BowtieHit>& hit_chain)
171 {
172 bool antisense = hit_chain.front().antisense_align();
173 uint32_t reference_id = hit_chain.front().ref_id();
174 uint64_t insert_id = hit_chain.front().insert_id();
175
176 int left = hit_chain.front().left();
177
178 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
179 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
180
181 string seq;
182 string qual;
183 int old_read_length = 0;
184 int first_seg_length = hit_chain.front().seq().length();
185 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
186 {
187 seq += i->seq();
188 qual += i->qual();
189 old_read_length += i->read_len();
190 }
191
192 string rev_read_seq, rev_read_quals;
193 if (color && antisense)
194 {
195 rev_read_seq = read_seq;
196 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
197
198 rev_read_quals = read_quals;
199 reverse(rev_read_quals.begin(), rev_read_quals.end());
200 }
201
202 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
203 {
204 /*
205 * Note that the gap size may be negative, since left() and right() return
206 * signed integers, this will be OK.
207 */
208 int gap = curr_hit->left() - prev_hit->right();
209 if (gap < -(int)max_insertion_length ||
210 (gap > (int)max_deletion_length &&
211 (gap < min_report_intron_length || gap > max_report_intron_length)))
212 {
213 return BowtieHit();
214 }
215
216 ++prev_hit;
217 ++curr_hit;
218 }
219
220 prev_hit = hit_chain.begin();
221 curr_hit = ++(hit_chain.begin());
222
223 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
224 if (!ref_str)
225 return BowtieHit();
226
227 int curr_seg_index = 1;
228 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
229 {
230 /*
231 * This code is assuming that the cigar strings end and start with a match
232 * While segment alignments can actually end with a junction, insertion or deletion, the hope
233 * is that in those cases, the right and left ends of the alignments will correctly
234 * line up, so we won't get to this bit of code
235 */
236 if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
237 {
238 return BowtieHit();
239 }
240
241 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
242 {
243 /*
244 * There is no way that we can splice these together into a valid
245 * alignment
246 */
247 return BowtieHit();
248 }
249
250 bool found_closure = false;
251 /*
252 * Note that antisense_splice is the same for prev and curr
253 */
254 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
255 vector<CigarOp> new_cigar;
256 int new_left = -1;
257 int mismatch = 0;
258
259 /*
260 * Take the length of matched bases in each segment into consideration for closures,
261 * this can be a problem for reads of variable lengths.
262 */
263 int prev_right_end_match_length = prev_hit->cigar().back().length;
264 int curr_left_end_match_length = curr_hit->cigar().front().length;
265
266 if (prev_hit->right() > curr_hit->left())
267 {
268 std::set<Insertion>::iterator lb, ub;
269 /*
270 * Note, this offset is determined by the min-anchor length supplied to
271 * juncs_db, which is currently hard-coded at 3 in tophat.py
272 * as this value determines what sort of read segments
273 * should be able to align directly to the splice sequences
274 */
275 int left_boundary = prev_hit->right() - 4;
276 int right_boundary = curr_hit->left() + 4;
277
278 /*
279 * Create a dummy sequence to represent the maximum possible insertion
280 */
281 std::string maxInsertedSequence = "";
282 maxInsertedSequence.resize(max_insertion_length,'A');
283
284 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
285 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
286
287 int reference_mismatch = 0;
288
289 while (lb != ub && lb != possible_insertions.end())
290 {
291 /*
292 * In the following code, we will check to make sure that the segments have the proper
293 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
294 * In general, reads with insertions must match the inserted sequence exactly.
295 */
296 if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
297 {
298 /*
299 * Check we have enough matched bases on prev or curr segment.
300 */
301 int insert_to_prev_right = prev_hit->right() - lb->left - 1;
302 int curr_left_to_insert = lb->left - curr_hit->left() + 1;
303 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
304 {
305 ++lb;
306 continue;
307 }
308
309 /*
310 * Keep track of how many mismatches were made to the genome in the region
311 * where we should actually be matching against the insertion
312 */
313 int this_reference_mismatch = 0;
314 int insertion_mismatch = 0;
315 int insertion_len = lb->sequence.length();
316 const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
317
318 /*
319 * First check to see if we need to adjust number of observed errors for the left (prev)
320 * hit. This is only the case if this segment runs into the insertion. To be consistent
321 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
322 */
323 string colorSegmentSequence_prev;
324 if (insert_to_prev_right > 0)
325 {
326 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
327 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
328
329 if (color)
330 {
331 string color;
332
333 if (antisense)
334 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
335 else
336 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
337
338 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
339 colorSegmentSequence_prev = convert_color_to_bp(color);
340 }
341
342 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
343
344 /*
345 * Scan right in the read until we run out of read
346 */
347 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
348 {
349 /*
350 * Any mismatch to the insertion is a failure
351 */
352 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
353 {
354 ++this_reference_mismatch;
355 }
356
357 if (read_index < insertion_len)
358 {
359 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
360 {
361 ++insertion_mismatch;
362 break;
363 }
364 }
365 else
366 {
367 if (referenceSequence[read_index - insertion_len] == 'N' ||
368 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
369 {
370 --this_reference_mismatch;
371 }
372 }
373 }
374 }
375
376 string colorSegmentSequence_curr;
377 if (curr_left_to_insert > 0)
378 {
379 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
380 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
381
382 if (color)
383 {
384 string color;
385 if (antisense)
386 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
387 else
388 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
389
390 color.push_back(curr_hit->seq()[curr_left_to_insert]);
391 reverse(color.begin(), color.end());
392 string bp = convert_color_to_bp(color);
393 reverse(bp.begin(), bp.end());
394 colorSegmentSequence_curr = bp;
395 }
396
397 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
398
399 /*
400 * Scan left in the read until
401 * We ran out of read sequence (insertion extends past segment)
402 */
403 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
404 {
405 int segmentPosition = curr_left_to_insert - read_index - 1;
406 int insertionPosition = insertion_len - read_index - 1;
407
408 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
409 {
410 ++this_reference_mismatch;
411 }
412
413 if (read_index < insertion_len)
414 {
415 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
416 {
417 ++insertion_mismatch;
418 break;
419 }
420 }
421 else
422 {
423 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
424 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
425 {
426 --this_reference_mismatch;
427 }
428 }
429 }
430 }
431
432 if (found_closure)
433 {
434 fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
435 return BowtieHit();
436 }
437
438 if (insertion_mismatch == 0)
439 {
440 reference_mismatch = this_reference_mismatch;
441 mismatch = -reference_mismatch;
442 found_closure = true;
443 new_left = prev_hit->left();
444 new_cigar = prev_hit->cigar();
445
446 /*
447 * Need to make a new insert operation between the two match character that begin
448 * and end the intersection of these two junction. Note that we necessarily assume
449 * that this insertion can't span beyond the boundaries of these reads. That should
450 * probably be better enforced somewhere
451 */
452
453 new_cigar.back().length -= insert_to_prev_right;
454 if (new_cigar.back().length <= 0)
455 new_cigar.pop_back();
456
457 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
458 vector<CigarOp> new_right_cigar = curr_hit->cigar();
459 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
460
461 /*
462 * Finish stitching together the new cigar string
463 */
464 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
465 for (; c < new_right_cigar.size(); ++c)
466 {
467 new_cigar.push_back(new_right_cigar[c]);
468 }
469
470 if (color)
471 {
472 if (insert_to_prev_right > 0)
473 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
474
475 if (curr_left_to_insert > 0)
476 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
477 }
478 }
479 }
480 ++lb;
481 }
482
483 if (!found_closure)
484 {
485 return BowtieHit();
486 }
487 }
488
489 /*
490 * Stitch segments together using juctions or deletions if necessary.
491 */
492 else if (prev_hit->right() < curr_hit->left())
493 {
494 std::set<Junction>::iterator lb, ub;
495
496 int left_boundary = prev_hit->right() - 4;
497 int right_boundary = curr_hit->left() + 4;
498
499 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
500 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
501
502 int new_diff_mismatches = 0xff;
503 while (lb != ub && lb != possible_juncs.end())
504 {
505 int dist_to_left = lb->left - prev_hit->right() + 1;
506 int dist_to_right = lb->right - curr_hit->left();
507
508 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
509 {
510 /*
511 * Check we have enough matched bases on prev or curr segment.
512 */
513 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
514 {
515 ++lb;
516 continue;
517 }
518
519 Dna5String new_cmp_str, old_cmp_str;
520 int new_mismatch = 0, old_mismatch = 0;
521 string new_patch_str; // this is for colorspace reads
522 if (dist_to_left > 0)
523 {
524 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
525 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
526
527 string new_seq;
528 if (color)
529 {
530 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
531
532 string color, qual;
533 if (antisense)
534 {
535 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
536 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
537 }
538 else
539 {
540 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
541 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
542 }
543
544 BWA_decode(color, qual, ref, new_seq);
545 new_seq = new_seq.substr(1);
546 }
547
548 const string& curr_old_seq = curr_hit->seq();
549 const string& curr_seq = color ? new_seq : curr_hit->seq();
550 for (int i = 0; i < dist_to_left; ++i)
551 {
552 if (curr_seq[i] != new_cmp_str[i])
553 ++new_mismatch;
554
555 if (curr_old_seq[i] != old_cmp_str[i])
556 ++old_mismatch;
557 }
558
559 if (color)
560 new_patch_str = curr_seq.substr(0, dist_to_left);
561 }
562 else if (dist_to_left < 0)
563 {
564 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
565 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
566
567 size_t abs_dist = -dist_to_left;
568 string new_seq;
569 if (color)
570 {
571 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
572 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
573
574 string color, qual;
575 if (antisense)
576 {
577 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
578 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
579 }
580 else
581 {
582 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
583 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
584 }
585
586 BWA_decode(color, qual, ref, new_seq);
587 new_seq = new_seq.substr(1);
588 }
589
590 const string& prev_old_seq = prev_hit->seq();
591 size_t prev_old_seq_len = prev_old_seq.length();
592 const string& prev_seq = color ? new_seq : prev_hit->seq();
593 size_t prev_seq_len = prev_seq.length();
594 for (size_t i = 0; i < abs_dist; ++i)
595 {
596 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
597 ++new_mismatch;
598 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
599 ++old_mismatch;
600 }
601
602 if (color)
603 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
604 }
605
606 int temp_diff_mismatches = new_mismatch - old_mismatch;
607 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
608 {
609 ++lb;
610 continue;
611 }
612
613 if (color)
614 {
615 /*
616 * We need to recover the origianl sequence.
617 */
618 if (found_closure)
619 {
620 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
621 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
622 }
623
624 if (dist_to_left > 0)
625 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
626 else if (dist_to_left < 0)
627 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
628 }
629
630 new_diff_mismatches = temp_diff_mismatches;
631
632 new_left = prev_hit->left();
633 new_cigar = prev_hit->cigar();
634
635 int new_left_back_len = new_cigar.back().length;
636 new_left_back_len += dist_to_left;
637
638 vector<CigarOp> new_right_cig = curr_hit->cigar();
639 int new_right_front_len = new_right_cig.front().length;
640 new_right_front_len -= dist_to_right;
641 if (new_left_back_len > 0)
642 new_cigar.back().length = new_left_back_len;
643 else
644 new_cigar.pop_back();
645
646 /*
647 * FIXME, currently just differentiating between a deletion and a
648 * reference skip based on length. However, would probably be better
649 * to denote the difference explicitly, this would allow the user
650 * to supply their own (very large) deletions
651 */
652 if ((lb->right - lb->left - 1) <= max_deletion_length)
653 {
654 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
655 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
656 }
657 else
658 {
659 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
660 antisense_closure = lb->antisense;
661 }
662
663 new_right_cig.front().length = new_right_front_len;
664 size_t c = new_right_front_len > 0 ? 0 : 1;
665 for (; c < new_right_cig.size(); ++c)
666 new_cigar.push_back(new_right_cig[c]);
667
668 mismatch = new_diff_mismatches;
669 found_closure = true;
670 }
671 ++lb;
672 }
673
674 if (!found_closure)
675 {
676 return BowtieHit();
677 }
678 }
679
680 if (found_closure)
681 {
682 bool end = false;
683 int mismatches = prev_hit->mismatches() + curr_hit->mismatches() + mismatch;
684 BowtieHit merged_hit(reference_id,
685 reference_id,
686 insert_id,
687 new_left,
688 new_cigar,
689 antisense,
690 antisense_closure,
691 mismatches,
692 mismatches + gap_length(new_cigar),
693 prev_hit->splice_mms() + curr_hit->splice_mms(),
694 end);
695
696 if (curr_seg_index > 1)
697 merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
698 else
699 merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
700
701 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
702 /*
703 * prev_hit now points PAST the last element removed
704 */
705 prev_hit = hit_chain.insert(prev_hit, merged_hit);
706 /*
707 * merged_hit has been inserted before the old position of
708 * prev_hit. New location of prev_hit is merged_hit
709 */
710 curr_hit = prev_hit;
711 ++curr_hit;
712 ++curr_seg_index;
713 continue;
714 }
715
716 ++prev_hit;
717 ++curr_hit;
718 ++curr_seg_index;
719 }
720
721 bool saw_antisense_splice = false;
722 bool saw_sense_splice = false;
723 vector<CigarOp> long_cigar;
724 int num_mismatches = 0;
725 int num_splice_mms = 0;
726 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
727 {
728 num_mismatches += s->mismatches();
729 num_splice_mms += s->splice_mms();
730
731 /*
732 * Check whether the sequence contains any reference skips. Previously,
733 * this was just a check to see whether the sequence was contiguous; however
734 * we don't want to count an indel event as a splice
735 */
736 bool containsSplice = s->is_spliced();
737 if (containsSplice)
738 {
739 if (s->antisense_splice())
740 {
741 if (saw_sense_splice)
742 return BowtieHit();
743 saw_antisense_splice = true;
744 }
745 else
746 {
747 if (saw_antisense_splice)
748 return BowtieHit();
749 saw_sense_splice = true;
750 }
751 }
752 const vector<CigarOp>& cigar = s->cigar();
753 if (long_cigar.empty())
754 {
755 long_cigar = cigar;
756 }
757 else
758 {
759 CigarOp& last = long_cigar.back();
760 /*
761 * If necessary, merge the back and front
762 * cigar operations
763 */
764 if(last.opcode == cigar[0].opcode){
765 last.length += cigar[0].length;
766 for (size_t b = 1; b < cigar.size(); ++b)
767 {
768 long_cigar.push_back(cigar[b]);
769 }
770 }else{
771 for(size_t b = 0; b < cigar.size(); ++b)
772 {
773 long_cigar.push_back(cigar[b]);
774 }
775 }
776 }
777 }
778
779 bool end = false;
780 BowtieHit new_hit(reference_id,
781 reference_id,
782 insert_id,
783 left,
784 long_cigar,
785 antisense,
786 saw_antisense_splice,
787 num_mismatches,
788 num_mismatches + gap_length(long_cigar),
789 num_splice_mms,
790 end);
791
792 new_hit.seq(seq);
793 new_hit.qual(qual);
794
795 int new_read_len = new_hit.read_len();
796 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
797 {
798 fprintf(stderr, "Warning: malformed closure\n");
799 return BowtieHit();
800 }
801
802 return new_hit;
803 }
804
805 BowtieHit merge_chain(RefSequenceTable& rt,
806 const string& read_seq,
807 const string& read_quals,
808 std::set<Junction>& possible_juncs,
809 std::set<Insertion>& possible_insertions,
810 std::set<Fusion>& possible_fusions,
811 list<BowtieHit>& hit_chain,
812 int fusion_dir = FUSION_NOTHING)
813 {
814 bool antisense = hit_chain.front().antisense_align();
815 uint64_t insert_id = hit_chain.front().insert_id();
816
817 const int left = hit_chain.front().left();
818
819 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
820 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
821
822 string seq;
823 string qual;
824 int old_read_length = 0;
825 int first_seg_length = hit_chain.front().seq().length();
826 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
827 {
828 seq += i->seq();
829 qual += i->qual();
830 old_read_length += i->read_len();
831 }
832
833 string rev_read_seq, rev_read_quals;
834 if (color && antisense)
835 {
836 rev_read_seq = read_seq;
837 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
838
839 rev_read_quals = read_quals;
840 reverse(rev_read_quals.begin(), rev_read_quals.end());
841 }
842
843 size_t num_fusions = prev_hit->fusion_opcode() == FUSION_NOTHING ? 0 : 1;
844 bool fusion_passed = false;
845 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
846 {
847 if (prev_hit->ref_id() != prev_hit->ref_id2() || prev_hit->ref_id2() != curr_hit->ref_id())
848 fusion_passed = true;
849
850 if (prev_hit->ref_id2() != curr_hit->ref_id())
851 ++num_fusions;
852
853 if (curr_hit->fusion_opcode() != FUSION_NOTHING)
854 ++num_fusions;
855
856 if (prev_hit->ref_id2() == curr_hit->ref_id())
857 {
858 bool reversed = false;
859 if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
860 reversed = true;
861
862 /*
863 * Note that the gap size may be negative, since left() and right() return
864 * signed integers, this will be OK.
865 */
866 int gap;
867 if (reversed)
868 gap = prev_hit->right() - curr_hit->left();
869 else
870 gap = curr_hit->left() - prev_hit->right();
871
872 // daehwan
873 if (bDebug)
874 {
875 cout << "prev: " << prev_hit->ref_id() << ":" << prev_hit->left() << ":" << (prev_hit->antisense_align() ? "-" : "+")
876 << "\t" << prev_hit->ref_id2() << ":" << prev_hit->right() << ":" << (prev_hit->antisense_align2() ? "-" : "+") << endl
877 << "curr: " << curr_hit->ref_id() << ":" << curr_hit->left() << ":" << (curr_hit->antisense_align() ? "-" : "+")
878 << "\t" << curr_hit->ref_id2() << ":" << curr_hit->right() << ":" << (curr_hit->antisense_align2() ? "-" : "+") << endl
879 << "gap: " << gap << endl;
880 }
881
882 if (gap < -(int)max_insertion_length ||
883 (gap > (int)max_deletion_length &&
884 (gap < min_report_intron_length || gap > min(max_report_intron_length, (int)fusion_min_dist))))
885 {
886 fusion_passed = true;
887 ++num_fusions;
888 }
889 }
890
891 if (num_fusions >= 2)
892 return BowtieHit();
893
894 ++prev_hit;
895 ++curr_hit;
896 }
897
898 prev_hit = hit_chain.begin();
899 curr_hit = ++(hit_chain.begin());
900
901 // daehwan
902 if (bDebug)
903 {
904 cout << "daehwan - test" << endl;
905 }
906
907 int curr_seg_index = 1;
908 fusion_passed = false;
909 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
910 {
911 antisense = prev_hit->antisense_align();
912
913 // daehwan
914 if (bDebug)
915 {
916 cout << "daehwan - start - stitch" << endl;
917 cout << "prev right: " << prev_hit->right() << endl;
918 cout << "curr left: " << curr_hit->left() << endl;
919 cout << "prev back: " << prev_hit->cigar().back().opcode << endl;
920 cout << "curr front: " << curr_hit->cigar().front().opcode << endl;
921 cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
922 cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
923 }
924
925 if (prev_hit->fusion_opcode() != FUSION_NOTHING || prev_hit->ref_id2() != curr_hit->ref_id())
926 fusion_passed = true;
927
928 /*
929 * This code is assuming that the cigar strings end and start with a match
930 * While segment alignments can actually end with a junction, insertion or deletion, the hope
931 * is that in those cases, the right and left ends of the alignments will correctly
932 * line up, so we won't get to this bit of code
933 */
934 if (!(prev_hit->cigar().back().opcode == MATCH || curr_hit->cigar().front().opcode == MATCH ||
935 prev_hit->cigar().back().opcode == mATCH || curr_hit->cigar().front().opcode == mATCH))
936 {
937 return BowtieHit();
938 }
939
940 // daehwan
941 if (bDebug)
942 {
943 cout << "daehwan - pass - enough matched bases" << endl;
944 }
945
946 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
947 {
948 /*
949 * There is no way that we can splice these together into a valid
950 * alignment
951 */
952 return BowtieHit();
953 }
954
955 bool found_closure = false;
956 /*
957 * Note that antisense_splice is the same for prev and curr
958 */
959 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
960 vector<CigarOp> new_cigar;
961 int new_left = -1;
962 int mismatch = 0;
963
964 /*
965 * Take the length of matched bases in each segment into consideration for closures,
966 * this can be a problem for reads of variable lengths.
967 */
968 int prev_right_end_match_length = prev_hit->cigar().back().length;
969 int curr_left_end_match_length = curr_hit->cigar().front().length;
970
971 bool check_fusion = prev_hit->ref_id2() != curr_hit->ref_id();
972
973 if (prev_hit->ref_id2() == curr_hit->ref_id())
974 {
975 // daehwan
976 if (bDebug)
977 {
978 cout << "daehwan - start - junction or insertion" << endl;
979 cout << "prev right: " << prev_hit->right() << endl;
980 cout << "curr left: " << curr_hit->left() << endl;
981 cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
982 cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
983 }
984
985 bool reversed = false;
986 if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
987 reversed = true;
988
989 uint32_t reference_id = prev_hit->ref_id2();
990 RefSequenceTable::Sequence* ref_str = rt.get_seq(reference_id);
991
992 int left_boundary, right_boundary;
993 if (reversed)
994 {
995 left_boundary = curr_hit->left() - 4;
996 right_boundary = prev_hit->right() + 4;
997 }
998 else
999 {
1000 left_boundary = prev_hit->right() - 4;
1001 right_boundary = curr_hit->left() + 4;
1002 }
1003
1004 int dist_btw_two;
1005 if (reversed)
1006 dist_btw_two = prev_hit->right() - curr_hit->left();
1007 else
1008 dist_btw_two = curr_hit->left() - prev_hit->right();
1009
1010 if (dist_btw_two < 0 && dist_btw_two >= -(int)max_insertion_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
1011 {
1012 std::set<Insertion>::iterator lb, ub;
1013
1014 /*
1015 * Create a dummy sequence to represent the maximum possible insertion
1016 */
1017 std::string maxInsertedSequence = "";
1018 maxInsertedSequence.resize(max_insertion_length,'A');
1019
1020 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
1021 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
1022
1023 int reference_mismatch = 0;
1024
1025 while (lb != ub && lb != possible_insertions.end())
1026 {
1027 /*
1028 * In the following code, we will check to make sure that the segments have the proper
1029 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
1030 * In general, reads with insertions must match the inserted sequence exactly.
1031 */
1032 if (((int)lb->sequence.size()) == (reversed ? curr_hit->left() - prev_hit->right() : prev_hit->right() - curr_hit->left()))
1033 {
1034 /*
1035 * Check we have enough matched bases on prev or curr segment.
1036 */
1037 int insert_to_prev_right, curr_left_to_insert;
1038 if (reversed)
1039 {
1040 insert_to_prev_right = lb->left - prev_hit->right();
1041 curr_left_to_insert = curr_hit->left() - lb->left;
1042 }
1043 else
1044 {
1045 insert_to_prev_right = prev_hit->right() - lb->left - 1;
1046 curr_left_to_insert = lb->left - curr_hit->left() + 1;
1047 }
1048
1049 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
1050 {
1051 ++lb;
1052 continue;
1053 }
1054
1055 // daehwan
1056 if (bDebug)
1057 {
1058 cout << "insert_to_prev_right: " << insert_to_prev_right << endl;
1059 cout << "curr_left_to_insert: " << curr_left_to_insert << endl;
1060 cout << "curr_seg_index: " << curr_seg_index << endl;
1061 }
1062
1063 /*
1064 * Keep track of how many mismatches were made to the genome in the region
1065 * where we should actually be matching against the insertion
1066 */
1067 int this_reference_mismatch = 0;
1068 int insertion_mismatch = 0;
1069 int insertion_len = lb->sequence.length();
1070 seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
1071 if (reversed)
1072 {
1073 seqan::reverseComplement(insertionSequence);
1074 }
1075
1076 /*
1077 * First check to see if we need to adjust number of observed errors for the left (prev)
1078 * hit. This is only the case if this segment runs into the insertion. To be consistent
1079 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
1080 */
1081 string colorSegmentSequence_prev;
1082 if (insert_to_prev_right > 0)
1083 {
1084 seqan::Dna5String referenceSequence, oldSegmentSequence;
1085
1086 if (reversed)
1087 {
1088 referenceSequence = seqan::infix(*ref_str, prev_hit->right() + 1, lb->left + 1);
1089 seqan::reverseComplement(referenceSequence);
1090
1091 string temp;
1092
1093 // daehwan
1094 if (bDebug)
1095 {
1096 cout << "reversed: " << read_seq.length() << " " << read_seq << endl;
1097 }
1098
1099 temp = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right);
1100 oldSegmentSequence = seqan::Dna5String(temp);
1101 }
1102 else
1103 {
1104 referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
1105
1106 // daehwan
1107 if (bDebug)
1108 {
1109 cout << "non-reversed: " << prev_hit->seq() << endl;
1110 }
1111
1112 oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
1113 }
1114
1115 if (color)
1116 {
1117 string color;
1118
1119 if (antisense)
1120 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
1121 else
1122 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
1123
1124 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
1125 colorSegmentSequence_prev = convert_color_to_bp(color);
1126 }
1127
1128 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
1129
1130 // daehwan
1131 if (bDebug)
1132 {
1133 cout << "ref: " << referenceSequence << endl;
1134 cout << "old: " << oldSegmentSequence << endl;
1135 cout << "ins: " << insertionSequence << endl;
1136 }
1137
1138 /*
1139 * Scan right in the read until we run out of read
1140 */
1141 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
1142 {
1143 /*
1144 * Any mismatch to the insertion is a failure
1145 */
1146 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
1147 {
1148 ++this_reference_mismatch;
1149 }
1150
1151 if (read_index < insertion_len)
1152 {
1153 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
1154 {
1155 ++insertion_mismatch;
1156 break;
1157 }
1158 }
1159 else
1160 {
1161 if (referenceSequence[read_index - insertion_len] == 'N' ||
1162 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
1163 {
1164 --this_reference_mismatch;
1165 }
1166 }
1167 }
1168 }
1169
1170 string colorSegmentSequence_curr;
1171 if (curr_left_to_insert > 0)
1172 {
1173 seqan::Dna5String referenceSequence, oldSegmentSequence;
1174 if (reversed)
1175 {
1176 referenceSequence = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
1177 seqan::reverseComplement(referenceSequence);
1178
1179 string temp = read_seq.substr(curr_seg_index * segment_length, curr_left_to_insert);
1180 oldSegmentSequence = seqan::Dna5String(temp);
1181 }
1182 else
1183 {
1184 referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
1185 oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
1186 }
1187
1188 if (color)
1189 {
1190 string color;
1191 if (antisense)
1192 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
1193 else
1194 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
1195
1196 color.push_back(curr_hit->seq()[curr_left_to_insert]);
1197 reverse(color.begin(), color.end());
1198 string bp = convert_color_to_bp(color);
1199 reverse(bp.begin(), bp.end());
1200 colorSegmentSequence_curr = bp;
1201 }
1202
1203 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
1204
1205 // daehwan
1206 if (bDebug)
1207 {
1208 cout << "ref: " << referenceSequence << endl;
1209 cout << "old: " << oldSegmentSequence << endl;
1210 cout << "ins: " << insertionSequence << endl;
1211 }
1212
1213 /*
1214 * Scan left in the read until
1215 * We ran out of read sequence (insertion extends past segment)
1216 */
1217 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
1218 {
1219 int segmentPosition = curr_left_to_insert - read_index - 1;
1220 int insertionPosition = insertion_len - read_index - 1;
1221
1222 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
1223 {
1224 ++this_reference_mismatch;
1225 }
1226
1227 if (read_index < insertion_len)
1228 {
1229 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
1230 {
1231 ++insertion_mismatch;
1232 break;
1233 }
1234 }
1235 else
1236 {
1237 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
1238 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
1239 {
1240 --this_reference_mismatch;
1241 }
1242 }
1243 }
1244 }
1245
1246 if (found_closure)
1247 {
1248 // fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
1249 return BowtieHit();
1250 }
1251
1252 if (insertion_mismatch == 0)
1253 {
1254 reference_mismatch = this_reference_mismatch;
1255 mismatch = -reference_mismatch;
1256 found_closure = true;
1257 new_left = prev_hit->left();
1258 new_cigar = prev_hit->cigar();
1259
1260 /*
1261 * Need to make a new insert operation between the two match character that begin
1262 * and end the intersection of these two junction. Note that we necessarily assume
1263 * that this insertion can't span beyond the boundaries of these reads. That should
1264 * probably be better enforced somewhere
1265 */
1266
1267 new_cigar.back().length -= insert_to_prev_right;
1268
1269 if (new_cigar.back().length <= 0)
1270 new_cigar.pop_back();
1271
1272 if (reversed)
1273 new_cigar.push_back(CigarOp(iNS, lb->sequence.size()));
1274 else
1275 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
1276
1277 vector<CigarOp> new_right_cigar = curr_hit->cigar();
1278 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
1279
1280 /*
1281 * Finish stitching together the new cigar string
1282 */
1283 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
1284 for (; c < new_right_cigar.size(); ++c)
1285 {
1286 new_cigar.push_back(new_right_cigar[c]);
1287 }
1288
1289 if (color)
1290 {
1291 if (insert_to_prev_right > 0)
1292 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
1293
1294 if (curr_left_to_insert > 0)
1295 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
1296 }
1297 }
1298 }
1299 ++lb;
1300 }
1301
1302 if (!found_closure)
1303 {
1304 return BowtieHit();
1305 }
1306 }
1307
1308 /*
1309 * Stitch segments together using juctions or deletions if necessary.
1310 */
1311 else if (dist_btw_two > 0 && dist_btw_two <= max_report_intron_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
1312 {
1313 std::set<Junction>::iterator lb, ub;
1314
1315 // daehwan
1316 if (bDebug)
1317 {
1318 cout << "junction" << endl;
1319 cout << "min: " << left_boundary << "-" << right_boundary - 8 << endl;
1320 cout << "max: " << left_boundary + 8 << "-" << right_boundary << endl;
1321 }
1322
1323 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
1324 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
1325
1326 int new_diff_mismatches = 0xff;
1327 while (lb != ub && lb != possible_juncs.end())
1328 {
1329 int dist_to_left, dist_to_right;
1330 if (reversed)
1331 {
1332 dist_to_left = lb->left - curr_hit->left();
1333 dist_to_right = lb->right - prev_hit->right() - 1;
1334 }
1335 else
1336 {
1337 dist_to_left = lb->left - prev_hit->right() + 1;
1338 dist_to_right = lb->right - curr_hit->left();
1339 }
1340
1341 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1342 {
1343 /*
1344 * Check we have enough matched bases on prev or curr segment.
1345 */
1346 if ((reversed && (dist_to_left > prev_right_end_match_length || -dist_to_left > curr_left_end_match_length)) ||
1347 (!reversed && (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)))
1348 {
1349 ++lb;
1350 continue;
1351 }
1352
1353 // daehwan
1354 if (bDebug)
1355 {
1356 cout << "candidate junction: " << endl;
1357 cout << "coords: " << lb->left << "-" << lb->right << endl;
1358 cout << "dist to left: " << dist_to_left << endl;
1359 }
1360
1361 Dna5String new_cmp_str, old_cmp_str;
1362 int new_mismatch = 0, old_mismatch = 0;
1363 string new_patch_str; // this is for colorspace reads
1364 if (dist_to_left > 0)
1365 {
1366 if (reversed)
1367 {
1368 new_cmp_str = seqan::infix(*ref_str, curr_hit->left() + 1, lb->left + 1);
1369 seqan::reverseComplement(new_cmp_str);
1370
1371 old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb->right);
1372 seqan::reverseComplement(old_cmp_str);
1373 }
1374 else
1375 {
1376 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
1377 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
1378 }
1379
1380 string new_seq;
1381 if (color)
1382 {
1383 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
1384
1385 string color, qual;
1386 if (antisense)
1387 {
1388 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
1389 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
1390 }
1391 else
1392 {
1393 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
1394 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
1395 }
1396
1397 BWA_decode(color, qual, ref, new_seq);
1398 new_seq = new_seq.substr(1);
1399 }
1400
1401 string curr_hit_seq;
1402 if (reversed)
1403 curr_hit_seq = read_seq.substr(curr_seg_index * segment_length - dist_to_left, dist_to_left);
1404 else
1405 curr_hit_seq = curr_hit->seq();
1406
1407 const string& curr_old_seq = curr_hit_seq;
1408 const string& curr_seq = color ? new_seq : curr_hit_seq;
1409 for (int i = 0; i < dist_to_left; ++i)
1410 {
1411 if (curr_seq[i] != new_cmp_str[i])
1412 ++new_mismatch;
1413
1414 if (curr_old_seq[i] != old_cmp_str[i])
1415 ++old_mismatch;
1416 }
1417
1418 if (color)
1419 new_patch_str = curr_seq.substr(0, dist_to_left);
1420 }
1421 else if (dist_to_left < 0)
1422 {
1423 if (reversed)
1424 {
1425 new_cmp_str = seqan::infix(*ref_str, lb->right, prev_hit->right() + 1);
1426 seqan::reverseComplement(new_cmp_str);
1427
1428 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
1429 seqan::reverseComplement(old_cmp_str);
1430 }
1431 else
1432 {
1433 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
1434 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
1435 }
1436
1437 size_t abs_dist = -dist_to_left;
1438 string new_seq;
1439 if (color)
1440 {
1441 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
1442 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
1443
1444 string color, qual;
1445 if (antisense)
1446 {
1447 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
1448 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
1449 }
1450 else
1451 {
1452 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
1453 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
1454 }
1455
1456 BWA_decode(color, qual, ref, new_seq);
1457 new_seq = new_seq.substr(1);
1458 }
1459
1460 string prev_hit_seq;
1461 if (reversed)
1462 prev_hit_seq = read_seq.substr(curr_seg_index * segment_length, abs_dist);
1463 else
1464 prev_hit_seq = prev_hit->seq();
1465
1466 // daehwan
1467 if (bDebug)
1468 {
1469 cout << "reverse: " << (int)reversed << endl;
1470 cout << "new cmp str: " << new_cmp_str << endl;
1471 cout << "old cmp str: " << old_cmp_str << endl;
1472 cout << "hit seq: " << prev_hit_seq << endl;
1473 cout << "curr seq: " << curr_hit->seq() << endl;
1474
1475 cout << read_seq
1476 << endl;
1477 cout << read_seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, segment_length)
1478 << endl;
1479 }
1480
1481 const string& prev_old_seq = prev_hit_seq;
1482 size_t prev_old_seq_len = prev_old_seq.length();
1483 const string& prev_seq = color ? new_seq : prev_hit_seq;
1484 size_t prev_seq_len = prev_seq.length();
1485 for (size_t i = 0; i < abs_dist; ++i)
1486 {
1487 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1488 ++new_mismatch;
1489 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1490 ++old_mismatch;
1491 }
1492
1493 if (color)
1494 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
1495 }
1496
1497 int temp_diff_mismatches = new_mismatch - old_mismatch;
1498
1499 // daehwan
1500 if (bDebug)
1501 {
1502 cout << "new mismatch: " << new_mismatch << endl;
1503 cout << "old mismatch: " << old_mismatch << endl;
1504 cout << "new_diff_mismatch: " << new_diff_mismatches << endl;
1505 cout << "temp mismatch: " << temp_diff_mismatches << endl;
1506 }
1507
1508 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1509 {
1510 ++lb;
1511 continue;
1512 }
1513
1514 if (color)
1515 {
1516 /*
1517 * We need to recover the origianl sequence.
1518 */
1519 if (found_closure)
1520 {
1521 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
1522 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
1523 }
1524
1525 if (dist_to_left > 0)
1526 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
1527 else if (dist_to_left < 0)
1528 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
1529 }
1530
1531 new_diff_mismatches = temp_diff_mismatches;
1532
1533 new_left = prev_hit->left();
1534 new_cigar = prev_hit->cigar();
1535
1536 int new_left_back_len = new_cigar.back().length;
1537 if (reversed)
1538 new_left_back_len -= dist_to_left;
1539 else
1540 new_left_back_len += dist_to_left;
1541
1542 vector<CigarOp> new_right_cig = curr_hit->cigar();
1543 int new_right_front_len = new_right_cig.front().length;
1544 if (reversed)
1545 new_right_front_len += dist_to_right;
1546 else
1547 new_right_front_len -= dist_to_right;
1548 if (new_left_back_len > 0)
1549 new_cigar.back().length = new_left_back_len;
1550 else
1551 new_cigar.pop_back();
1552
1553 /*
1554 * FIXME, currently just differentiating between a deletion and a
1555 * reference skip based on length. However, would probably be better
1556 * to denote the difference explicitly, this would allow the user
1557 * to supply their own (very large) deletions
1558 */
1559 if ((lb->right - lb->left - 1) <= max_deletion_length)
1560 {
1561 if (reversed)
1562 new_cigar.push_back(CigarOp(dEL, lb->right - lb->left - 1));
1563 else
1564 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
1565 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1566 }
1567 else
1568 {
1569 if (reversed)
1570 new_cigar.push_back(CigarOp(rEF_SKIP, lb->right - lb->left - 1));
1571 else
1572 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
1573 antisense_closure = lb->antisense;
1574 }
1575
1576 new_right_cig.front().length = new_right_front_len;
1577 size_t c = new_right_front_len > 0 ? 0 : 1;
1578 for (; c < new_right_cig.size(); ++c)
1579 new_cigar.push_back(new_right_cig[c]);
1580
1581 mismatch = new_diff_mismatches;
1582 found_closure = true;
1583 }
1584 ++lb;
1585 }
1586
1587 if (!found_closure)
1588 {
1589 return BowtieHit();
1590 }
1591 }
1592 else if (!(dist_btw_two == 0 && prev_hit->antisense_align2() == curr_hit->antisense_align()))
1593 check_fusion = true;
1594 }
1595
1596 if (check_fusion)
1597 {
1598 std::set<Fusion>::iterator lb, ub;
1599
1600 uint32_t ref_id1 = prev_hit->ref_id2();
1601 uint32_t ref_id2 = curr_hit->ref_id();
1602 uint32_t left = prev_hit->right() - 4;
1603 uint32_t right = curr_hit->left() - 4;
1604
1605 // daehwan
1606 if (bDebug)
1607 {
1608 cout << "daehwan - start - fusion" << endl
1609 << "ref_id1: " << ref_id1 << endl
1610 << "ref_id2: " << ref_id2 << endl
1611 << "left: " << left << endl
1612 << "right: " << right << endl
1613 << "dir: " << fusion_dir << endl;
1614 }
1615
1616 bool reversed = false;
1617 if (fusion_dir != FUSION_FF &&
1618 (ref_id2 < ref_id1 || (ref_id1 == ref_id2 && left > right)))
1619 {
1620 reversed = true;
1621
1622 uint32_t temp = ref_id1;
1623 ref_id1 = ref_id2;
1624 ref_id2 = temp;
1625
1626 temp = left;
1627 left = right;
1628 right = temp;
1629 }
1630
1631 lb = possible_fusions.upper_bound(Fusion(ref_id1, ref_id2, left, right));
1632 ub = possible_fusions.lower_bound(Fusion(ref_id1, ref_id2, left + 8, right + 8));
1633
1634 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id2());
1635 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(curr_hit->ref_id());
1636
1637 int new_diff_mismatches = 0xff;
1638 while (lb != ub && lb != possible_fusions.end())
1639 {
1640 int lb_left = lb->left;
1641 int lb_right = lb->right;
1642
1643 if (reversed)
1644 {
1645 lb_left = lb->right;
1646 lb_right = lb->left;
1647 }
1648
1649 int dist_to_left, dist_to_right;
1650 if (fusion_dir == FUSION_RF)
1651 dist_to_left = prev_hit->right() - lb_left + 1;
1652 else
1653 dist_to_left = lb_left - prev_hit->right() + 1;
1654
1655 if (fusion_dir == FUSION_FR)
1656 dist_to_right = curr_hit->left() - lb_right;
1657 else
1658 dist_to_right = lb_right - curr_hit->left();
1659
1660 // daehwan
1661 if (bDebug)
1662 {
1663 cout << "daehwan - fusion gap" << endl;
1664 cout << "dist left: " << dist_to_left << endl;
1665 cout << "dist right: " << dist_to_right << endl;
1666 }
1667
1668 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1669 {
1670 /*
1671 * Check we have enough matched bases on prev or curr segment.
1672 */
1673 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)
1674 {
1675 ++lb;
1676 continue;
1677 }
1678
1679 Dna5String new_cmp_str, old_cmp_str;
1680 int new_mismatch = 0, old_mismatch = 0;
1681 string new_patch_str; // this is for colorspace reads
1682 if (dist_to_left > 0)
1683 {
1684 if (fusion_dir == FUSION_RF)
1685 {
1686 new_cmp_str = seqan::infix(*ref_str, lb_left, prev_hit->right() + 1);
1687 seqan::reverseComplement(new_cmp_str);
1688 }
1689 else
1690 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb_left + 1);
1691
1692 if (fusion_dir == FUSION_FR)
1693 {
1694 old_cmp_str = seqan::infix(*ref_str2, lb_right + 1, curr_hit->left() + 1);
1695 seqan::reverseComplement(old_cmp_str);
1696 }
1697 else
1698 old_cmp_str = seqan::infix(*ref_str2, curr_hit->left(), lb_right);
1699
1700 // daehwan
1701 if (bDebug)
1702 {
1703 cout << "new str: " << new_cmp_str << endl;
1704 cout << "old str: " << old_cmp_str << endl;
1705 cout << "curr seq: " << curr_hit->seq() << endl;
1706 }
1707
1708 string curr_hit_seq;
1709 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1710 curr_hit_seq = curr_hit->seq();
1711 else
1712 curr_hit_seq = read_seq.substr(curr_seg_index * segment_length, segment_length);
1713
1714 string new_seq;
1715 const string& curr_old_seq = curr_hit_seq;
1716 const string& curr_seq = color ? new_seq : curr_hit_seq;
1717 for (int i = 0; i < dist_to_left; ++i)
1718 {
1719 if (curr_seq[i] != new_cmp_str[i])
1720 ++new_mismatch;
1721
1722 if (curr_old_seq[i] != old_cmp_str[i])
1723 ++old_mismatch;
1724 }
1725 }
1726 else if (dist_to_left < 0)
1727 {
1728 if (fusion_dir == FUSION_FR)
1729 {
1730 new_cmp_str = seqan::infix(*ref_str2, curr_hit->left() + 1, lb_right + 1);
1731 seqan::reverseComplement(new_cmp_str);
1732 }
1733 else
1734 new_cmp_str = seqan::infix(*ref_str2, lb_right, curr_hit->left());
1735
1736 if (fusion_dir == FUSION_RF)
1737 {
1738 old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb_left);
1739 seqan::reverseComplement(old_cmp_str);
1740 }
1741 else
1742 old_cmp_str = seqan::infix(*ref_str, lb_left + 1, prev_hit->right());
1743
1744 string prev_hit_seq;
1745 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1746 prev_hit_seq = prev_hit->seq();
1747 else
1748 prev_hit_seq = read_seq.substr((curr_seg_index - 1) * segment_length, segment_length);
1749
1750 size_t abs_dist = -dist_to_left;
1751 string new_seq;
1752
1753 const string& prev_old_seq = prev_hit_seq;
1754 size_t prev_old_seq_len = prev_old_seq.length();
1755 const string& prev_seq = color ? new_seq : prev_hit_seq;
1756 size_t prev_seq_len = prev_seq.length();
1757 for (size_t i = 0; i < abs_dist; ++i)
1758 {
1759 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1760 ++new_mismatch;
1761 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1762 ++old_mismatch;
1763 }
1764 }
1765
1766 int temp_diff_mismatches = new_mismatch - old_mismatch;
1767 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1768 {
1769 ++lb;
1770 continue;
1771 }
1772 new_diff_mismatches = temp_diff_mismatches;
1773
1774 new_left = prev_hit->left();
1775 new_cigar = prev_hit->cigar();
1776
1777 int new_left_back_len = new_cigar.back().length;
1778 new_left_back_len += dist_to_left;
1779
1780 vector<CigarOp> new_right_cig = curr_hit->cigar();
1781 int new_right_front_len = new_right_cig.front().length;
1782 new_right_front_len -= dist_to_right;
1783 if (new_left_back_len > 0)
1784 new_cigar.back().length = new_left_back_len;
1785 else
1786 new_cigar.pop_back();
1787
1788 new_cigar.push_back(CigarOp((CigarOpCode)fusion_dir, lb_right));
1789 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1790
1791 new_right_cig.front().length = new_right_front_len;
1792 size_t c = new_right_front_len > 0 ? 0 : 1;
1793 for (; c < new_right_cig.size(); ++c)
1794 new_cigar.push_back(new_right_cig[c]);
1795
1796 mismatch = new_diff_mismatches;
1797 found_closure = true;
1798 ++num_fusions;
1799
1800 // daehwan
1801 if (bDebug)
1802 {
1803 cout << "daehwan - fusion gap - found" << endl;
1804 }
1805
1806 }
1807 ++lb;
1808 }
1809
1810 // daehwan
1811 if (bDebug)
1812 {
1813 cout << "daehwan2 - end - fusion: " << (found_closure ? "found" : "not found") << endl;
1814 }
1815
1816 if (!found_closure)
1817 {
1818 return BowtieHit();
1819 }
1820 }
1821
1822 if (found_closure)
1823 {
1824 bool end = false;
1825 int mismatches = prev_hit->mismatches() + curr_hit->mismatches() + mismatch;
1826 BowtieHit merged_hit(prev_hit->ref_id(),
1827 curr_hit->ref_id2(),
1828 insert_id,
1829 new_left,
1830 new_cigar,
1831 antisense,
1832 antisense_closure,
1833 mismatches,
1834 mismatches + gap_length(new_cigar),
1835 prev_hit->splice_mms() + curr_hit->splice_mms(),
1836 end);
1837
1838 // daehwan - should fix this for SOLiD dataset
1839 merged_hit.seq(prev_hit->seq() + curr_hit->seq());
1840
1841 // daehwan
1842 if (bDebug)
1843 {
1844 cout << "fusing of " << merged_hit.left() << " and " << merged_hit.right() << endl;
1845 cout << print_cigar(merged_hit.cigar()) << endl;
1846 if (!merged_hit.check_editdist_consistency(rt, bDebug))
1847 {
1848 prev_hit->check_editdist_consistency(rt, bDebug);
1849 curr_hit->check_editdist_consistency(rt, bDebug);
1850 cout << "btw " << print_cigar(prev_hit->cigar()) << " and " << print_cigar(curr_hit->cigar()) << endl;
1851 cout << "this is a malformed hit" << endl;
1852 exit(1);
1853 }
1854 }
1855
1856 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
1857 /*
1858 * prev_hit now points PAST the last element removed
1859 */
1860 prev_hit = hit_chain.insert(prev_hit, merged_hit);
1861 /*
1862 * merged_hit has been inserted before the old position of
1863 * prev_hit. New location of prev_hit is merged_hit
1864 */
1865 curr_hit = prev_hit;
1866 ++curr_hit;
1867 ++curr_seg_index;
1868 continue;
1869 }
1870
1871 // daehwan
1872 if (bDebug)
1873 {
1874 cout << "daehwan - test 0.3" << endl;
1875 }
1876
1877 ++prev_hit;
1878 ++curr_hit;
1879 ++curr_seg_index;
1880 }
1881
1882 // daehwan
1883 if (bDebug)
1884 {
1885 cout << "daehwan - test2" << endl;
1886 }
1887
1888 bool saw_antisense_splice = false;
1889 bool saw_sense_splice = false;
1890 vector<CigarOp> long_cigar;
1891 int num_mismatches = 0;
1892 int num_splice_mms = 0;
1893 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
1894 {
1895 num_mismatches += s->mismatches();
1896 num_splice_mms += s->splice_mms();
1897
1898 /*
1899 * Check whether the sequence contains any reference skips. Previously,
1900 * this was just a check to see whether the sequence was contiguous; however
1901 * we don't want to count an indel event as a splice
1902 */
1903 bool containsSplice = s->is_spliced();
1904 if (containsSplice)
1905 {
1906 if (s->antisense_splice())
1907 {
1908 if (saw_sense_splice)
1909 return BowtieHit();
1910 saw_antisense_splice = true;
1911 }
1912 else
1913 {
1914 if (saw_antisense_splice)
1915 return BowtieHit();
1916 saw_sense_splice = true;
1917 }
1918 }
1919 const vector<CigarOp>& cigar = s->cigar();
1920 if (long_cigar.empty())
1921 {
1922 long_cigar = cigar;
1923 }
1924 else
1925 {
1926 CigarOp& last = long_cigar.back();
1927 /*
1928 * If necessary, merge the back and front
1929 * cigar operations
1930 */
1931 if(last.opcode == cigar[0].opcode){
1932 last.length += cigar[0].length;
1933 for (size_t b = 1; b < cigar.size(); ++b)
1934 {
1935 long_cigar.push_back(cigar[b]);
1936 }
1937 }else{
1938 for(size_t b = 0; b < cigar.size(); ++b)
1939 {
1940 long_cigar.push_back(cigar[b]);
1941 }
1942 }
1943 }
1944 }
1945
1946 bool end = false;
1947 BowtieHit new_hit(hit_chain.front().ref_id(),
1948 hit_chain.back().ref_id2(),
1949 insert_id,
1950 left,
1951 long_cigar,
1952 antisense,
1953 saw_antisense_splice,
1954 num_mismatches,
1955 num_mismatches + gap_length(long_cigar),
1956 num_splice_mms,
1957 end);
1958
1959 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1960 {
1961 new_hit.seq(seq);
1962 if (bowtie2)
1963 {
1964 // for the time being, let's compare "seq" and "read_seq"
1965 if (seq != read_seq)
1966 {
1967 string temp_qual = read_quals;
1968 reverse(temp_qual.begin(), temp_qual.end());
1969 new_hit.qual(temp_qual);
1970 }
1971 else
1972 new_hit.qual(read_quals);
1973 }
1974 else
1975 new_hit.qual(qual);
1976 }
1977 else
1978 {
1979 new_hit.seq(read_seq);
1980 new_hit.qual(read_quals);
1981 }
1982
1983 bool do_reverse = new_hit.ref_id() > new_hit.ref_id2();
1984 if (new_hit.ref_id() == new_hit.ref_id2())
1985 {
1986 vector<Fusion> fusions;
1987 bool auto_sort = false;
1988 fusions_from_spliced_hit(new_hit, fusions, auto_sort);
1989 if (fusions.size() > 0)
1990 {
1991 const Fusion& fusion = fusions[0];
1992 do_reverse = fusion.left > fusion.right;
1993 }
1994 }
1995
1996 if (do_reverse)
1997 {
1998 new_hit = new_hit.reverse();
1999 }
2000
2001 if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2002 {
2003 new_hit.antisense_align(!new_hit.antisense_align());
2004 }
2005
2006 // daehwan
2007 if (bDebug)
2008 {
2009 cout << "daehwan - test3" << endl;
2010 cout << new_hit.left() << " " << print_cigar(new_hit.cigar()) << endl;
2011 cout << new_hit.ref_id() << "-" << new_hit.ref_id2() << ": " << new_hit.fusion_opcode() << endl;
2012 }
2013
2014 int new_read_len = new_hit.read_len();
2015 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt, bDebug))
2016 {
2017 // daehwan
2018 if (bDebug)
2019 {
2020 cout << "Warning: " << new_hit.insert_id() << " malformed closure: " << print_cigar(new_hit.cigar()) << endl;
2021 exit(1);
2022 }
2023
2024 fprintf(stderr, "Warning: %d malformed closure\n", new_hit.insert_id());
2025 return BowtieHit();
2026 }
2027
2028 return new_hit;
2029 }
2030
2031
2032 int multi_closure = 0;
2033 int anchor_too_short = 0;
2034 int gap_too_short = 0;
2035
2036 bool valid_hit(const BowtieHit& bh)
2037 {
2038 if (bh.insert_id())
2039 {
2040 /*
2041 * validate the cigar chain - no gaps shorter than an intron, etc.
2042 * also,
2043 * -Don't start or end with an indel or refskip
2044 * -Only a match operation is allowed is allowed
2045 * adjacent to an indel or refskip
2046 * -Indels should confrom to length restrictions
2047 */
2048 const CigarOp* prevCig = &(bh.cigar()[0]);
2049 const CigarOp* currCig = &(bh.cigar()[1]);
2050 for (size_t i = 1; i < bh.cigar().size(); ++i){
2051 currCig = &(bh.cigar()[i]);
2052 if(!(currCig->opcode == MATCH || currCig->opcode == mATCH) &&
2053 !(prevCig->opcode == MATCH || prevCig->opcode == mATCH)){
2054 return false;
2055 }
2056 if(currCig->opcode == INS || currCig->opcode == iNS){
2057 if(currCig->length > max_insertion_length){
2058 return false;
2059 }
2060 }
2061 if(currCig->opcode == DEL || currCig->opcode == dEL){
2062 if(currCig->length > max_deletion_length){
2063 return false;
2064 }
2065 }
2066 if(currCig->opcode == REF_SKIP || currCig->opcode == rEF_SKIP){
2067 if(currCig->length < (uint64_t)min_report_intron_length){
2068 gap_too_short++;
2069 return false;
2070 }
2071 }
2072 prevCig = currCig;
2073 }
2074 if (!(bh.cigar().front().opcode == MATCH || bh.cigar().front().opcode == mATCH) ||
2075 !(bh.cigar().back().opcode == MATCH || bh.cigar().back().opcode == mATCH)/* ||
2076 (int)bh.cigar().front().length < min_anchor_len||
2077 (int)bh.cigar().back().length < min_anchor_len*/ )
2078 {
2079 anchor_too_short++;
2080 return false;
2081 }
2082 }
2083 else
2084 {
2085 multi_closure++;
2086 return false;
2087 }
2088
2089 return true;
2090 }
2091
2092 void merge_segment_chain(RefSequenceTable& rt,
2093 const string& read_seq,
2094 const string& read_quals,
2095 std::set<Junction>& possible_juncs,
2096 std::set<Insertion>& possible_insertions,
2097 std::set<Fusion>& possible_fusions,
2098 vector<BowtieHit>& hits,
2099 vector<BowtieHit>& merged_hits,
2100 int fusion_dir = FUSION_NOTHING)
2101 {
2102 if (hits.size() == 0)
2103 return;
2104
2105 BowtieHit bh;
2106 if (hits.size() > 1)
2107 {
2108 list<BowtieHit> hit_chain;
2109 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
2110 {
2111 if (hits.front().antisense_align())
2112 copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
2113 else
2114 copy(hits.begin(), hits.end(), back_inserter(hit_chain));
2115 }
2116 else
2117 {
2118 bool bSawFusion = false;
2119 for (size_t i = 0; i < hits.size(); ++i)
2120 {
2121 bool pushed = false;
2122 if (!bSawFusion)
2123 {
2124 if (i > 0)
2125 {
2126 if (hits[i-1].ref_id() != hits[i].ref_id())
2127 bSawFusion = true;
2128 else if(hits[i-1].antisense_align() != hits[i].antisense_align())
2129 bSawFusion = true;
2130 else
2131 {
2132 int dist = 0;
2133 if (hits[i].antisense_align())
2134 dist = hits[i-1].left() - hits[i].right();
2135 else
2136 dist = hits[i].left() - hits[i-1].right();
2137
2138 if (dist >= max_report_intron_length || dist < -(int)max_insertion_length)
2139 bSawFusion = true;
2140 }
2141 }
2142 }
2143
2144 if (hits[i].fusion_opcode() == FUSION_NOTHING &&
2145 ((fusion_dir == FUSION_FR && bSawFusion) || (fusion_dir == FUSION_RF && !bSawFusion)) &&
2146 hits[i].left() < hits[i].right())
2147 {
2148 hit_chain.push_back(hits[i].reverse());
2149 pushed = true;
2150 }
2151
2152 if (i > 0 &&
2153 hits[i].fusion_opcode() != FUSION_NOTHING &&
2154 hits[i].ref_id() != hits[i-1].ref_id())
2155 {
2156 hit_chain.push_back(hits[i].reverse());
2157 pushed = true;
2158 }
2159
2160 if (!bSawFusion)
2161 {
2162 if (hits[i].fusion_opcode() != FUSION_NOTHING)
2163 bSawFusion = true;
2164 }
2165
2166 if (!pushed)
2167 hit_chain.push_back(hits[i]);
2168 }
2169 }
2170
2171 // todo: merge_chain_color needs to be merged into merge_chain fuction.
2172 if (color)
2173 bh = merge_chain_color(rt,
2174 read_seq,
2175 read_quals,
2176 possible_juncs,
2177 possible_insertions,
2178 hit_chain);
2179 else
2180 bh = merge_chain(rt,
2181 read_seq,
2182 read_quals,
2183 possible_juncs,
2184 possible_insertions,
2185 possible_fusions,
2186 hit_chain,
2187 fusion_dir);
2188 }
2189 else
2190 {
2191 bh = hits[0];
2192 bool do_reverse = bh.ref_id() > bh.ref_id2();
2193 if (bh.ref_id() == bh.ref_id2())
2194 {
2195 vector<Fusion> fusions;
2196 bool auto_sort = false;
2197 fusions_from_spliced_hit(bh, fusions, auto_sort);
2198 if (fusions.size() > 0)
2199 {
2200 const Fusion& fusion = fusions[0];
2201 do_reverse = fusion.left > fusion.right;
2202 }
2203 }
2204
2205 if (do_reverse)
2206 bh = bh.reverse();
2207 }
2208
2209 if (valid_hit(bh))
2210 merged_hits.push_back(bh);
2211 }
2212
2213 bool dfs_seg_hits(RefSequenceTable& rt,
2214 const string& read_seq,
2215 const string& read_quals,
2216 std::set<Junction>& possible_juncs,
2217 std::set<Insertion>& possible_insertions,
2218 std::set<Fusion>& possible_fusions,
2219 vector<HitsForRead>& seg_hits_for_read,
2220 size_t curr,
2221 vector<BowtieHit>& seg_hit_stack,
2222 vector<BowtieHit>& joined_hits,
2223 int& num_try,
2224 int fusion_dir = FUSION_NOTHING)
2225 {
2226 if (num_try <= 0)
2227 return false;
2228
2229 assert (!seg_hit_stack.empty());
2230 bool join_success = false;
2231
2232 if (curr < seg_hits_for_read.size())
2233 {
2234 for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
2235 {
2236 /*
2237 * As we reverse segments depending on directions like FR or RF,
2238 * it's necessary to recover the original segments.
2239 */
2240 BowtieHit bh = seg_hits_for_read[curr].hits[i];
2241 BowtieHit bh_prev = seg_hit_stack.back();
2242
2243 BowtieHit* prevHit = &bh_prev;
2244 BowtieHit* currHit = &bh;
2245
2246 // daehwan - for debugging purposes
2247 // if (bh.insert_id() == 792140)
2248 // bDebug = true;
2249
2250 /*
2251 * Each segment has at most one fusion by assumption,
2252 */
2253 bool prevHit_fused = prevHit->fusion_opcode() != FUSION_NOTHING;
2254 bool currHit_fused = currHit->fusion_opcode() != FUSION_NOTHING;
2255
2256 /*
2257 * Count the number of fusions on prev and curr segments,
2258 * but this doesn't take into account the gap (if exists) between the two segments.
2259 */
2260 size_t num_fusions = prevHit_fused ? 1 : 0;
2261 num_fusions += currHit_fused ? 1 : 0;
2262
2263 int dir = prevHit_fused ? prevHit->fusion_opcode() : currHit->fusion_opcode();
2264
2265 /*
2266 * We don't allow reads that span more than two fusion points.
2267 */
2268 if (num_fusions >= 2)
2269 continue;
2270
2271 if (fusion_dir != FUSION_NOTHING && currHit_fused)
2272 continue;
2273
2274 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
2275 {
2276 if ((currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id()) ||
2277 (!currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id2()))
2278 continue;
2279 }
2280
2281 if (bDebug)
2282 {
2283 cout << "daehwan - prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": "
2284 << print_cigar(prevHit->cigar()) << endl;
2285 cout << "daehwan - prev sense: "
2286 << (prevHit->antisense_align() ? "-" : "+")
2287 << "\t"
2288 << (prevHit->antisense_align2() ? "-" : "+")
2289 << endl;
2290 cout << "daehwan - prev coords: "
2291 << prevHit->left()
2292 << "\t"
2293 << prevHit->right()
2294 << endl;
2295
2296 cout << "daehwan - curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": "
2297 << print_cigar(currHit->cigar()) << endl;
2298 cout << "daehwan - curr sense: "
2299 << (currHit->antisense_align() ? "-" : "+")
2300 << "\t"
2301 << (currHit->antisense_align2() ? "-" : "+")
2302 << endl;
2303 cout << "daehwan - curr coords: "
2304 << currHit->left()
2305 << "\t"
2306 << currHit->right()
2307 << endl;
2308 }
2309
2310 if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF) &&
2311 prevHit->ref_id2() != currHit->ref_id())
2312 continue;
2313
2314 if ((fusion_dir == FUSION_FR && !currHit->antisense_align()) ||
2315 (fusion_dir == FUSION_RF && currHit->antisense_align()))
2316 continue;
2317
2318 if (currHit_fused && dir == FUSION_RR)
2319 *currHit = currHit->reverse();
2320
2321 if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RF ||
2322 (currHit_fused && currHit->ref_id() == currHit->ref_id2() && (dir == FUSION_FR || dir == FUSION_RF)))
2323 {
2324 if (currHit_fused)
2325 {
2326 if ((dir == FUSION_FR && currHit->antisense_align()) ||
2327 (dir == FUSION_RF && !currHit->antisense_align()))
2328 *currHit = currHit->reverse();
2329 }
2330 else
2331 {
2332 if (fusion_dir == FUSION_FR && currHit->antisense_align())
2333 *currHit = currHit->reverse();
2334 }
2335 }
2336 /*
2337 * Switch prevHit and currHit in FUSION_NOTHING and FUSION_FF cases
2338 * to make it easier to check the distance in the gap between the two segments.
2339 */
2340 else if ((num_fusions == 0 && prevHit->antisense_align() && currHit->antisense_align() && prevHit->ref_id() == currHit->ref_id() && prevHit->left() <= currHit->right() + (int)max_report_intron_length && prevHit->left() + (int)max_insertion_length >= currHit->right()) ||
2341 (num_fusions == 1 && (dir == FUSION_FF || dir == FUSION_RR)&&
2342 ((!prevHit_fused && prevHit->antisense_align()) || (!currHit_fused && currHit->antisense_align())))
2343 )
2344 {
2345 BowtieHit* tempHit = prevHit;
2346 prevHit = currHit;
2347 currHit = tempHit;
2348 }
2349 else if (num_fusions == 0)
2350 {
2351 if (prevHit->ref_id2() == currHit->ref_id() && prevHit->antisense_align() == currHit->antisense_align())
2352 {
2353 int dist = 0;
2354 if (prevHit->antisense_align())
2355 dist = prevHit->left() - currHit->right();
2356 else
2357 dist = currHit->left() - prevHit->right();
2358
2359 if (dist > max_report_intron_length || dist < -(int)max_insertion_length)
2360 {
2361 if ((prevHit->antisense_align() && prevHit->left() > currHit->left()) ||
2362 (!prevHit->antisense_align() && prevHit->left() < currHit->left()))
2363 dir = FUSION_FF;
2364 else
2365 dir = FUSION_RR;
2366 }
2367 }
2368 else
2369 {
2370 if (prevHit->antisense_align() == currHit->antisense_align())
2371 {
2372 if ((prevHit->antisense_align() && prevHit->ref_id() > currHit->ref_id()) ||
2373 (!prevHit->antisense_align() && prevHit->ref_id() < currHit->ref_id()))
2374 dir = FUSION_FF;
2375 else
2376 dir = FUSION_RR;
2377 }
2378 else if (!prevHit->antisense_align())
2379 dir = FUSION_FR;
2380 else
2381 dir = FUSION_RF;
2382
2383 if (dir == FUSION_FR)
2384 *currHit = currHit->reverse();
2385 else if(dir == FUSION_RF)
2386 *prevHit = prevHit->reverse();
2387 }
2388 }
2389
2390 // daehwan - test
2391 if (bDebug)
2392 {
2393 cout << "insert id: " << prevHit->insert_id() << endl;
2394 cout << "(" << curr - 1 << ") prev: " << prevHit->seq() << " : " << (prevHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
2395 cout << "(" << curr << ") curr: " << currHit->seq() << " : " << (currHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
2396 cout << "prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl;
2397 cout << "curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl;
2398 cout << "prev coords: "
2399 << prevHit->left()
2400 << "\t"
2401 << prevHit->right()
2402 << endl;
2403 cout << "curr corrds: "
2404 << currHit->left()
2405 << "\t"
2406 << currHit->right()
2407 << endl;
2408 cout << "prev sense: "
2409 << (prevHit->antisense_align() ? "-" : "+")
2410 << "\t"
2411 << (prevHit->antisense_align2() ? "-" : "+")
2412 << endl;
2413 cout << "curr sense: "
2414 << (currHit->antisense_align() ? "-" : "+")
2415 << "\t"
2416 << (currHit->antisense_align2() ? "-" : "+")
2417 << endl;
2418 }
2419
2420 if (num_fusions == 1)
2421 {
2422 // daehwan
2423 if (bDebug)
2424 {
2425 cout << "direction: " << (int)dir << endl;
2426 }
2427
2428 /*
2429 * orient the fused segment, which depends on a fusion direction.
2430 */
2431 if (dir != FUSION_FF && dir != FUSION_RR)
2432 {
2433 bool prevHit_rep = false;
2434 bool currHit_rep = false;
2435
2436 if (prevHit_fused)
2437 {
2438 if ((dir == FUSION_FR && !currHit->antisense_align()) ||
2439 (dir == FUSION_RF && currHit->antisense_align()))
2440 continue;
2441
2442 if (prevHit->ref_id2() != currHit->ref_id())
2443 prevHit_rep = true;
2444 else if ((dir == FUSION_FR && prevHit->antisense_align()) ||
2445 (dir == FUSION_RF && !prevHit->antisense_align()))
2446 prevHit_rep = true;
2447 }
2448
2449 if (currHit_fused)
2450 {
2451 if ((dir == FUSION_FR && prevHit->antisense_align()) ||
2452 (dir == FUSION_RF && !prevHit->antisense_align()))
2453 continue;
2454
2455 if (currHit->ref_id() != prevHit->ref_id2())
2456 currHit_rep = true;
2457 }
2458
2459 if (bDebug)
2460 {
2461 if (prevHit_rep) cout << "1. reversed in prev" << endl;
2462 if (currHit_rep) cout << "1. reversed in curr" << endl;
2463 }
2464
2465 if (prevHit_rep)
2466 *prevHit = prevHit->reverse();
2467 if (currHit_rep)
2468 *currHit = currHit->reverse();
2469
2470 prevHit_rep = false;
2471 currHit_rep = false;
2472
2473 if (prevHit_fused)
2474 {
2475 if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
2476 currHit_rep = true;
2477 }
2478 else
2479 {
2480 if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
2481 prevHit_rep = true;
2482 }
2483
2484 if (prevHit_rep)
2485 *prevHit = prevHit->reverse();
2486 if (currHit_rep)
2487 *currHit = currHit->reverse();
2488
2489 // daehwan
2490 if (bDebug)
2491 {
2492 if (prevHit_rep) cout << "2. reversed in prev" << endl;
2493 if (currHit_rep) cout << "2. reversed in curr" << endl;
2494 }
2495 }
2496 }
2497
2498 bool same_contig = prevHit->ref_id2() == currHit->ref_id();
2499 if (!same_contig && num_fusions > 0)
2500 continue;
2501
2502 if (same_contig && num_fusions >= 1)
2503 {
2504 if (prevHit->antisense_align2() != currHit->antisense_align())
2505 continue;
2506 }
2507
2508 int bh_l = 0, back_right = 0, dist = 0;
2509 if (same_contig)
2510 {
2511 if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || dir == FUSION_FR || dir == FUSION_RF) &&
2512 prevHit->antisense_align2())
2513 {
2514 bh_l = prevHit->right() + 1;
2515 back_right = currHit->left() + 1;
2516 }
2517 else
2518 {
2519 bh_l = currHit->left();
2520 back_right = prevHit->right();
2521 }
2522
2523 dist = bh_l - back_right;
2524 }
2525
2526 // daehwan - pass
2527 if (bDebug)
2528 {
2529 cout << "daehwan - pass" << endl;
2530 cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl;
2531 cout << "curr coords: " << currHit->left() << "\t" << currHit->right() << endl;
2532 }
2533
2534 if (!same_contig ||
2535 (same_contig && num_fusions == 0 && dir != FUSION_NOTHING && fusion_dir == FUSION_NOTHING) ||
2536 (same_contig && dist <= max_report_intron_length && dist >= -(int)max_insertion_length && prevHit->is_forwarding_right() == currHit->is_forwarding_left()))
2537 {
2538 // daehwan
2539 if (bDebug)
2540 {
2541 cout << "daehwan - really passed!!" << endl;
2542 }
2543
2544 BowtieHit tempHit = seg_hit_stack.back();
2545 seg_hit_stack.back() = bh_prev;
2546
2547 // these hits are compatible, so push bh onto the
2548 // stack, recurse, and pop it when done.
2549 seg_hit_stack.push_back(bh);
2550 bool success = dfs_seg_hits(rt,
2551 read_seq,
2552 read_quals,
2553 possible_juncs,
2554 possible_insertions,
2555 possible_fusions,
2556 seg_hits_for_read,
2557 curr + 1,
2558 seg_hit_stack,
2559 joined_hits,
2560 num_try,
2561 dir == FUSION_NOTHING ? fusion_dir : dir);
2562
2563 if (success)
2564 join_success = true;
2565
2566 seg_hit_stack.pop_back();
2567 seg_hit_stack.back() = tempHit;
2568 }
2569 }
2570 }
2571 else
2572 {
2573 --num_try;
2574 merge_segment_chain(rt,
2575 read_seq,
2576 read_quals,
2577 possible_juncs,
2578 possible_insertions,
2579 possible_fusions,
2580 seg_hit_stack,
2581 joined_hits,
2582 fusion_dir);
2583
2584 return join_success = true;
2585 }
2586 return join_success;
2587 }
2588
2589 bool join_segments_for_read(RefSequenceTable& rt,
2590 const string& read_seq,
2591 const string& read_quals,
2592 std::set<Junction>& possible_juncs,
2593 std::set<Insertion>& possible_insertions,
2594 std::set<Fusion>& possible_fusions,
2595 vector<HitsForRead>& seg_hits_for_read,
2596 vector<BowtieHit>& joined_hits)
2597 {
2598 vector<BowtieHit> seg_hit_stack;
2599 bool join_success = false;
2600
2601 // ignore segments that map to more than this many places.
2602 if (bowtie2)
2603 {
2604 for (size_t s = 0; s < seg_hits_for_read.size(); ++s)
2605 {
2606 if (seg_hits_for_read[s].hits.size() > max_seg_multihits)
2607 return join_success;
2608 }
2609 }
2610
2611 for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
2612 {
2613 BowtieHit& bh = seg_hits_for_read[0].hits[i];
2614
2615 // daehwan - remove this
2616 //if (bh.insert_id() == 16487)
2617 // bDebug = true;
2618
2619 if (bh.fusion_opcode() == FUSION_RR)
2620 seg_hit_stack.push_back(bh.reverse());
2621 else
2622 seg_hit_stack.push_back(bh);
2623
2624 const int max_try = 1000;
2625 int num_try = max_try;
2626 bool success = dfs_seg_hits(rt,
2627 read_seq,
2628 read_quals,
2629 possible_juncs,
2630 possible_insertions,
2631 possible_fusions,
2632 seg_hits_for_read,
2633 1,
2634 seg_hit_stack,
2635 joined_hits,
2636 num_try);
2637 if (success)
2638 join_success = true;
2639 seg_hit_stack.pop_back();
2640 }
2641
2642 return join_success;
2643 }
2644
2645 struct JoinSegmentsWorker
2646 {
2647 void operator()()
2648 {
2649 ReadTable it;
2650 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header_fname.c_str(), bam_output_fname + ".index");
2651 ReadStream readstream(reads_fname);
2652 if (readstream.file() == NULL)
2653 err_die("Error: cannot open %s for reading\n", reads_fname.c_str());
2654
2655 if (read_offset > 0)
2656 readstream.seek(read_offset);
2657
2658 bool need_seq = true;
2659 bool need_qual = true;
2660
2661 vector<HitStream> contig_hits;
2662 vector<HitStream> spliced_hits;
2663 vector<HitFactory*> factories;
2664 for (size_t i = 0; i < segmap_fnames.size(); ++i)
2665 {
2666 HitFactory* fac = new BAMHitFactory(it, *rt);
2667 factories.push_back(fac);
2668 HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
2669
2670 if (seg_offsets[i] > 0)
2671 hs.seek(seg_offsets[i]);
2672
2673 contig_hits.push_back(hs);
2674 }
2675
2676 for (size_t i = 0; i < spliced_segmap_fnames.size(); ++i)
2677 {
2678 int anchor_length = 0;
2679 HitFactory* fac = new SplicedBAMHitFactory(it, *rt, anchor_length);
2680 factories.push_back(fac);
2681
2682 HitStream hs(spliced_segmap_fnames[i], fac, true, false, false, need_seq, need_qual);
2683 if (spliced_seg_offsets[i] > 0)
2684 hs.seek(spliced_seg_offsets[i]);
2685
2686 spliced_hits.push_back(hs);
2687 }
2688
2689 uint32_t curr_contig_obs_order = VMAXINT32;
2690 HitStream* first_seg_contig_stream = NULL;
2691 uint64_t next_contig_id = 0;
2692
2693 if (contig_hits.size() > 0)
2694 {
2695 first_seg_contig_stream = &(contig_hits.front());
2696 next_contig_id = first_seg_contig_stream->next_group_id();
2697 curr_contig_obs_order = it.observation_order(next_contig_id);
2698 }
2699
2700 HitsForRead curr_hit_group;
2701
2702 uint32_t curr_spliced_obs_order = VMAXINT32;
2703 HitStream* first_seg_spliced_stream = NULL;
2704 uint64_t next_spliced_id = 0;
2705
2706 if (spliced_hits.size() > 0)
2707 {
2708 first_seg_spliced_stream = &(spliced_hits.front());
2709 next_spliced_id = first_seg_spliced_stream->next_group_id();
2710 curr_spliced_obs_order = it.observation_order(next_spliced_id);
2711 }
2712
2713 while((curr_contig_obs_order != VMAXINT32 || curr_spliced_obs_order != VMAXINT32) &&
2714 (curr_contig_obs_order < end_id || curr_spliced_obs_order < end_id))
2715 {
2716 uint32_t read_in_process;
2717 vector<HitsForRead> seg_hits_for_read;
2718 seg_hits_for_read.resize(contig_hits.size());
2719
2720 if (curr_contig_obs_order < curr_spliced_obs_order)
2721 {
2722 first_seg_contig_stream->next_read_hits(curr_hit_group);
2723 seg_hits_for_read.front() = curr_hit_group;
2724
2725 next_contig_id = first_seg_contig_stream->next_group_id();
2726 uint32_t next_order = it.observation_order(next_contig_id);
2727
2728 read_in_process = curr_contig_obs_order;
2729 curr_contig_obs_order = next_order;
2730 }
2731 else if (curr_spliced_obs_order < curr_contig_obs_order)
2732 {
2733 first_seg_spliced_stream->next_read_hits(curr_hit_group);
2734 seg_hits_for_read.front() = curr_hit_group;
2735
2736 next_spliced_id = first_seg_spliced_stream->next_group_id();
2737 uint32_t next_order = it.observation_order(next_spliced_id);
2738
2739 read_in_process = curr_spliced_obs_order;
2740 curr_spliced_obs_order = next_order;
2741
2742 if (read_in_process < begin_id)
2743 continue;
2744 }
2745 else if (curr_contig_obs_order == curr_spliced_obs_order &&
2746 curr_contig_obs_order != VMAXINT32 &&
2747 curr_spliced_obs_order != VMAXINT32)
2748 {
2749 first_seg_contig_stream->next_read_hits(curr_hit_group);
2750
2751 HitsForRead curr_spliced_group;
2752 first_seg_spliced_stream->next_read_hits(curr_spliced_group);
2753
2754 curr_hit_group.hits.insert(curr_hit_group.hits.end(),
2755 curr_spliced_group.hits.begin(),
2756 curr_spliced_group.hits.end());
2757 seg_hits_for_read.front() = curr_hit_group;
2758 read_in_process = curr_spliced_obs_order;
2759
2760 next_contig_id = first_seg_contig_stream->next_group_id();
2761 uint32_t next_order = it.observation_order(next_contig_id);
2762
2763 next_spliced_id = first_seg_spliced_stream->next_group_id();
2764 uint32_t next_spliced_order = it.observation_order(next_spliced_id);
2765
2766 curr_spliced_obs_order = next_spliced_order;
2767 curr_contig_obs_order = next_order;
2768 }
2769 else
2770 {
2771 break;
2772 }
2773
2774 if (contig_hits.size() > 1)
2775 {
2776 look_right_for_hit_group(it,
2777 contig_hits,
2778 0,
2779 spliced_hits,
2780 curr_hit_group,
2781 seg_hits_for_read);
2782 }
2783
2784 size_t last_non_empty = seg_hits_for_read.size() - 1;
2785 while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
2786 {
2787 --last_non_empty;
2788 }
2789
2790 seg_hits_for_read.resize(last_non_empty + 1);
2791 if (!seg_hits_for_read[last_non_empty].hits[0].end())
2792 continue;
2793
2794 if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
2795 {
2796 uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
2797 if (insert_id >= begin_id && insert_id < end_id)
2798 {
2799 Read read;
2800 if (readstream.getRead(insert_id, read))
2801 {
2802 vector<BowtieHit> joined_hits;
2803 join_segments_for_read(*rt,
2804 read.seq.c_str(),
2805 read.qual.c_str(),
2806 *possible_juncs,
2807 *possible_insertions,
2808 *possible_fusions,
2809 seg_hits_for_read,
2810 joined_hits);
2811
2812 sort(joined_hits.begin(), joined_hits.end());
2813 vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
2814 joined_hits.erase(new_end, joined_hits.end());
2815 for (size_t i = 0; i < joined_hits.size(); i++)
2816 {
2817 if (joined_hits[i].mismatches() > read_mismatches ||
2818 joined_hits[i].gap_length() > read_gap_length ||
2819 joined_hits[i].edit_dist() > read_edit_dist)
2820 continue;
2821
2822 const char* ref_name = rt->get_name(joined_hits[i].ref_id());
2823 const char* ref_name2 = "";
2824 if (joined_hits[i].fusion_opcode() != FUSION_NOTHING)
2825 ref_name2 = rt->get_name(joined_hits[i].ref_id2());
2826
2827 vector<string> extra_fields;
2828
2829 if (!color)
2830 bowtie_sam_extra(joined_hits[i], *rt, extra_fields);
2831
2832 if (color)
2833 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2834 joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true, &extra_fields);
2835 else
2836 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2837 read.seq.c_str(), read.qual.c_str(), false, &extra_fields);
2838 }
2839 }
2840 else
2841 {
2842 err_die("Error: could not get read # %d from stream\n",
2843 read_in_process);
2844 }
2845 }
2846 }
2847 else
2848 {
2849 //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
2850 }
2851 }
2852
2853 for (size_t fac = 0; fac < factories.size(); ++fac)
2854 {
2855 delete factories[fac];
2856 }
2857 factories.clear();
2858 }
2859
2860 string bam_output_fname;
2861 string sam_header_fname;
2862 string reads_fname;
2863
2864 vector<string> segmap_fnames;
2865 vector<string> spliced_segmap_fnames;
2866
2867 std::set<Junction>* possible_juncs;
2868 std::set<Insertion>* possible_insertions;
2869 std::set<Fusion>* possible_fusions;
2870
2871 RefSequenceTable* rt;
2872
2873 uint64_t begin_id;
2874 uint64_t end_id;
2875 int64_t read_offset;
2876 vector<int64_t> seg_offsets;
2877 vector<int64_t> spliced_seg_offsets;
2878 };
2879
2880 void driver(const string& bam_output_fname,
2881 istream& ref_stream,
2882 vector<FILE*>& possible_juncs_files,
2883 vector<FILE*>& possible_insertions_files,
2884 vector<FILE*>& possible_deletions_files,
2885 vector<FILE*>& possible_fusions_files,
2886 vector<string>& spliced_segmap_fnames, //.bam files
2887 vector<string>& segmap_fnames, //.bam files
2888 const string& reads_fname)
2889 {
2890 if (!parallel)
2891 num_threads = 1;
2892
2893 if (segmap_fnames.size() == 0)
2894 {
2895 fprintf(stderr, "No hits to process, exiting\n");
2896 exit(0);
2897 }
2898
2899 RefSequenceTable rt(sam_header, true);
2900 fprintf (stderr, "Loading reference sequences...\n");
2901 get_seqs(ref_stream, rt, true);
2902 fprintf (stderr, " reference sequences loaded.\n");
2903 fprintf(stderr, "Loading junctions...");
2904
2905 std::set<Junction> possible_juncs;
2906
2907 for (size_t i = 0; i < possible_juncs_files.size(); ++i)
2908 {
2909 char buf[2048];
2910 while(!feof(possible_juncs_files[i]) &&
2911 fgets(buf, sizeof(buf), possible_juncs_files[i]))
2912 {
2913 char junc_ref_name[256];
2914 int left;
2915 int right;
2916 char orientation;
2917 int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
2918 if (ret != 4)
2919 continue;
2920 uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
2921 possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
2922 }
2923 }
2924 fprintf(stderr, "done\n");
2925
2926 fprintf(stderr, "Loading deletions...");
2927
2928 for (size_t i = 0; i < possible_deletions_files.size(); ++i)
2929 {
2930 char splice_buf[2048];
2931 FILE* deletions_file = possible_deletions_files[i];
2932 if(!deletions_file){
2933 continue;
2934 }
2935 while(fgets(splice_buf, 2048, deletions_file)){
2936 char* nl = strrchr(splice_buf, '\n');
2937 char* buf = splice_buf;
2938 if (nl) *nl = 0;
2939
2940 char* ref_name = get_token((char**)&buf, "\t");
2941 char* scan_left_coord = get_token((char**)&buf, "\t");
2942 char* scan_right_coord = get_token((char**)&buf, "\t");
2943
2944 if (!scan_left_coord || !scan_right_coord)
2945 {
2946 err_die("Error: malformed deletion coordinate record\n");
2947 }
2948
2949 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2950 uint32_t left_coord = atoi(scan_left_coord);
2951 uint32_t right_coord = atoi(scan_right_coord);
2952 possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
2953 }
2954 }
2955 fprintf(stderr, "done\n");
2956
2957 /*
2958 * Read the insertions from the list of insertion
2959 * files into a set
2960 */
2961 fprintf(stderr, "Loading insertions...");
2962 std::set<Insertion> possible_insertions;
2963 for (size_t i=0; i < possible_insertions_files.size(); ++i)
2964 {
2965 char splice_buf[2048];
2966 FILE* insertions_file = possible_insertions_files[i];
2967 if(!insertions_file){
2968 continue;
2969 }
2970 while(fgets(splice_buf, 2048, insertions_file)){
2971 char* nl = strrchr(splice_buf, '\n');
2972 char* buf = splice_buf;
2973 if (nl) *nl = 0;
2974
2975 char* ref_name = get_token((char**)&buf, "\t");
2976 char* scan_left_coord = get_token((char**)&buf, "\t");
2977 char* scan_right_coord = get_token((char**)&buf, "\t");
2978 char* scan_sequence = get_token((char**)&buf, "\t");
2979
2980 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
2981 {
2982 err_die("Error: malformed insertion coordinate record\n");
2983 }
2984
2985 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2986 uint32_t left_coord = atoi(scan_left_coord);
2987 std::string sequence(scan_sequence);
2988 possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
2989 }
2990 }
2991 fprintf(stderr, "done\n");
2992
2993 vector<uint64_t> read_ids;
2994 vector<vector<int64_t> > offsets;
2995 if (num_threads > 1)
2996 {
2997 vector<string> fnames;
2998 fnames.push_back(reads_fname);
2999 fnames.insert(fnames.end(), spliced_segmap_fnames.rbegin(), spliced_segmap_fnames.rend());
3000 fnames.insert(fnames.end(), segmap_fnames.rbegin(), segmap_fnames.rend());
3001 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
3002 if (!enough_data)
3003 num_threads = 1;
3004 }
3005
3006 std::set<Fusion> possible_fusions;
3007 if (fusion_search)
3008 {
3009 fprintf(stderr, "Loading fusions...");
3010 for (size_t i=0; i < possible_fusions_files.size(); ++i)
3011 {
3012 char splice_buf[2048];
3013 FILE* fusions_file = possible_fusions_files[i];
3014 if(!fusions_file){
3015 continue;
3016 }
3017 while(fgets(splice_buf, 2048, fusions_file)){
3018 char* nl = strrchr(splice_buf, '\n');
3019 char* buf = splice_buf;
3020 if (nl) *nl = 0;
3021
3022 char* ref_name1 = strsep((char**)&buf, "\t");
3023 char* scan_left_coord = strsep((char**)&buf, "\t");
3024 char* ref_name2 = strsep((char**)&buf, "\t");
3025 char* scan_right_coord = strsep((char**)&buf, "\t");
3026 char* scan_dir = strsep((char**)&buf, "\t");
3027
3028 if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
3029 {
3030 fprintf(stderr,"Error: malformed insertion coordinate record\n");
3031 exit(1);
3032 }
3033
3034 uint32_t ref_id1 = rt.get_id(ref_name1,NULL,0);
3035 uint32_t ref_id2 = rt.get_id(ref_name2,NULL,0);
3036 uint32_t left_coord = atoi(scan_left_coord);
3037 uint32_t right_coord = atoi(scan_right_coord);
3038 uint32_t dir = FUSION_FF;
3039 if (strcmp(scan_dir, "fr") == 0)
3040 dir = FUSION_FR;
3041 else if(strcmp(scan_dir, "rf") == 0)
3042 dir = FUSION_RF;
3043 else if (strcmp(scan_dir, "rr") == 0)
3044 dir = FUSION_RR;
3045
3046 possible_fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
3047 }
3048 }
3049 fprintf(stderr, "done\n");
3050 }
3051
3052 vector<boost::thread*> threads;
3053 for (int i = 0; i < num_threads; ++i)
3054 {
3055 JoinSegmentsWorker worker;
3056
3057 if (num_threads == 1)
3058 worker.bam_output_fname = bam_output_fname;
3059 else
3060 {
3061 string filename_base = bam_output_fname.substr(0, bam_output_fname.length() - 4);
3062 char filename[1024] = {0};
3063 sprintf(filename, "%s%d.bam", filename_base.c_str(), i);
3064 worker.bam_output_fname = filename;
3065 }
3066
3067 worker.sam_header_fname = sam_header;
3068 worker.reads_fname = reads_fname;
3069 worker.segmap_fnames = segmap_fnames;
3070 worker.spliced_segmap_fnames = spliced_segmap_fnames;
3071 worker.possible_juncs = &possible_juncs;
3072 worker.possible_insertions = &possible_insertions;
3073 worker.possible_fusions = &possible_fusions;
3074 worker.rt = &rt;
3075 if (i == 0)
3076 {
3077 worker.begin_id = 0;
3078 worker.seg_offsets = vector<int64_t>(segmap_fnames.size(), 0);
3079 worker.spliced_seg_offsets = vector<int64_t>(spliced_segmap_fnames.size(), 0);
3080 worker.read_offset = 0;
3081 }
3082 else
3083 {
3084 worker.begin_id = read_ids[i-1];
3085 worker.seg_offsets.insert(worker.seg_offsets.end(),
3086 offsets[i-1].rbegin(),
3087 offsets[i-1].rbegin() + segmap_fnames.size());
3088 worker.spliced_seg_offsets.insert(worker.spliced_seg_offsets.end(),
3089 offsets[i-1].rbegin() + segmap_fnames.size(),
3090 offsets[i-1].rend() - 1);
3091 worker.read_offset = offsets[i-1][0];
3092 }
3093
3094 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
3095
3096 if (num_threads > 1 && i + 1 < num_threads)
3097 threads.push_back(new boost::thread(worker));
3098 else
3099 worker();
3100 }
3101
3102 for (size_t i = 0; i < threads.size(); ++i)
3103 {
3104 threads[i]->join();
3105 delete threads[i];
3106 threads[i] = NULL;
3107 }
3108 threads.clear();
3109
3110 } //driver
3111
3112 int main(int argc, char** argv)
3113 {
3114 fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
3115 fprintf(stderr, "--------------------------------------------\n");
3116
3117 int parse_ret = parse_options(argc, argv, print_usage);
3118 if (parse_ret)
3119 return parse_ret;
3120
3121 if(optind >= argc)
3122 {
3123 print_usage();
3124 return 1;
3125 }
3126
3127 string ref_file_name = argv[optind++];
3128
3129 if(optind >= argc)
3130 {
3131 print_usage();
3132 return 1;
3133 }
3134
3135
3136 string reads_file_name = argv[optind++];
3137
3138 if(optind >= argc)
3139 {
3140 print_usage();
3141 return 1;
3142 }
3143
3144 string juncs_file_list = argv[optind++];
3145
3146 if(optind >= argc)
3147 {
3148 print_usage();
3149 return 1;
3150 }
3151
3152 string insertions_file_list = argv[optind++];
3153 if(optind >= argc)
3154 {
3155 print_usage();
3156 return 1;
3157 }
3158
3159 string deletions_file_list = argv[optind++];
3160
3161 if(optind >= argc)
3162 {
3163 print_usage();
3164 return 1;
3165 }
3166
3167 string fusions_file_list = argv[optind++];
3168
3169 if(optind >= argc)
3170 {
3171 print_usage();
3172 return 1;
3173 }
3174
3175 string bam_output_fname = argv[optind++];
3176
3177 if(optind >= argc)
3178 {
3179 print_usage();
3180 return 1;
3181 }
3182
3183 string segmap_file_list = argv[optind++];
3184 string spliced_segmap_flist;
3185 if(optind < argc)
3186 {
3187 spliced_segmap_flist = argv[optind++];
3188 }
3189
3190 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
3191 if (!ref_stream.good())
3192 err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
3193
3194 checkSamHeader();
3195
3196 //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
3197 vector<string> segmap_file_names;
3198 tokenize(segmap_file_list, ",",segmap_file_names);
3199
3200 vector<string> juncs_file_names;
3201 vector<FILE*> juncs_files;
3202 tokenize(juncs_file_list, ",",juncs_file_names);
3203 for (size_t i = 0; i < juncs_file_names.size(); ++i) {
3204 //fprintf(stderr, "Opening %s for reading\n",
3205 // juncs_file_names[i].c_str());
3206 FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
3207 if (juncs_file == NULL) {
3208 fprintf(stderr, "Warning: cannot open %s for reading\n",
3209 juncs_file_names[i].c_str());
3210 continue;
3211 }
3212 juncs_files.push_back(juncs_file);
3213 }
3214
3215 /*
3216 * Read in the deletion file names
3217 */
3218 vector<string> deletions_file_names;
3219 vector<FILE*> deletions_files;
3220 tokenize(deletions_file_list, ",",deletions_file_names);
3221 for (size_t i = 0; i < deletions_file_names.size(); ++i)
3222 {
3223 //fprintf(stderr, "Opening %s for reading\n",
3224 // deletions_file_names[i].c_str());
3225 FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
3226 if (deletions_file == NULL) {
3227 fprintf(stderr, "Warning: cannot open %s for reading\n",
3228 deletions_file_names[i].c_str());
3229 continue;
3230 }
3231 deletions_files.push_back(deletions_file);
3232 }
3233
3234 /*
3235 * Read in the list of filenames that contain
3236 * insertion coordinates
3237 */
3238
3239 vector<string> insertions_file_names;
3240 vector<FILE*> insertions_files;
3241 tokenize(insertions_file_list, ",",insertions_file_names);
3242 for (size_t i = 0; i < insertions_file_names.size(); ++i)
3243 {
3244 //fprintf(stderr, "Opening %s for reading\n",
3245 // insertions_file_names[i].c_str());
3246 FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
3247 if (insertions_file == NULL)
3248 {
3249 fprintf(stderr, "Warning: cannot open %s for reading\n",
3250 insertions_file_names[i].c_str());
3251 continue;
3252 }
3253 insertions_files.push_back(insertions_file);
3254 }
3255
3256 vector<string> spliced_segmap_file_names;
3257 vector<FZPipe> spliced_segmap_files;
3258 string unzcmd;
3259 tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
3260
3261 vector<string> fusions_file_names;
3262 vector<FILE*> fusions_files;
3263 tokenize(fusions_file_list, ",",fusions_file_names);
3264 for (size_t i = 0; i < fusions_file_names.size(); ++i)
3265 {
3266 fprintf(stderr, "Opening %s for reading\n",
3267 fusions_file_names[i].c_str());
3268 FILE* fusions_file = fopen(fusions_file_names[i].c_str(), "r");
3269 if (fusions_file == NULL)
3270 {
3271 fprintf(stderr, "Warning: cannot open %s for reading\n",
3272 fusions_file_names[i].c_str());
3273 continue;
3274 }
3275 fusions_files.push_back(fusions_file);
3276 }
3277
3278
3279 driver(bam_output_fname,
3280 ref_stream,
3281 juncs_files,
3282 insertions_files,
3283 deletions_files,
3284 fusions_files,
3285 spliced_segmap_file_names,
3286 segmap_file_names,
3287 reads_file_name);
3288
3289 return 0;
3290 }