ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 206
Committed: Mon Mar 12 18:02:35 2012 UTC (12 years, 5 months ago) by gpertea
File size: 99305 byte(s)
Log Message:
updated

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * long_spanning_reads.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <cstdlib>
22 #include <cstring>
23 #include <bitset>
24 //#include <stdexcept>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <seqan/modifier.h>
32 #include <getopt.h>
33
34 #include <boost/thread.hpp>
35
36 #include "common.h"
37 #include "utils.h"
38 #include "bwt_map.h"
39 #include "tokenize.h"
40 #include "segments.h"
41 #include "reads.h"
42
43 #include "junctions.h"
44 #include "insertions.h"
45 #include "deletions.h"
46 #include "fusions.h"
47
48 using namespace seqan;
49 using namespace std;
50
51 // daehwan
52 bool bDebug = false;
53
54 void print_usage()
55 {
56 fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
57 }
58
59 bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
60 const pair<uint32_t, HitsForRead>& rhs)
61 {
62 return lhs.first < rhs.first;
63 }
64
65 void get_seqs(istream& ref_stream,
66 RefSequenceTable& rt,
67 bool keep_seqs = true)
68 {
69 while(ref_stream.good() &&
70 !ref_stream.eof())
71 {
72 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
73 string name;
74 readMeta(ref_stream, name, Fasta());
75 string::size_type space_pos = name.find_first_of(" \t\r");
76 if (space_pos != string::npos)
77 {
78 name.resize(space_pos);
79 }
80 seqan::read(ref_stream, *ref_str, Fasta());
81
82 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
83 }
84 }
85
86
87 void look_right_for_hit_group(ReadTable& unmapped_reads,
88 vector<HitStream>& contig_hits,
89 size_t curr_file,
90 vector<HitStream>& spliced_hits,
91 const HitsForRead& targets,
92 vector<HitsForRead>& seg_hits_for_read)
93 {
94 int right_file = curr_file + 1;
95
96 HitStream& right = contig_hits[right_file];
97 uint64_t curr_next_group_id = targets.insert_id;
98 int curr_order = unmapped_reads.observation_order(curr_next_group_id);
99
100 assert (curr_order != -1);
101 while(true)
102 {
103 HitsForRead hit_group;
104 uint64_t right_next_group_id = right.next_group_id();
105 int right_order = unmapped_reads.observation_order(right_next_group_id);
106
107 // If we would have seen the hits by now, bail out.
108 if (curr_order < right_order || right_order == -1)
109 {
110 break;
111 }
112 if (right.next_read_hits(hit_group))
113 {
114 if (hit_group.insert_id == targets.insert_id)
115 {
116 // Some of the targets may be missing, we need to
117 // process them individually
118 seg_hits_for_read[right_file] = hit_group;
119 break;
120 }
121 }
122 }
123
124 HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
125
126 if (right_file < (int)spliced_hits.size() && right_file >= 0)
127 {
128 // Scan forward in the spliced hits file for this hit group
129 HitsForRead spliced_group;
130 HitsForRead curr_spliced_group;
131 while (spliced_hits[right_file].next_group_id() > 0 &&
132 spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
133 {
134 spliced_hits[right_file].next_read_hits(curr_spliced_group);
135
136 if (curr_spliced_group.insert_id == (uint32_t)curr_order)
137 {
138 spliced_group = curr_spliced_group;
139 break;
140 }
141 }
142
143 if (!spliced_group.hits.empty())
144 {
145 curr_seg_hits.insert_id = spliced_group.insert_id;
146 curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
147 spliced_group.hits.begin(),
148 spliced_group.hits.end());
149 }
150 }
151
152 if (curr_seg_hits.hits.empty())
153 return;
154 else if (right_file + 1 < (int)contig_hits.size())
155 {
156 look_right_for_hit_group(unmapped_reads,
157 contig_hits,
158 curr_file + 1,
159 spliced_hits,
160 curr_seg_hits,
161 seg_hits_for_read);
162 }
163 }
164
165 BowtieHit merge_chain_color(RefSequenceTable& rt,
166 const string& read_seq,
167 const string& read_quals,
168 std::set<Junction>& possible_juncs,
169 std::set<Insertion>& possible_insertions,
170 list<BowtieHit>& hit_chain)
171 {
172 bool antisense = hit_chain.front().antisense_align();
173 uint32_t reference_id = hit_chain.front().ref_id();
174 uint64_t insert_id = hit_chain.front().insert_id();
175
176 int left = hit_chain.front().left();
177
178 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
179 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
180
181 string seq;
182 string qual;
183 int old_read_length = 0;
184 int first_seg_length = hit_chain.front().seq().length();
185 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
186 {
187 seq += i->seq();
188 qual += i->qual();
189 old_read_length += i->read_len();
190 }
191
192 string rev_read_seq, rev_read_quals;
193 if (color && antisense)
194 {
195 rev_read_seq = read_seq;
196 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
197
198 rev_read_quals = read_quals;
199 reverse(rev_read_quals.begin(), rev_read_quals.end());
200 }
201
202 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
203 {
204 /*
205 * Note that the gap size may be negative, since left() and right() return
206 * signed integers, this will be OK.
207 */
208 int gap = curr_hit->left() - prev_hit->right();
209 if (gap < -(int)max_insertion_length ||
210 (gap > (int)max_deletion_length &&
211 (gap < min_report_intron_length || gap > max_report_intron_length)))
212 {
213 return BowtieHit();
214 }
215
216 ++prev_hit;
217 ++curr_hit;
218 }
219
220 prev_hit = hit_chain.begin();
221 curr_hit = ++(hit_chain.begin());
222
223 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
224 if (!ref_str)
225 return BowtieHit();
226
227 int curr_seg_index = 1;
228 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
229 {
230 /*
231 * This code is assuming that the cigar strings end and start with a match
232 * While segment alignments can actually end with a junction, insertion or deletion, the hope
233 * is that in those cases, the right and left ends of the alignments will correctly
234 * line up, so we won't get to this bit of code
235 */
236 if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
237 {
238 return BowtieHit();
239 }
240
241 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
242 {
243 /*
244 * There is no way that we can splice these together into a valid
245 * alignment
246 */
247 return BowtieHit();
248 }
249
250 bool found_closure = false;
251 /*
252 * Note that antisense_splice is the same for prev and curr
253 */
254 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
255 vector<CigarOp> new_cigar;
256 int new_left = -1;
257 int mismatch = 0;
258
259 /*
260 * Take the length of matched bases in each segment into consideration for closures,
261 * this can be a problem for reads of variable lengths.
262 */
263 int prev_right_end_match_length = prev_hit->cigar().back().length;
264 int curr_left_end_match_length = curr_hit->cigar().front().length;
265
266 if (prev_hit->right() > curr_hit->left())
267 {
268 std::set<Insertion>::iterator lb, ub;
269 /*
270 * Note, this offset is determined by the min-anchor length supplied to
271 * juncs_db, which is currently hard-coded at 3 in tophat.py
272 * as this value determines what sort of read segments
273 * should be able to align directly to the splice sequences
274 */
275 int left_boundary = prev_hit->right() - 4;
276 int right_boundary = curr_hit->left() + 4;
277
278 /*
279 * Create a dummy sequence to represent the maximum possible insertion
280 */
281 std::string maxInsertedSequence = "";
282 maxInsertedSequence.resize(max_insertion_length,'A');
283
284 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
285 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
286
287 int reference_mismatch = 0;
288
289 while (lb != ub && lb != possible_insertions.end())
290 {
291 /*
292 * In the following code, we will check to make sure that the segments have the proper
293 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
294 * In general, reads with insertions must match the inserted sequence exactly.
295 */
296 if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
297 {
298 /*
299 * Check we have enough matched bases on prev or curr segment.
300 */
301 int insert_to_prev_right = prev_hit->right() - lb->left - 1;
302 int curr_left_to_insert = lb->left - curr_hit->left() + 1;
303 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
304 {
305 ++lb;
306 continue;
307 }
308
309 /*
310 * Keep track of how many mismatches were made to the genome in the region
311 * where we should actually be matching against the insertion
312 */
313 int this_reference_mismatch = 0;
314 int insertion_mismatch = 0;
315 int insertion_len = lb->sequence.length();
316 const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
317
318 /*
319 * First check to see if we need to adjust number of observed errors for the left (prev)
320 * hit. This is only the case if this segment runs into the insertion. To be consistent
321 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
322 */
323 string colorSegmentSequence_prev;
324 if (insert_to_prev_right > 0)
325 {
326 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
327 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
328
329 if (color)
330 {
331 string color;
332
333 if (antisense)
334 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
335 else
336 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
337
338 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
339 colorSegmentSequence_prev = convert_color_to_bp(color);
340 }
341
342 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
343
344 /*
345 * Scan right in the read until we run out of read
346 */
347 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
348 {
349 /*
350 * Any mismatch to the insertion is a failure
351 */
352 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
353 {
354 ++this_reference_mismatch;
355 }
356
357 if (read_index < insertion_len)
358 {
359 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
360 {
361 ++insertion_mismatch;
362 break;
363 }
364 }
365 else
366 {
367 if (referenceSequence[read_index - insertion_len] == 'N' ||
368 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
369 {
370 --this_reference_mismatch;
371 }
372 }
373 }
374 }
375
376 string colorSegmentSequence_curr;
377 if (curr_left_to_insert > 0)
378 {
379 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
380 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
381
382 if (color)
383 {
384 string color;
385 if (antisense)
386 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
387 else
388 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
389
390 color.push_back(curr_hit->seq()[curr_left_to_insert]);
391 reverse(color.begin(), color.end());
392 string bp = convert_color_to_bp(color);
393 reverse(bp.begin(), bp.end());
394 colorSegmentSequence_curr = bp;
395 }
396
397 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
398
399 /*
400 * Scan left in the read until
401 * We ran out of read sequence (insertion extends past segment)
402 */
403 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
404 {
405 int segmentPosition = curr_left_to_insert - read_index - 1;
406 int insertionPosition = insertion_len - read_index - 1;
407
408 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
409 {
410 ++this_reference_mismatch;
411 }
412
413 if (read_index < insertion_len)
414 {
415 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
416 {
417 ++insertion_mismatch;
418 break;
419 }
420 }
421 else
422 {
423 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
424 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
425 {
426 --this_reference_mismatch;
427 }
428 }
429 }
430 }
431
432 if (found_closure)
433 {
434 fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
435 return BowtieHit();
436 }
437
438 if (insertion_mismatch == 0)
439 {
440 reference_mismatch = this_reference_mismatch;
441 mismatch = -reference_mismatch;
442 found_closure = true;
443 new_left = prev_hit->left();
444 new_cigar = prev_hit->cigar();
445
446 /*
447 * Need to make a new insert operation between the two match character that begin
448 * and end the intersection of these two junction. Note that we necessarily assume
449 * that this insertion can't span beyond the boundaries of these reads. That should
450 * probably be better enforced somewhere
451 */
452
453 new_cigar.back().length -= insert_to_prev_right;
454 if (new_cigar.back().length <= 0)
455 new_cigar.pop_back();
456
457 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
458 vector<CigarOp> new_right_cigar = curr_hit->cigar();
459 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
460
461 /*
462 * Finish stitching together the new cigar string
463 */
464 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
465 for (; c < new_right_cigar.size(); ++c)
466 {
467 new_cigar.push_back(new_right_cigar[c]);
468 }
469
470 if (color)
471 {
472 if (insert_to_prev_right > 0)
473 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
474
475 if (curr_left_to_insert > 0)
476 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
477 }
478 }
479 }
480 ++lb;
481 }
482
483 if (!found_closure)
484 {
485 return BowtieHit();
486 }
487 }
488
489 /*
490 * Stitch segments together using juctions or deletions if necessary.
491 */
492 else if (prev_hit->right() < curr_hit->left())
493 {
494 std::set<Junction>::iterator lb, ub;
495
496 int left_boundary = prev_hit->right() - 4;
497 int right_boundary = curr_hit->left() + 4;
498
499 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
500 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
501
502 int new_diff_mismatches = 0xff;
503 while (lb != ub && lb != possible_juncs.end())
504 {
505 int dist_to_left = lb->left - prev_hit->right() + 1;
506 int dist_to_right = lb->right - curr_hit->left();
507
508 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
509 {
510 /*
511 * Check we have enough matched bases on prev or curr segment.
512 */
513 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
514 {
515 ++lb;
516 continue;
517 }
518
519 Dna5String new_cmp_str, old_cmp_str;
520 int new_mismatch = 0, old_mismatch = 0;
521 string new_patch_str; // this is for colorspace reads
522 if (dist_to_left > 0)
523 {
524 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
525 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
526
527 string new_seq;
528 if (color)
529 {
530 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
531
532 string color, qual;
533 if (antisense)
534 {
535 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
536 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
537 }
538 else
539 {
540 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
541 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
542 }
543
544 BWA_decode(color, qual, ref, new_seq);
545 new_seq = new_seq.substr(1);
546 }
547
548 const string& curr_old_seq = curr_hit->seq();
549 const string& curr_seq = color ? new_seq : curr_hit->seq();
550 for (int i = 0; i < dist_to_left; ++i)
551 {
552 if (curr_seq[i] != new_cmp_str[i])
553 ++new_mismatch;
554
555 if (curr_old_seq[i] != old_cmp_str[i])
556 ++old_mismatch;
557 }
558
559 if (color)
560 new_patch_str = curr_seq.substr(0, dist_to_left);
561 }
562 else if (dist_to_left < 0)
563 {
564 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
565 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
566
567 size_t abs_dist = -dist_to_left;
568 string new_seq;
569 if (color)
570 {
571 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
572 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
573
574 string color, qual;
575 if (antisense)
576 {
577 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
578 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
579 }
580 else
581 {
582 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
583 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
584 }
585
586 BWA_decode(color, qual, ref, new_seq);
587 new_seq = new_seq.substr(1);
588 }
589
590 const string& prev_old_seq = prev_hit->seq();
591 size_t prev_old_seq_len = prev_old_seq.length();
592 const string& prev_seq = color ? new_seq : prev_hit->seq();
593 size_t prev_seq_len = prev_seq.length();
594 for (size_t i = 0; i < abs_dist; ++i)
595 {
596 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
597 ++new_mismatch;
598 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
599 ++old_mismatch;
600 }
601
602 if (color)
603 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
604 }
605
606 int temp_diff_mismatches = new_mismatch - old_mismatch;
607 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
608 {
609 ++lb;
610 continue;
611 }
612
613 if (color)
614 {
615 /*
616 * We need to recover the origianl sequence.
617 */
618 if (found_closure)
619 {
620 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
621 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
622 }
623
624 if (dist_to_left > 0)
625 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
626 else if (dist_to_left < 0)
627 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
628 }
629
630 new_diff_mismatches = temp_diff_mismatches;
631
632 new_left = prev_hit->left();
633 new_cigar = prev_hit->cigar();
634
635 int new_left_back_len = new_cigar.back().length;
636 new_left_back_len += dist_to_left;
637
638 vector<CigarOp> new_right_cig = curr_hit->cigar();
639 int new_right_front_len = new_right_cig.front().length;
640 new_right_front_len -= dist_to_right;
641 if (new_left_back_len > 0)
642 new_cigar.back().length = new_left_back_len;
643 else
644 new_cigar.pop_back();
645
646 /*
647 * FIXME, currently just differentiating between a deletion and a
648 * reference skip based on length. However, would probably be better
649 * to denote the difference explicitly, this would allow the user
650 * to supply their own (very large) deletions
651 */
652 if ((lb->right - lb->left - 1) <= max_deletion_length)
653 {
654 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
655 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
656 }
657 else
658 {
659 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
660 antisense_closure = lb->antisense;
661 }
662
663 new_right_cig.front().length = new_right_front_len;
664 size_t c = new_right_front_len > 0 ? 0 : 1;
665 for (; c < new_right_cig.size(); ++c)
666 new_cigar.push_back(new_right_cig[c]);
667
668 mismatch = new_diff_mismatches;
669 found_closure = true;
670 }
671 ++lb;
672 }
673
674 if (!found_closure)
675 {
676 return BowtieHit();
677 }
678 }
679
680 if (found_closure)
681 {
682 bool end = false;
683 BowtieHit merged_hit(reference_id,
684 reference_id,
685 insert_id,
686 new_left,
687 new_cigar,
688 antisense,
689 antisense_closure,
690 prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
691 prev_hit->splice_mms() + curr_hit->splice_mms(),
692 end);
693
694 if (curr_seg_index > 1)
695 merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
696 else
697 merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
698
699 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
700 /*
701 * prev_hit now points PAST the last element removed
702 */
703 prev_hit = hit_chain.insert(prev_hit, merged_hit);
704 /*
705 * merged_hit has been inserted before the old position of
706 * prev_hit. New location of prev_hit is merged_hit
707 */
708 curr_hit = prev_hit;
709 ++curr_hit;
710 ++curr_seg_index;
711 continue;
712 }
713
714 ++prev_hit;
715 ++curr_hit;
716 ++curr_seg_index;
717 }
718
719 bool saw_antisense_splice = false;
720 bool saw_sense_splice = false;
721 vector<CigarOp> long_cigar;
722 int num_mismatches = 0;
723 int num_splice_mms = 0;
724 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
725 {
726 num_mismatches += s->edit_dist();
727 num_splice_mms += s->splice_mms();
728
729 /*
730 * Check whether the sequence contains any reference skips. Previously,
731 * this was just a check to see whether the sequence was contiguous; however
732 * we don't want to count an indel event as a splice
733 */
734 bool containsSplice = s->is_spliced();
735 if (containsSplice)
736 {
737 if (s->antisense_splice())
738 {
739 if (saw_sense_splice)
740 return BowtieHit();
741 saw_antisense_splice = true;
742 }
743 else
744 {
745 if (saw_antisense_splice)
746 return BowtieHit();
747 saw_sense_splice = true;
748 }
749 }
750 const vector<CigarOp>& cigar = s->cigar();
751 if (long_cigar.empty())
752 {
753 long_cigar = cigar;
754 }
755 else
756 {
757 CigarOp& last = long_cigar.back();
758 /*
759 * If necessary, merge the back and front
760 * cigar operations
761 */
762 if(last.opcode == cigar[0].opcode){
763 last.length += cigar[0].length;
764 for (size_t b = 1; b < cigar.size(); ++b)
765 {
766 long_cigar.push_back(cigar[b]);
767 }
768 }else{
769 for(size_t b = 0; b < cigar.size(); ++b)
770 {
771 long_cigar.push_back(cigar[b]);
772 }
773 }
774 }
775 }
776
777 bool end = false;
778 BowtieHit new_hit(reference_id,
779 reference_id,
780 insert_id,
781 left,
782 long_cigar,
783 antisense,
784 saw_antisense_splice,
785 num_mismatches,
786 num_splice_mms,
787 end);
788
789 new_hit.seq(seq);
790 new_hit.qual(qual);
791
792 int new_read_len = new_hit.read_len();
793 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
794 {
795 fprintf(stderr, "Warning: malformed closure\n");
796 return BowtieHit();
797 }
798
799 return new_hit;
800 }
801
802 BowtieHit merge_chain(RefSequenceTable& rt,
803 const string& read_seq,
804 const string& read_quals,
805 std::set<Junction>& possible_juncs,
806 std::set<Insertion>& possible_insertions,
807 std::set<Fusion>& possible_fusions,
808 list<BowtieHit>& hit_chain,
809 int fusion_dir = FUSION_NOTHING)
810 {
811 bool antisense = hit_chain.front().antisense_align();
812 uint64_t insert_id = hit_chain.front().insert_id();
813
814 const int left = hit_chain.front().left();
815
816 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
817 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
818
819 string seq;
820 string qual;
821 int old_read_length = 0;
822 int first_seg_length = hit_chain.front().seq().length();
823 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
824 {
825 seq += i->seq();
826 qual += i->qual();
827 old_read_length += i->read_len();
828 }
829
830 string rev_read_seq, rev_read_quals;
831 if (color && antisense)
832 {
833 rev_read_seq = read_seq;
834 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
835
836 rev_read_quals = read_quals;
837 reverse(rev_read_quals.begin(), rev_read_quals.end());
838 }
839
840 size_t num_fusions = prev_hit->fusion_opcode() == FUSION_NOTHING ? 0 : 1;
841 bool fusion_passed = false;
842 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
843 {
844 if (prev_hit->ref_id() != prev_hit->ref_id2() || prev_hit->ref_id2() != curr_hit->ref_id())
845 fusion_passed = true;
846
847 if (prev_hit->ref_id2() != curr_hit->ref_id())
848 ++num_fusions;
849
850 if (curr_hit->fusion_opcode() != FUSION_NOTHING)
851 ++num_fusions;
852
853 if (prev_hit->ref_id2() == curr_hit->ref_id())
854 {
855 bool reversed = false;
856 if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
857 reversed = true;
858
859 /*
860 * Note that the gap size may be negative, since left() and right() return
861 * signed integers, this will be OK.
862 */
863 int gap;
864 if (reversed)
865 gap = prev_hit->right() - curr_hit->left();
866 else
867 gap = curr_hit->left() - prev_hit->right();
868
869 // daehwan
870 if (bDebug)
871 {
872 cout << "prev: " << prev_hit->ref_id() << ":" << prev_hit->left() << ":" << (prev_hit->antisense_align() ? "-" : "+")
873 << "\t" << prev_hit->ref_id2() << ":" << prev_hit->right() << ":" << (prev_hit->antisense_align2() ? "-" : "+") << endl
874 << "curr: " << curr_hit->ref_id() << ":" << curr_hit->left() << ":" << (curr_hit->antisense_align() ? "-" : "+")
875 << "\t" << curr_hit->ref_id2() << ":" << curr_hit->right() << ":" << (curr_hit->antisense_align2() ? "-" : "+") << endl
876 << "gap: " << gap << endl;
877 }
878
879 if (gap < -(int)max_insertion_length ||
880 (gap > (int)max_deletion_length &&
881 (gap < min_report_intron_length || gap > min(max_report_intron_length, (int)fusion_min_dist))))
882 {
883 fusion_passed = true;
884 ++num_fusions;
885 }
886 }
887
888 if (num_fusions >= 2)
889 return BowtieHit();
890
891 ++prev_hit;
892 ++curr_hit;
893 }
894
895 prev_hit = hit_chain.begin();
896 curr_hit = ++(hit_chain.begin());
897
898 // daehwan
899 if (bDebug)
900 {
901 cout << "daehwan - test" << endl;
902 }
903
904 int curr_seg_index = 1;
905 fusion_passed = false;
906 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
907 {
908 antisense = prev_hit->antisense_align();
909
910 // daehwan
911 if (bDebug)
912 {
913 cout << "daehwan - start - stitch" << endl;
914 cout << "prev right: " << prev_hit->right() << endl;
915 cout << "curr left: " << curr_hit->left() << endl;
916 cout << "prev back: " << prev_hit->cigar().back().opcode << endl;
917 cout << "curr front: " << curr_hit->cigar().front().opcode << endl;
918 cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
919 cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
920 }
921
922 if (prev_hit->fusion_opcode() != FUSION_NOTHING || prev_hit->ref_id2() != curr_hit->ref_id())
923 fusion_passed = true;
924
925 /*
926 * This code is assuming that the cigar strings end and start with a match
927 * While segment alignments can actually end with a junction, insertion or deletion, the hope
928 * is that in those cases, the right and left ends of the alignments will correctly
929 * line up, so we won't get to this bit of code
930 */
931 if (!(prev_hit->cigar().back().opcode == MATCH || curr_hit->cigar().front().opcode == MATCH ||
932 prev_hit->cigar().back().opcode == mATCH || curr_hit->cigar().front().opcode == mATCH))
933 {
934 return BowtieHit();
935 }
936
937 // daehwan
938 if (bDebug)
939 {
940 cout << "daehwan - pass - enough matched bases" << endl;
941 }
942
943 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
944 {
945 /*
946 * There is no way that we can splice these together into a valid
947 * alignment
948 */
949 return BowtieHit();
950 }
951
952 bool found_closure = false;
953 /*
954 * Note that antisense_splice is the same for prev and curr
955 */
956 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
957 vector<CigarOp> new_cigar;
958 int new_left = -1;
959 int mismatch = 0;
960
961 /*
962 * Take the length of matched bases in each segment into consideration for closures,
963 * this can be a problem for reads of variable lengths.
964 */
965 int prev_right_end_match_length = prev_hit->cigar().back().length;
966 int curr_left_end_match_length = curr_hit->cigar().front().length;
967
968 bool check_fusion = prev_hit->ref_id2() != curr_hit->ref_id();
969
970 if (prev_hit->ref_id2() == curr_hit->ref_id())
971 {
972 // daehwan
973 if (bDebug)
974 {
975 cout << "daehwan - start - junction or insertion" << endl;
976 cout << "prev right: " << prev_hit->right() << endl;
977 cout << "curr left: " << curr_hit->left() << endl;
978 cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
979 cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
980 }
981
982 bool reversed = false;
983 if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
984 reversed = true;
985
986 uint32_t reference_id = prev_hit->ref_id2();
987 RefSequenceTable::Sequence* ref_str = rt.get_seq(reference_id);
988
989 int left_boundary, right_boundary;
990 if (reversed)
991 {
992 left_boundary = curr_hit->left() - 4;
993 right_boundary = prev_hit->right() + 4;
994 }
995 else
996 {
997 left_boundary = prev_hit->right() - 4;
998 right_boundary = curr_hit->left() + 4;
999 }
1000
1001 int dist_btw_two;
1002 if (reversed)
1003 dist_btw_two = prev_hit->right() - curr_hit->left();
1004 else
1005 dist_btw_two = curr_hit->left() - prev_hit->right();
1006
1007 if (dist_btw_two < 0 && dist_btw_two >= -max_insertion_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
1008 {
1009 std::set<Insertion>::iterator lb, ub;
1010
1011 /*
1012 * Create a dummy sequence to represent the maximum possible insertion
1013 */
1014 std::string maxInsertedSequence = "";
1015 maxInsertedSequence.resize(max_insertion_length,'A');
1016
1017 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
1018 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
1019
1020 int reference_mismatch = 0;
1021
1022 while (lb != ub && lb != possible_insertions.end())
1023 {
1024 /*
1025 * In the following code, we will check to make sure that the segments have the proper
1026 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
1027 * In general, reads with insertions must match the inserted sequence exactly.
1028 */
1029 if (((int)lb->sequence.size()) == (reversed ? curr_hit->left() - prev_hit->right() : prev_hit->right() - curr_hit->left()))
1030 {
1031 /*
1032 * Check we have enough matched bases on prev or curr segment.
1033 */
1034 int insert_to_prev_right, curr_left_to_insert;
1035 if (reversed)
1036 {
1037 insert_to_prev_right = lb->left - prev_hit->right();
1038 curr_left_to_insert = curr_hit->left() - lb->left;
1039 }
1040 else
1041 {
1042 insert_to_prev_right = prev_hit->right() - lb->left - 1;
1043 curr_left_to_insert = lb->left - curr_hit->left() + 1;
1044 }
1045
1046 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
1047 {
1048 ++lb;
1049 continue;
1050 }
1051
1052 // daehwan
1053 if (bDebug)
1054 {
1055 cout << "insert_to_prev_right: " << insert_to_prev_right << endl;
1056 cout << "curr_left_to_insert: " << curr_left_to_insert << endl;
1057 cout << "curr_seg_index: " << curr_seg_index << endl;
1058 }
1059
1060 /*
1061 * Keep track of how many mismatches were made to the genome in the region
1062 * where we should actually be matching against the insertion
1063 */
1064 int this_reference_mismatch = 0;
1065 int insertion_mismatch = 0;
1066 int insertion_len = lb->sequence.length();
1067 seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
1068 if (reversed)
1069 {
1070 seqan::reverseComplement(insertionSequence);
1071 }
1072
1073 /*
1074 * First check to see if we need to adjust number of observed errors for the left (prev)
1075 * hit. This is only the case if this segment runs into the insertion. To be consistent
1076 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
1077 */
1078 string colorSegmentSequence_prev;
1079 if (insert_to_prev_right > 0)
1080 {
1081 seqan::Dna5String referenceSequence, oldSegmentSequence;
1082
1083 if (reversed)
1084 {
1085 referenceSequence = seqan::infix(*ref_str, prev_hit->right() + 1, lb->left + 1);
1086 seqan::reverseComplement(referenceSequence);
1087
1088 string temp;
1089
1090 // daehwan
1091 if (bDebug)
1092 {
1093 cout << "reversed: " << read_seq.length() << " " << read_seq << endl;
1094 }
1095
1096 temp = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right);
1097 oldSegmentSequence = seqan::Dna5String(temp);
1098 }
1099 else
1100 {
1101 referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
1102
1103 // daehwan
1104 if (bDebug)
1105 {
1106 cout << "non-reversed: " << prev_hit->seq() << endl;
1107 }
1108
1109 oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
1110 }
1111
1112 if (color)
1113 {
1114 string color;
1115
1116 if (antisense)
1117 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
1118 else
1119 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
1120
1121 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
1122 colorSegmentSequence_prev = convert_color_to_bp(color);
1123 }
1124
1125 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
1126
1127 // daehwan
1128 if (bDebug)
1129 {
1130 cout << "ref: " << referenceSequence << endl;
1131 cout << "old: " << oldSegmentSequence << endl;
1132 cout << "ins: " << insertionSequence << endl;
1133 }
1134
1135 /*
1136 * Scan right in the read until we run out of read
1137 */
1138 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
1139 {
1140 /*
1141 * Any mismatch to the insertion is a failure
1142 */
1143 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
1144 {
1145 ++this_reference_mismatch;
1146 }
1147
1148 if (read_index < insertion_len)
1149 {
1150 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
1151 {
1152 ++insertion_mismatch;
1153 break;
1154 }
1155 }
1156 else
1157 {
1158 if (referenceSequence[read_index - insertion_len] == 'N' ||
1159 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
1160 {
1161 --this_reference_mismatch;
1162 }
1163 }
1164 }
1165 }
1166
1167 string colorSegmentSequence_curr;
1168 if (curr_left_to_insert > 0)
1169 {
1170 seqan::Dna5String referenceSequence, oldSegmentSequence;
1171 if (reversed)
1172 {
1173 referenceSequence = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
1174 seqan::reverseComplement(referenceSequence);
1175
1176 string temp = read_seq.substr(curr_seg_index * segment_length, curr_left_to_insert);
1177 oldSegmentSequence = seqan::Dna5String(temp);
1178 }
1179 else
1180 {
1181 referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
1182 oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
1183 }
1184
1185 if (color)
1186 {
1187 string color;
1188 if (antisense)
1189 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
1190 else
1191 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
1192
1193 color.push_back(curr_hit->seq()[curr_left_to_insert]);
1194 reverse(color.begin(), color.end());
1195 string bp = convert_color_to_bp(color);
1196 reverse(bp.begin(), bp.end());
1197 colorSegmentSequence_curr = bp;
1198 }
1199
1200 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
1201
1202 // daehwan
1203 if (bDebug)
1204 {
1205 cout << "ref: " << referenceSequence << endl;
1206 cout << "old: " << oldSegmentSequence << endl;
1207 cout << "ins: " << insertionSequence << endl;
1208 }
1209
1210 /*
1211 * Scan left in the read until
1212 * We ran out of read sequence (insertion extends past segment)
1213 */
1214 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
1215 {
1216 int segmentPosition = curr_left_to_insert - read_index - 1;
1217 int insertionPosition = insertion_len - read_index - 1;
1218
1219 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
1220 {
1221 ++this_reference_mismatch;
1222 }
1223
1224 if (read_index < insertion_len)
1225 {
1226 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
1227 {
1228 ++insertion_mismatch;
1229 break;
1230 }
1231 }
1232 else
1233 {
1234 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
1235 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
1236 {
1237 --this_reference_mismatch;
1238 }
1239 }
1240 }
1241 }
1242
1243 if (found_closure)
1244 {
1245 // fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
1246 return BowtieHit();
1247 }
1248
1249 if (insertion_mismatch == 0)
1250 {
1251 reference_mismatch = this_reference_mismatch;
1252 mismatch = -reference_mismatch;
1253 found_closure = true;
1254 new_left = prev_hit->left();
1255 new_cigar = prev_hit->cigar();
1256
1257 /*
1258 * Need to make a new insert operation between the two match character that begin
1259 * and end the intersection of these two junction. Note that we necessarily assume
1260 * that this insertion can't span beyond the boundaries of these reads. That should
1261 * probably be better enforced somewhere
1262 */
1263
1264 new_cigar.back().length -= insert_to_prev_right;
1265
1266 if (new_cigar.back().length <= 0)
1267 new_cigar.pop_back();
1268
1269 if (reversed)
1270 new_cigar.push_back(CigarOp(iNS, lb->sequence.size()));
1271 else
1272 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
1273
1274 vector<CigarOp> new_right_cigar = curr_hit->cigar();
1275 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
1276
1277 /*
1278 * Finish stitching together the new cigar string
1279 */
1280 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
1281 for (; c < new_right_cigar.size(); ++c)
1282 {
1283 new_cigar.push_back(new_right_cigar[c]);
1284 }
1285
1286 if (color)
1287 {
1288 if (insert_to_prev_right > 0)
1289 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
1290
1291 if (curr_left_to_insert > 0)
1292 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
1293 }
1294 }
1295 }
1296 ++lb;
1297 }
1298
1299 if (!found_closure)
1300 {
1301 return BowtieHit();
1302 }
1303 }
1304
1305 /*
1306 * Stitch segments together using juctions or deletions if necessary.
1307 */
1308 else if (dist_btw_two > 0 && dist_btw_two <= max_report_intron_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
1309 {
1310 std::set<Junction>::iterator lb, ub;
1311
1312 // daehwan
1313 if (bDebug)
1314 {
1315 cout << "junction" << endl;
1316 cout << "min: " << left_boundary << "-" << right_boundary - 8 << endl;
1317 cout << "max: " << left_boundary + 8 << "-" << right_boundary << endl;
1318 }
1319
1320 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
1321 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
1322
1323 int new_diff_mismatches = 0xff;
1324 while (lb != ub && lb != possible_juncs.end())
1325 {
1326 int dist_to_left, dist_to_right;
1327 if (reversed)
1328 {
1329 dist_to_left = lb->left - curr_hit->left();
1330 dist_to_right = lb->right - prev_hit->right() - 1;
1331 }
1332 else
1333 {
1334 dist_to_left = lb->left - prev_hit->right() + 1;
1335 dist_to_right = lb->right - curr_hit->left();
1336 }
1337
1338 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1339 {
1340 /*
1341 * Check we have enough matched bases on prev or curr segment.
1342 */
1343 if ((reversed && (dist_to_left > prev_right_end_match_length || -dist_to_left > curr_left_end_match_length)) ||
1344 (!reversed && (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)))
1345 {
1346 ++lb;
1347 continue;
1348 }
1349
1350 // daehwan
1351 if (bDebug)
1352 {
1353 cout << "candidate junction: " << endl;
1354 cout << "coords: " << lb->left << "-" << lb->right << endl;
1355 cout << "dist to left: " << dist_to_left << endl;
1356 }
1357
1358 Dna5String new_cmp_str, old_cmp_str;
1359 int new_mismatch = 0, old_mismatch = 0;
1360 string new_patch_str; // this is for colorspace reads
1361 if (dist_to_left > 0)
1362 {
1363 if (reversed)
1364 {
1365 new_cmp_str = seqan::infix(*ref_str, curr_hit->left() + 1, lb->left + 1);
1366 seqan::reverseComplement(new_cmp_str);
1367
1368 old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb->right);
1369 seqan::reverseComplement(old_cmp_str);
1370 }
1371 else
1372 {
1373 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
1374 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
1375 }
1376
1377 string new_seq;
1378 if (color)
1379 {
1380 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
1381
1382 string color, qual;
1383 if (antisense)
1384 {
1385 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
1386 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
1387 }
1388 else
1389 {
1390 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
1391 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
1392 }
1393
1394 BWA_decode(color, qual, ref, new_seq);
1395 new_seq = new_seq.substr(1);
1396 }
1397
1398 string curr_hit_seq;
1399 if (reversed)
1400 curr_hit_seq = read_seq.substr(curr_seg_index * segment_length - dist_to_left, dist_to_left);
1401 else
1402 curr_hit_seq = curr_hit->seq();
1403
1404 const string& curr_old_seq = curr_hit_seq;
1405 const string& curr_seq = color ? new_seq : curr_hit_seq;
1406 for (int i = 0; i < dist_to_left; ++i)
1407 {
1408 if (curr_seq[i] != new_cmp_str[i])
1409 ++new_mismatch;
1410
1411 if (curr_old_seq[i] != old_cmp_str[i])
1412 ++old_mismatch;
1413 }
1414
1415 if (color)
1416 new_patch_str = curr_seq.substr(0, dist_to_left);
1417 }
1418 else if (dist_to_left < 0)
1419 {
1420 if (reversed)
1421 {
1422 new_cmp_str = seqan::infix(*ref_str, lb->right, prev_hit->right() + 1);
1423 seqan::reverseComplement(new_cmp_str);
1424
1425 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
1426 seqan::reverseComplement(old_cmp_str);
1427 }
1428 else
1429 {
1430 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
1431 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
1432 }
1433
1434 size_t abs_dist = -dist_to_left;
1435 string new_seq;
1436 if (color)
1437 {
1438 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
1439 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
1440
1441 string color, qual;
1442 if (antisense)
1443 {
1444 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
1445 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
1446 }
1447 else
1448 {
1449 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
1450 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
1451 }
1452
1453 BWA_decode(color, qual, ref, new_seq);
1454 new_seq = new_seq.substr(1);
1455 }
1456
1457 string prev_hit_seq;
1458 if (reversed)
1459 prev_hit_seq = read_seq.substr(curr_seg_index * segment_length, abs_dist);
1460 else
1461 prev_hit_seq = prev_hit->seq();
1462
1463 // daehwan
1464 if (bDebug)
1465 {
1466 cout << "reverse: " << (int)reversed << endl;
1467 cout << "new cmp str: " << new_cmp_str << endl;
1468 cout << "old cmp str: " << old_cmp_str << endl;
1469 cout << "hit seq: " << prev_hit_seq << endl;
1470 cout << "curr seq: " << curr_hit->seq() << endl;
1471
1472 cout << read_seq
1473 << endl;
1474 cout << read_seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, segment_length)
1475 << endl;
1476 }
1477
1478 const string& prev_old_seq = prev_hit_seq;
1479 size_t prev_old_seq_len = prev_old_seq.length();
1480 const string& prev_seq = color ? new_seq : prev_hit_seq;
1481 size_t prev_seq_len = prev_seq.length();
1482 for (size_t i = 0; i < abs_dist; ++i)
1483 {
1484 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1485 ++new_mismatch;
1486 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1487 ++old_mismatch;
1488 }
1489
1490 if (color)
1491 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
1492 }
1493
1494 int temp_diff_mismatches = new_mismatch - old_mismatch;
1495
1496 // daehwan
1497 if (bDebug)
1498 {
1499 cout << "new mismatch: " << new_mismatch << endl;
1500 cout << "old mismatch: " << old_mismatch << endl;
1501 cout << "new_diff_mismatch: " << new_diff_mismatches << endl;
1502 cout << "temp mismatch: " << temp_diff_mismatches << endl;
1503 }
1504
1505 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1506 {
1507 ++lb;
1508 continue;
1509 }
1510
1511 if (color)
1512 {
1513 /*
1514 * We need to recover the origianl sequence.
1515 */
1516 if (found_closure)
1517 {
1518 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
1519 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
1520 }
1521
1522 if (dist_to_left > 0)
1523 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
1524 else if (dist_to_left < 0)
1525 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
1526 }
1527
1528 new_diff_mismatches = temp_diff_mismatches;
1529
1530 new_left = prev_hit->left();
1531 new_cigar = prev_hit->cigar();
1532
1533 int new_left_back_len = new_cigar.back().length;
1534 if (reversed)
1535 new_left_back_len -= dist_to_left;
1536 else
1537 new_left_back_len += dist_to_left;
1538
1539 vector<CigarOp> new_right_cig = curr_hit->cigar();
1540 int new_right_front_len = new_right_cig.front().length;
1541 if (reversed)
1542 new_right_front_len += dist_to_right;
1543 else
1544 new_right_front_len -= dist_to_right;
1545 if (new_left_back_len > 0)
1546 new_cigar.back().length = new_left_back_len;
1547 else
1548 new_cigar.pop_back();
1549
1550 /*
1551 * FIXME, currently just differentiating between a deletion and a
1552 * reference skip based on length. However, would probably be better
1553 * to denote the difference explicitly, this would allow the user
1554 * to supply their own (very large) deletions
1555 */
1556 if ((lb->right - lb->left - 1) <= max_deletion_length)
1557 {
1558 if (reversed)
1559 new_cigar.push_back(CigarOp(dEL, lb->right - lb->left - 1));
1560 else
1561 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
1562 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1563 }
1564 else
1565 {
1566 if (reversed)
1567 new_cigar.push_back(CigarOp(rEF_SKIP, lb->right - lb->left - 1));
1568 else
1569 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
1570 antisense_closure = lb->antisense;
1571 }
1572
1573 new_right_cig.front().length = new_right_front_len;
1574 size_t c = new_right_front_len > 0 ? 0 : 1;
1575 for (; c < new_right_cig.size(); ++c)
1576 new_cigar.push_back(new_right_cig[c]);
1577
1578 mismatch = new_diff_mismatches;
1579 found_closure = true;
1580 }
1581 ++lb;
1582 }
1583
1584 if (!found_closure)
1585 {
1586 return BowtieHit();
1587 }
1588 }
1589 else if (!(dist_btw_two == 0 && prev_hit->antisense_align2() == curr_hit->antisense_align()))
1590 check_fusion = true;
1591 }
1592
1593 if (check_fusion)
1594 {
1595 std::set<Fusion>::iterator lb, ub;
1596
1597 uint32_t ref_id1 = prev_hit->ref_id2();
1598 uint32_t ref_id2 = curr_hit->ref_id();
1599 uint32_t left = prev_hit->right() - 4;
1600 uint32_t right = curr_hit->left() - 4;
1601
1602 // daehwan
1603 if (bDebug)
1604 {
1605 cout << "daehwan - start - fusion" << endl
1606 << "ref_id1: " << ref_id1 << endl
1607 << "ref_id2: " << ref_id2 << endl
1608 << "left: " << left << endl
1609 << "right: " << right << endl
1610 << "dir: " << fusion_dir << endl;
1611 }
1612
1613 bool reversed = false;
1614 if (fusion_dir != FUSION_FF &&
1615 (ref_id2 < ref_id1 || (ref_id1 == ref_id2 && left > right)))
1616 {
1617 reversed = true;
1618
1619 uint32_t temp = ref_id1;
1620 ref_id1 = ref_id2;
1621 ref_id2 = temp;
1622
1623 temp = left;
1624 left = right;
1625 right = temp;
1626 }
1627
1628 lb = possible_fusions.upper_bound(Fusion(ref_id1, ref_id2, left, right));
1629 ub = possible_fusions.lower_bound(Fusion(ref_id1, ref_id2, left + 8, right + 8));
1630
1631 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id2());
1632 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(curr_hit->ref_id());
1633
1634 int new_diff_mismatches = 0xff;
1635 while (lb != ub && lb != possible_fusions.end())
1636 {
1637 int lb_left = lb->left;
1638 int lb_right = lb->right;
1639
1640 if (reversed)
1641 {
1642 lb_left = lb->right;
1643 lb_right = lb->left;
1644 }
1645
1646 int dist_to_left, dist_to_right;
1647 if (fusion_dir == FUSION_RF)
1648 dist_to_left = prev_hit->right() - lb_left + 1;
1649 else
1650 dist_to_left = lb_left - prev_hit->right() + 1;
1651
1652 if (fusion_dir == FUSION_FR)
1653 dist_to_right = curr_hit->left() - lb_right;
1654 else
1655 dist_to_right = lb_right - curr_hit->left();
1656
1657 // daehwan
1658 if (bDebug)
1659 {
1660 cout << "daehwan - fusion gap" << endl;
1661 cout << "dist left: " << dist_to_left << endl;
1662 cout << "dist right: " << dist_to_right << endl;
1663 }
1664
1665 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1666 {
1667 /*
1668 * Check we have enough matched bases on prev or curr segment.
1669 */
1670 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)
1671 {
1672 ++lb;
1673 continue;
1674 }
1675
1676 Dna5String new_cmp_str, old_cmp_str;
1677 int new_mismatch = 0, old_mismatch = 0;
1678 string new_patch_str; // this is for colorspace reads
1679 if (dist_to_left > 0)
1680 {
1681 if (fusion_dir == FUSION_RF)
1682 {
1683 new_cmp_str = seqan::infix(*ref_str, lb_left, prev_hit->right() + 1);
1684 seqan::reverseComplement(new_cmp_str);
1685 }
1686 else
1687 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb_left + 1);
1688
1689 if (fusion_dir == FUSION_FR)
1690 {
1691 old_cmp_str = seqan::infix(*ref_str2, lb_right + 1, curr_hit->left() + 1);
1692 seqan::reverseComplement(old_cmp_str);
1693 }
1694 else
1695 old_cmp_str = seqan::infix(*ref_str2, curr_hit->left(), lb_right);
1696
1697 // daehwan
1698 if (bDebug)
1699 {
1700 cout << "new str: " << new_cmp_str << endl;
1701 cout << "old str: " << old_cmp_str << endl;
1702 cout << "curr seq: " << curr_hit->seq() << endl;
1703 }
1704
1705 string curr_hit_seq;
1706 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1707 curr_hit_seq = curr_hit->seq();
1708 else
1709 curr_hit_seq = read_seq.substr(curr_seg_index * segment_length, segment_length);
1710
1711 string new_seq;
1712 const string& curr_old_seq = curr_hit_seq;
1713 const string& curr_seq = color ? new_seq : curr_hit_seq;
1714 for (int i = 0; i < dist_to_left; ++i)
1715 {
1716 if (curr_seq[i] != new_cmp_str[i])
1717 ++new_mismatch;
1718
1719 if (curr_old_seq[i] != old_cmp_str[i])
1720 ++old_mismatch;
1721 }
1722 }
1723 else if (dist_to_left < 0)
1724 {
1725 if (fusion_dir == FUSION_FR)
1726 {
1727 new_cmp_str = seqan::infix(*ref_str2, curr_hit->left() + 1, lb_right + 1);
1728 seqan::reverseComplement(new_cmp_str);
1729 }
1730 else
1731 new_cmp_str = seqan::infix(*ref_str2, lb_right, curr_hit->left());
1732
1733 if (fusion_dir == FUSION_RF)
1734 {
1735 old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb_left);
1736 seqan::reverseComplement(old_cmp_str);
1737 }
1738 else
1739 old_cmp_str = seqan::infix(*ref_str, lb_left + 1, prev_hit->right());
1740
1741 string prev_hit_seq;
1742 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1743 prev_hit_seq = prev_hit->seq();
1744 else
1745 prev_hit_seq = read_seq.substr((curr_seg_index - 1) * segment_length, segment_length);
1746
1747 size_t abs_dist = -dist_to_left;
1748 string new_seq;
1749
1750 const string& prev_old_seq = prev_hit_seq;
1751 size_t prev_old_seq_len = prev_old_seq.length();
1752 const string& prev_seq = color ? new_seq : prev_hit_seq;
1753 size_t prev_seq_len = prev_seq.length();
1754 for (size_t i = 0; i < abs_dist; ++i)
1755 {
1756 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1757 ++new_mismatch;
1758 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1759 ++old_mismatch;
1760 }
1761 }
1762
1763 int temp_diff_mismatches = new_mismatch - old_mismatch;
1764 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1765 {
1766 ++lb;
1767 continue;
1768 }
1769 new_diff_mismatches = temp_diff_mismatches;
1770
1771 new_left = prev_hit->left();
1772 new_cigar = prev_hit->cigar();
1773
1774 int new_left_back_len = new_cigar.back().length;
1775 new_left_back_len += dist_to_left;
1776
1777 vector<CigarOp> new_right_cig = curr_hit->cigar();
1778 int new_right_front_len = new_right_cig.front().length;
1779 new_right_front_len -= dist_to_right;
1780 if (new_left_back_len > 0)
1781 new_cigar.back().length = new_left_back_len;
1782 else
1783 new_cigar.pop_back();
1784
1785 new_cigar.push_back(CigarOp((CigarOpCode)fusion_dir, lb_right));
1786 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1787
1788 new_right_cig.front().length = new_right_front_len;
1789 size_t c = new_right_front_len > 0 ? 0 : 1;
1790 for (; c < new_right_cig.size(); ++c)
1791 new_cigar.push_back(new_right_cig[c]);
1792
1793 mismatch = new_diff_mismatches;
1794 found_closure = true;
1795 ++num_fusions;
1796
1797 // daehwan
1798 if (bDebug)
1799 {
1800 cout << "daehwan - fusion gap - found" << endl;
1801 }
1802
1803 }
1804 ++lb;
1805 }
1806
1807 // daehwan
1808 if (bDebug)
1809 {
1810 cout << "daehwan2 - end - fusion: " << (found_closure ? "found" : "not found") << endl;
1811 }
1812
1813 if (!found_closure)
1814 {
1815 return BowtieHit();
1816 }
1817 }
1818
1819 if (found_closure)
1820 {
1821 bool end = false;
1822 BowtieHit merged_hit(prev_hit->ref_id(),
1823 curr_hit->ref_id2(),
1824 insert_id,
1825 new_left,
1826 new_cigar,
1827 antisense,
1828 antisense_closure,
1829 prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
1830 prev_hit->splice_mms() + curr_hit->splice_mms(),
1831 end);
1832
1833 // daehwan - should fix this for SOLiD dataset
1834 merged_hit.seq(prev_hit->seq() + curr_hit->seq());
1835
1836 // daehwan
1837 if (bDebug)
1838 {
1839 cout << "fusing of " << merged_hit.left() << " and " << merged_hit.right() << endl;
1840 cout << print_cigar(merged_hit.cigar()) << endl;
1841 if (!merged_hit.check_editdist_consistency(rt, bDebug))
1842 {
1843 prev_hit->check_editdist_consistency(rt, bDebug);
1844 curr_hit->check_editdist_consistency(rt, bDebug);
1845 cout << "btw " << print_cigar(prev_hit->cigar()) << " and " << print_cigar(curr_hit->cigar()) << endl;
1846 cout << "this is a malformed hit" << endl;
1847 exit(1);
1848 }
1849 }
1850
1851 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
1852 /*
1853 * prev_hit now points PAST the last element removed
1854 */
1855 prev_hit = hit_chain.insert(prev_hit, merged_hit);
1856 /*
1857 * merged_hit has been inserted before the old position of
1858 * prev_hit. New location of prev_hit is merged_hit
1859 */
1860 curr_hit = prev_hit;
1861 ++curr_hit;
1862 ++curr_seg_index;
1863 continue;
1864 }
1865
1866 // daehwan
1867 if (bDebug)
1868 {
1869 cout << "daehwan - test 0.3" << endl;
1870 }
1871
1872 ++prev_hit;
1873 ++curr_hit;
1874 ++curr_seg_index;
1875 }
1876
1877 // daehwan
1878 if (bDebug)
1879 {
1880 cout << "daehwan - test2" << endl;
1881 }
1882
1883 bool saw_antisense_splice = false;
1884 bool saw_sense_splice = false;
1885 vector<CigarOp> long_cigar;
1886 int num_mismatches = 0;
1887 int num_splice_mms = 0;
1888 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
1889 {
1890 num_mismatches += s->edit_dist();
1891 num_splice_mms += s->splice_mms();
1892
1893 /*
1894 * Check whether the sequence contains any reference skips. Previously,
1895 * this was just a check to see whether the sequence was contiguous; however
1896 * we don't want to count an indel event as a splice
1897 */
1898 bool containsSplice = s->is_spliced();
1899 if (containsSplice)
1900 {
1901 if (s->antisense_splice())
1902 {
1903 if (saw_sense_splice)
1904 return BowtieHit();
1905 saw_antisense_splice = true;
1906 }
1907 else
1908 {
1909 if (saw_antisense_splice)
1910 return BowtieHit();
1911 saw_sense_splice = true;
1912 }
1913 }
1914 const vector<CigarOp>& cigar = s->cigar();
1915 if (long_cigar.empty())
1916 {
1917 long_cigar = cigar;
1918 }
1919 else
1920 {
1921 CigarOp& last = long_cigar.back();
1922 /*
1923 * If necessary, merge the back and front
1924 * cigar operations
1925 */
1926 if(last.opcode == cigar[0].opcode){
1927 last.length += cigar[0].length;
1928 for (size_t b = 1; b < cigar.size(); ++b)
1929 {
1930 long_cigar.push_back(cigar[b]);
1931 }
1932 }else{
1933 for(size_t b = 0; b < cigar.size(); ++b)
1934 {
1935 long_cigar.push_back(cigar[b]);
1936 }
1937 }
1938 }
1939 }
1940
1941 bool end = false;
1942 BowtieHit new_hit(hit_chain.front().ref_id(),
1943 hit_chain.back().ref_id2(),
1944 insert_id,
1945 left,
1946 long_cigar,
1947 antisense,
1948 saw_antisense_splice,
1949 num_mismatches,
1950 num_splice_mms,
1951 end);
1952
1953 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1954 {
1955 new_hit.seq(seq);
1956 if (bowtie2)
1957 {
1958 // for the time being, let's compare "seq" and "read_seq"
1959 if (seq != read_seq)
1960 {
1961 string temp_qual = read_quals;
1962 reverse(temp_qual.begin(), temp_qual.end());
1963 new_hit.qual(temp_qual);
1964 }
1965 else
1966 new_hit.qual(read_quals);
1967 }
1968 else
1969 new_hit.qual(qual);
1970 }
1971 else
1972 {
1973 new_hit.seq(read_seq);
1974 new_hit.qual(read_quals);
1975 }
1976
1977 bool do_reverse = new_hit.ref_id() > new_hit.ref_id2();
1978 if (new_hit.ref_id() == new_hit.ref_id2())
1979 {
1980 vector<Fusion> fusions;
1981 bool auto_sort = false;
1982 fusions_from_spliced_hit(new_hit, fusions, auto_sort);
1983 if (fusions.size() > 0)
1984 {
1985 const Fusion& fusion = fusions[0];
1986 do_reverse = fusion.left > fusion.right;
1987 }
1988 }
1989
1990 if (do_reverse)
1991 {
1992 new_hit = new_hit.reverse();
1993 }
1994
1995 if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
1996 {
1997 new_hit.antisense_align(!new_hit.antisense_align());
1998 }
1999
2000 // daehwan
2001 if (bDebug)
2002 {
2003 cout << "daehwan - test3" << endl;
2004 cout << new_hit.left() << " " << print_cigar(new_hit.cigar()) << endl;
2005 cout << new_hit.ref_id() << "-" << new_hit.ref_id2() << ": " << new_hit.fusion_opcode() << endl;
2006 }
2007
2008 int new_read_len = new_hit.read_len();
2009 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt, bDebug))
2010 {
2011 // daehwan
2012 if (bDebug)
2013 {
2014 cout << "Warning: " << new_hit.insert_id() << " malformed closure: " << print_cigar(new_hit.cigar()) << endl;
2015 exit(1);
2016 }
2017
2018 fprintf(stderr, "Warning: %d malformed closure\n", new_hit.insert_id());
2019 return BowtieHit();
2020 }
2021
2022 return new_hit;
2023 }
2024
2025
2026 int multi_closure = 0;
2027 int anchor_too_short = 0;
2028 int gap_too_short = 0;
2029
2030 bool valid_hit(const BowtieHit& bh)
2031 {
2032 if (bh.insert_id())
2033 {
2034 /*
2035 * validate the cigar chain - no gaps shorter than an intron, etc.
2036 * also,
2037 * -Don't start or end with an indel or refskip
2038 * -Only a match operation is allowed is allowed
2039 * adjacent to an indel or refskip
2040 * -Indels should confrom to length restrictions
2041 */
2042 const CigarOp* prevCig = &(bh.cigar()[0]);
2043 const CigarOp* currCig = &(bh.cigar()[1]);
2044 for (size_t i = 1; i < bh.cigar().size(); ++i){
2045 currCig = &(bh.cigar()[i]);
2046 if(!(currCig->opcode == MATCH || currCig->opcode == mATCH) &&
2047 !(prevCig->opcode == MATCH || prevCig->opcode == mATCH)){
2048 return false;
2049 }
2050 if(currCig->opcode == INS || currCig->opcode == iNS){
2051 if(currCig->length > max_insertion_length){
2052 return false;
2053 }
2054 }
2055 if(currCig->opcode == DEL || currCig->opcode == dEL){
2056 if(currCig->length > max_deletion_length){
2057 return false;
2058 }
2059 }
2060 if(currCig->opcode == REF_SKIP || currCig->opcode == rEF_SKIP){
2061 if(currCig->length < (uint64_t)min_report_intron_length){
2062 gap_too_short++;
2063 return false;
2064 }
2065 }
2066 prevCig = currCig;
2067 }
2068 if (!(bh.cigar().front().opcode == MATCH || bh.cigar().front().opcode == mATCH) ||
2069 !(bh.cigar().back().opcode == MATCH || bh.cigar().back().opcode == mATCH)/* ||
2070 (int)bh.cigar().front().length < min_anchor_len||
2071 (int)bh.cigar().back().length < min_anchor_len*/ )
2072 {
2073 anchor_too_short++;
2074 return false;
2075 }
2076 }
2077 else
2078 {
2079 multi_closure++;
2080 return false;
2081 }
2082
2083 return true;
2084 }
2085
2086 void merge_segment_chain(RefSequenceTable& rt,
2087 const string& read_seq,
2088 const string& read_quals,
2089 std::set<Junction>& possible_juncs,
2090 std::set<Insertion>& possible_insertions,
2091 std::set<Fusion>& possible_fusions,
2092 vector<BowtieHit>& hits,
2093 vector<BowtieHit>& merged_hits,
2094 int fusion_dir = FUSION_NOTHING)
2095 {
2096 if (hits.size() == 0)
2097 return;
2098
2099 BowtieHit bh;
2100 if (hits.size() > 1)
2101 {
2102 list<BowtieHit> hit_chain;
2103 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
2104 {
2105 if (hits.front().antisense_align())
2106 copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
2107 else
2108 copy(hits.begin(), hits.end(), back_inserter(hit_chain));
2109 }
2110 else
2111 {
2112 bool bSawFusion = false;
2113 for (size_t i = 0; i < hits.size(); ++i)
2114 {
2115 bool pushed = false;
2116 if (!bSawFusion)
2117 {
2118 if (i > 0)
2119 {
2120 if (hits[i-1].ref_id() != hits[i].ref_id())
2121 bSawFusion = true;
2122 else if(hits[i-1].antisense_align() != hits[i].antisense_align())
2123 bSawFusion = true;
2124 else
2125 {
2126 int dist = 0;
2127 if (hits[i].antisense_align())
2128 dist = hits[i-1].left() - hits[i].right();
2129 else
2130 dist = hits[i].left() - hits[i-1].right();
2131
2132 if (dist >= max_report_intron_length || dist < -(int)max_insertion_length)
2133 bSawFusion = true;
2134 }
2135 }
2136 }
2137
2138 if (hits[i].fusion_opcode() == FUSION_NOTHING &&
2139 ((fusion_dir == FUSION_FR && bSawFusion) || (fusion_dir == FUSION_RF && !bSawFusion)) &&
2140 hits[i].left() < hits[i].right())
2141 {
2142 hit_chain.push_back(hits[i].reverse());
2143 pushed = true;
2144 }
2145
2146 if (i > 0 &&
2147 hits[i].fusion_opcode() != FUSION_NOTHING &&
2148 hits[i].ref_id() != hits[i-1].ref_id())
2149 {
2150 hit_chain.push_back(hits[i].reverse());
2151 pushed = true;
2152 }
2153
2154 if (!bSawFusion)
2155 {
2156 if (hits[i].fusion_opcode() != FUSION_NOTHING)
2157 bSawFusion = true;
2158 }
2159
2160 if (!pushed)
2161 hit_chain.push_back(hits[i]);
2162 }
2163 }
2164
2165 // todo: merge_chain_color needs to be merged into merge_chain fuction.
2166 if (color)
2167 bh = merge_chain_color(rt,
2168 read_seq,
2169 read_quals,
2170 possible_juncs,
2171 possible_insertions,
2172 hit_chain);
2173 else
2174 bh = merge_chain(rt,
2175 read_seq,
2176 read_quals,
2177 possible_juncs,
2178 possible_insertions,
2179 possible_fusions,
2180 hit_chain,
2181 fusion_dir);
2182 }
2183 else
2184 {
2185 bh = hits[0];
2186 bool do_reverse = bh.ref_id() > bh.ref_id2();
2187 if (bh.ref_id() == bh.ref_id2())
2188 {
2189 vector<Fusion> fusions;
2190 bool auto_sort = false;
2191 fusions_from_spliced_hit(bh, fusions, auto_sort);
2192 if (fusions.size() > 0)
2193 {
2194 const Fusion& fusion = fusions[0];
2195 do_reverse = fusion.left > fusion.right;
2196 }
2197 }
2198
2199 if (do_reverse)
2200 bh = bh.reverse();
2201 }
2202
2203 if (valid_hit(bh))
2204 merged_hits.push_back(bh);
2205 }
2206
2207 bool dfs_seg_hits(RefSequenceTable& rt,
2208 const string& read_seq,
2209 const string& read_quals,
2210 std::set<Junction>& possible_juncs,
2211 std::set<Insertion>& possible_insertions,
2212 std::set<Fusion>& possible_fusions,
2213 vector<HitsForRead>& seg_hits_for_read,
2214 size_t curr,
2215 vector<BowtieHit>& seg_hit_stack,
2216 vector<BowtieHit>& joined_hits,
2217 int fusion_dir = FUSION_NOTHING)
2218 {
2219 assert (!seg_hit_stack.empty());
2220 bool join_success = false;
2221
2222 if (curr < seg_hits_for_read.size())
2223 {
2224 for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
2225 {
2226 /*
2227 * As we reverse segments depending on directions like FR or RF,
2228 * it's necessary to recover the original segments.
2229 */
2230 BowtieHit bh = seg_hits_for_read[curr].hits[i];
2231 BowtieHit bh_prev = seg_hit_stack.back();
2232
2233 BowtieHit* prevHit = &bh_prev;
2234 BowtieHit* currHit = &bh;
2235
2236 // daehwan
2237 // if (bh.insert_id() == 11921733)
2238 // bDebug = true;
2239
2240 /*
2241 * Each segment has at most one fusion by assumption,
2242 */
2243 bool prevHit_fused = prevHit->fusion_opcode() != FUSION_NOTHING;
2244 bool currHit_fused = currHit->fusion_opcode() != FUSION_NOTHING;
2245
2246 /*
2247 * Count the number of fusions on prev and curr segments,
2248 * but this doesn't take into account the gap (if exists) between the two segments.
2249 */
2250 size_t num_fusions = prevHit_fused ? 1 : 0;
2251 num_fusions += currHit_fused ? 1 : 0;
2252
2253 int dir = prevHit_fused ? prevHit->fusion_opcode() : currHit->fusion_opcode();
2254
2255 /*
2256 * We don't allow reads that span more than two fusion points.
2257 */
2258 if (num_fusions >= 2)
2259 continue;
2260
2261 if (fusion_dir != FUSION_NOTHING && currHit_fused)
2262 continue;
2263
2264 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
2265 {
2266 if ((currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id()) ||
2267 (!currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id2()))
2268 continue;
2269 }
2270
2271 if (bDebug)
2272 {
2273 cout << "daehwan - prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": "
2274 << print_cigar(prevHit->cigar()) << endl;
2275 cout << "daehwan - prev sense: "
2276 << (prevHit->antisense_align() ? "-" : "+")
2277 << "\t"
2278 << (prevHit->antisense_align2() ? "-" : "+")
2279 << endl;
2280 cout << "daehwan - prev coords: "
2281 << prevHit->left()
2282 << "\t"
2283 << prevHit->right()
2284 << endl;
2285
2286 cout << "daehwan - curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": "
2287 << print_cigar(currHit->cigar()) << endl;
2288 cout << "daehwan - curr sense: "
2289 << (currHit->antisense_align() ? "-" : "+")
2290 << "\t"
2291 << (currHit->antisense_align2() ? "-" : "+")
2292 << endl;
2293 cout << "daehwan - curr coords: "
2294 << currHit->left()
2295 << "\t"
2296 << currHit->right()
2297 << endl;
2298 }
2299
2300 if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF) &&
2301 prevHit->ref_id2() != currHit->ref_id())
2302 continue;
2303
2304 if ((fusion_dir == FUSION_FR && !currHit->antisense_align()) ||
2305 (fusion_dir == FUSION_RF && currHit->antisense_align()))
2306 continue;
2307
2308 if (currHit_fused && dir == FUSION_RR)
2309 *currHit = currHit->reverse();
2310
2311 if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RF ||
2312 (currHit_fused && currHit->ref_id() == currHit->ref_id2() && (dir == FUSION_FR || dir == FUSION_RF)))
2313 {
2314 if (currHit_fused)
2315 {
2316 if ((dir == FUSION_FR && currHit->antisense_align()) ||
2317 (dir == FUSION_RF && !currHit->antisense_align()))
2318 *currHit = currHit->reverse();
2319 }
2320 else
2321 {
2322 if (fusion_dir == FUSION_FR && currHit->antisense_align())
2323 *currHit = currHit->reverse();
2324 }
2325 }
2326 /*
2327 * Switch prevHit and currHit in FUSION_NOTHING and FUSION_FF cases
2328 * to make it easier to check the distance in the gap between the two segments.
2329 */
2330 else if ((num_fusions == 0 && prevHit->antisense_align() && currHit->antisense_align() && prevHit->ref_id() == currHit->ref_id() && prevHit->left() <= currHit->right() + max_report_intron_length && prevHit->left() + max_insertion_length >= currHit->right()) ||
2331 (num_fusions == 1 && (dir == FUSION_FF || dir == FUSION_RR)&&
2332 ((!prevHit_fused && prevHit->antisense_align()) || (!currHit_fused && currHit->antisense_align())))
2333 )
2334 {
2335 BowtieHit* tempHit = prevHit;
2336 prevHit = currHit;
2337 currHit = tempHit;
2338 }
2339 else if (num_fusions == 0)
2340 {
2341 if (prevHit->ref_id2() == currHit->ref_id() && prevHit->antisense_align() == currHit->antisense_align())
2342 {
2343 int dist = 0;
2344 if (prevHit->antisense_align())
2345 dist = prevHit->left() - currHit->right();
2346 else
2347 dist = currHit->left() - prevHit->right();
2348
2349 if (dist > max_report_intron_length || dist < -(int)max_insertion_length)
2350 {
2351 if ((prevHit->antisense_align() && prevHit->left() > currHit->left()) ||
2352 (!prevHit->antisense_align() && prevHit->left() < currHit->left()))
2353 dir = FUSION_FF;
2354 else
2355 dir = FUSION_RR;
2356 }
2357 }
2358 else
2359 {
2360 if (prevHit->antisense_align() == currHit->antisense_align())
2361 {
2362 if ((prevHit->antisense_align() && prevHit->ref_id() > currHit->ref_id()) ||
2363 (!prevHit->antisense_align() && prevHit->ref_id() < currHit->ref_id()))
2364 dir = FUSION_FF;
2365 else
2366 dir = FUSION_RR;
2367 }
2368 else if (!prevHit->antisense_align())
2369 dir = FUSION_FR;
2370 else
2371 dir = FUSION_RF;
2372
2373 if (dir == FUSION_FR)
2374 *currHit = currHit->reverse();
2375 else if(dir == FUSION_RF)
2376 *prevHit = prevHit->reverse();
2377 }
2378 }
2379
2380 // daehwan - test
2381 if (bDebug)
2382 {
2383 cout << "insert id: " << prevHit->insert_id() << endl;
2384 cout << "(" << curr - 1 << ") prev: " << prevHit->seq() << " : " << (prevHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
2385 cout << "(" << curr << ") curr: " << currHit->seq() << " : " << (currHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
2386 cout << "prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl;
2387 cout << "curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl;
2388 cout << "prev coords: "
2389 << prevHit->left()
2390 << "\t"
2391 << prevHit->right()
2392 << endl;
2393 cout << "curr corrds: "
2394 << currHit->left()
2395 << "\t"
2396 << currHit->right()
2397 << endl;
2398 cout << "prev sense: "
2399 << (prevHit->antisense_align() ? "-" : "+")
2400 << "\t"
2401 << (prevHit->antisense_align2() ? "-" : "+")
2402 << endl;
2403 cout << "curr sense: "
2404 << (currHit->antisense_align() ? "-" : "+")
2405 << "\t"
2406 << (currHit->antisense_align2() ? "-" : "+")
2407 << endl;
2408 }
2409
2410 if (num_fusions == 1)
2411 {
2412 // daehwan
2413 if (bDebug)
2414 {
2415 cout << "direction: " << (int)dir << endl;
2416 }
2417
2418 /*
2419 * orient the fused segment, which depends on a fusion direction.
2420 */
2421 if (dir != FUSION_FF && dir != FUSION_RR)
2422 {
2423 bool prevHit_rep = false;
2424 bool currHit_rep = false;
2425
2426 if (prevHit_fused)
2427 {
2428 if ((dir == FUSION_FR && !currHit->antisense_align()) ||
2429 (dir == FUSION_RF && currHit->antisense_align()))
2430 continue;
2431
2432 if (prevHit->ref_id2() != currHit->ref_id())
2433 prevHit_rep = true;
2434 else if ((dir == FUSION_FR && prevHit->antisense_align()) ||
2435 (dir == FUSION_RF && !prevHit->antisense_align()))
2436 prevHit_rep = true;
2437 }
2438
2439 if (currHit_fused)
2440 {
2441 if ((dir == FUSION_FR && prevHit->antisense_align()) ||
2442 (dir == FUSION_RF && !prevHit->antisense_align()))
2443 continue;
2444
2445 if (currHit->ref_id() != prevHit->ref_id2())
2446 currHit_rep = true;
2447 }
2448
2449 if (bDebug)
2450 {
2451 if (prevHit_rep) cout << "1. reversed in prev" << endl;
2452 if (currHit_rep) cout << "1. reversed in curr" << endl;
2453 }
2454
2455 if (prevHit_rep)
2456 *prevHit = prevHit->reverse();
2457 if (currHit_rep)
2458 *currHit = currHit->reverse();
2459
2460 prevHit_rep = false;
2461 currHit_rep = false;
2462
2463 if (prevHit_fused)
2464 {
2465 if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
2466 currHit_rep = true;
2467 }
2468 else
2469 {
2470 if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
2471 prevHit_rep = true;
2472 }
2473
2474 if (prevHit_rep)
2475 *prevHit = prevHit->reverse();
2476 if (currHit_rep)
2477 *currHit = currHit->reverse();
2478
2479 // daehwan
2480 if (bDebug)
2481 {
2482 if (prevHit_rep) cout << "2. reversed in prev" << endl;
2483 if (currHit_rep) cout << "2. reversed in curr" << endl;
2484 }
2485 }
2486 }
2487
2488 bool same_contig = prevHit->ref_id2() == currHit->ref_id();
2489 if (!same_contig && num_fusions > 0)
2490 continue;
2491
2492 if (same_contig && num_fusions >= 1)
2493 {
2494 if (prevHit->antisense_align2() != currHit->antisense_align())
2495 continue;
2496 }
2497
2498 int bh_l = 0, back_right = 0, dist = 0;
2499 if (same_contig)
2500 {
2501 if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || dir == FUSION_FR || dir == FUSION_RF) &&
2502 prevHit->antisense_align2())
2503 {
2504 bh_l = prevHit->right() + 1;
2505 back_right = currHit->left() + 1;
2506 }
2507 else
2508 {
2509 bh_l = currHit->left();
2510 back_right = prevHit->right();
2511 }
2512
2513 dist = bh_l - back_right;
2514 }
2515
2516 // daehwan - pass
2517 if (bDebug)
2518 {
2519 cout << "daehwan - pass" << endl;
2520 cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl;
2521 cout << "curr coords: " << currHit->left() << "\t" << currHit->right() << endl;
2522 }
2523
2524 if (!same_contig ||
2525 (same_contig && num_fusions == 0 && dir != FUSION_NOTHING && fusion_dir == FUSION_NOTHING) ||
2526 (same_contig && dist <= max_report_intron_length && dist >= -(int)max_insertion_length && prevHit->is_forwarding_right() == currHit->is_forwarding_left()))
2527 {
2528 // daehwan
2529 if (bDebug)
2530 {
2531 cout << "daehwan - really passed!!" << endl;
2532 }
2533
2534 BowtieHit tempHit = seg_hit_stack.back();
2535 seg_hit_stack.back() = bh_prev;
2536
2537 // these hits are compatible, so push bh onto the
2538 // stack, recurse, and pop it when done.
2539 seg_hit_stack.push_back(bh);
2540 bool success = dfs_seg_hits(rt,
2541 read_seq,
2542 read_quals,
2543 possible_juncs,
2544 possible_insertions,
2545 possible_fusions,
2546 seg_hits_for_read,
2547 curr + 1,
2548 seg_hit_stack,
2549 joined_hits,
2550 dir == FUSION_NOTHING ? fusion_dir : dir);
2551
2552 if (success)
2553 join_success = true;
2554
2555 seg_hit_stack.pop_back();
2556 seg_hit_stack.back() = tempHit;
2557 }
2558 }
2559 }
2560 else
2561 {
2562 merge_segment_chain(rt,
2563 read_seq,
2564 read_quals,
2565 possible_juncs,
2566 possible_insertions,
2567 possible_fusions,
2568 seg_hit_stack,
2569 joined_hits,
2570 fusion_dir);
2571
2572 return join_success = true;
2573 }
2574 return join_success;
2575 }
2576
2577 bool join_segments_for_read(RefSequenceTable& rt,
2578 const string& read_seq,
2579 const string& read_quals,
2580 std::set<Junction>& possible_juncs,
2581 std::set<Insertion>& possible_insertions,
2582 std::set<Fusion>& possible_fusions,
2583 vector<HitsForRead>& seg_hits_for_read,
2584 vector<BowtieHit>& joined_hits)
2585 {
2586 vector<BowtieHit> seg_hit_stack;
2587 bool join_success = false;
2588
2589 // ignore segments that map to more than this many places.
2590 if (bowtie2)
2591 {
2592 for (size_t s = 0; s < seg_hits_for_read.size(); ++s)
2593 {
2594 if (seg_hits_for_read[s].hits.size() > max_seg_multihits)
2595 return join_success;
2596 }
2597 }
2598
2599 for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
2600 {
2601 BowtieHit& bh = seg_hits_for_read[0].hits[i];
2602
2603 // daehwan - remove this
2604 //if (bh.insert_id() == 16487)
2605 // bDebug = true;
2606
2607 if (bh.fusion_opcode() == FUSION_RR)
2608 seg_hit_stack.push_back(bh.reverse());
2609 else
2610 seg_hit_stack.push_back(bh);
2611
2612 bool success = dfs_seg_hits(rt,
2613 read_seq,
2614 read_quals,
2615 possible_juncs,
2616 possible_insertions,
2617 possible_fusions,
2618 seg_hits_for_read,
2619 1,
2620 seg_hit_stack,
2621 joined_hits);
2622 if (success)
2623 join_success = true;
2624 seg_hit_stack.pop_back();
2625 }
2626
2627 return join_success;
2628 }
2629
2630 struct JoinSegmentsWorker
2631 {
2632 void operator()()
2633 {
2634 ReadTable it;
2635 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header_fname.c_str());
2636 ReadStream readstream(reads_fname);
2637 if (readstream.file() == NULL)
2638 err_die("Error: cannot open %s for reading\n", reads_fname.c_str());
2639
2640 if (read_offset > 0)
2641 readstream.seek(read_offset);
2642
2643 bool need_seq = true;
2644 bool need_qual = true;
2645
2646 vector<HitStream> contig_hits;
2647 vector<HitStream> spliced_hits;
2648 vector<HitFactory*> factories;
2649 for (size_t i = 0; i < segmap_fnames.size(); ++i)
2650 {
2651 HitFactory* fac = new BAMHitFactory(it, *rt);
2652 factories.push_back(fac);
2653 HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
2654
2655 if (seg_offsets[i] > 0)
2656 hs.seek(seg_offsets[i]);
2657
2658 contig_hits.push_back(hs);
2659 }
2660
2661 for (size_t i = 0; i < spliced_segmap_fnames.size(); ++i)
2662 {
2663 int anchor_length = 0;
2664 HitFactory* fac = new SplicedBAMHitFactory(it, *rt, anchor_length);
2665 factories.push_back(fac);
2666
2667 HitStream hs(spliced_segmap_fnames[i], fac, true, false, false, need_seq, need_qual);
2668 if (spliced_seg_offsets[i] > 0)
2669 hs.seek(spliced_seg_offsets[i]);
2670
2671 spliced_hits.push_back(hs);
2672 }
2673
2674 uint32_t curr_contig_obs_order = VMAXINT32;
2675 HitStream* first_seg_contig_stream = NULL;
2676 uint64_t next_contig_id = 0;
2677
2678 if (contig_hits.size() > 0)
2679 {
2680 first_seg_contig_stream = &(contig_hits.front());
2681 next_contig_id = first_seg_contig_stream->next_group_id();
2682 curr_contig_obs_order = it.observation_order(next_contig_id);
2683 }
2684
2685 HitsForRead curr_hit_group;
2686
2687 uint32_t curr_spliced_obs_order = VMAXINT32;
2688 HitStream* first_seg_spliced_stream = NULL;
2689 uint64_t next_spliced_id = 0;
2690
2691 if (spliced_hits.size() > 0)
2692 {
2693 first_seg_spliced_stream = &(spliced_hits.front());
2694 next_spliced_id = first_seg_spliced_stream->next_group_id();
2695 curr_spliced_obs_order = it.observation_order(next_spliced_id);
2696 }
2697
2698 while((curr_contig_obs_order != VMAXINT32 || curr_spliced_obs_order != VMAXINT32) &&
2699 (curr_contig_obs_order < end_id || curr_spliced_obs_order < end_id))
2700 {
2701 uint32_t read_in_process;
2702 vector<HitsForRead> seg_hits_for_read;
2703 seg_hits_for_read.resize(contig_hits.size());
2704
2705 if (curr_contig_obs_order < curr_spliced_obs_order)
2706 {
2707 first_seg_contig_stream->next_read_hits(curr_hit_group);
2708 seg_hits_for_read.front() = curr_hit_group;
2709
2710 next_contig_id = first_seg_contig_stream->next_group_id();
2711 uint32_t next_order = it.observation_order(next_contig_id);
2712
2713 read_in_process = curr_contig_obs_order;
2714 curr_contig_obs_order = next_order;
2715 }
2716 else if (curr_spliced_obs_order < curr_contig_obs_order)
2717 {
2718 first_seg_spliced_stream->next_read_hits(curr_hit_group);
2719 seg_hits_for_read.front() = curr_hit_group;
2720
2721 next_spliced_id = first_seg_spliced_stream->next_group_id();
2722 uint32_t next_order = it.observation_order(next_spliced_id);
2723
2724 read_in_process = curr_spliced_obs_order;
2725 curr_spliced_obs_order = next_order;
2726
2727 if (read_in_process < begin_id)
2728 continue;
2729 }
2730 else if (curr_contig_obs_order == curr_spliced_obs_order &&
2731 curr_contig_obs_order != VMAXINT32 &&
2732 curr_spliced_obs_order != VMAXINT32)
2733 {
2734 first_seg_contig_stream->next_read_hits(curr_hit_group);
2735
2736 HitsForRead curr_spliced_group;
2737 first_seg_spliced_stream->next_read_hits(curr_spliced_group);
2738
2739 curr_hit_group.hits.insert(curr_hit_group.hits.end(),
2740 curr_spliced_group.hits.begin(),
2741 curr_spliced_group.hits.end());
2742 seg_hits_for_read.front() = curr_hit_group;
2743 read_in_process = curr_spliced_obs_order;
2744
2745 next_contig_id = first_seg_contig_stream->next_group_id();
2746 uint32_t next_order = it.observation_order(next_contig_id);
2747
2748 next_spliced_id = first_seg_spliced_stream->next_group_id();
2749 uint32_t next_spliced_order = it.observation_order(next_spliced_id);
2750
2751 curr_spliced_obs_order = next_spliced_order;
2752 curr_contig_obs_order = next_order;
2753 }
2754 else
2755 {
2756 break;
2757 }
2758
2759 if (contig_hits.size() > 1)
2760 {
2761 look_right_for_hit_group(it,
2762 contig_hits,
2763 0,
2764 spliced_hits,
2765 curr_hit_group,
2766 seg_hits_for_read);
2767 }
2768
2769 size_t last_non_empty = seg_hits_for_read.size() - 1;
2770 while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
2771 {
2772 --last_non_empty;
2773 }
2774
2775 seg_hits_for_read.resize(last_non_empty + 1);
2776 if (!seg_hits_for_read[last_non_empty].hits[0].end())
2777 continue;
2778
2779 if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
2780 {
2781 uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
2782 if (insert_id >= begin_id && insert_id < end_id)
2783 {
2784 Read read;
2785 if (readstream.getRead(insert_id, read))
2786 {
2787 vector<BowtieHit> joined_hits;
2788 join_segments_for_read(*rt,
2789 read.seq.c_str(),
2790 read.qual.c_str(),
2791 *possible_juncs,
2792 *possible_insertions,
2793 *possible_fusions,
2794 seg_hits_for_read,
2795 joined_hits);
2796
2797 sort(joined_hits.begin(), joined_hits.end());
2798 vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
2799 joined_hits.erase(new_end, joined_hits.end());
2800 for (size_t i = 0; i < joined_hits.size(); i++)
2801 {
2802 const char* ref_name = rt->get_name(joined_hits[i].ref_id());
2803 const char* ref_name2 = "";
2804 if (joined_hits[i].fusion_opcode() != FUSION_NOTHING)
2805 ref_name2 = rt->get_name(joined_hits[i].ref_id2());
2806
2807 vector<string> extra_fields;
2808
2809 if (!color)
2810 bowtie_sam_extra(joined_hits[i], *rt, extra_fields);
2811
2812 if (color)
2813 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2814 joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true, &extra_fields);
2815 else
2816 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2817 read.seq.c_str(), read.qual.c_str(), false, &extra_fields);
2818 }
2819 }
2820 else
2821 {
2822 err_die("Error: could not get read # %d from stream\n",
2823 read_in_process);
2824 }
2825 }
2826 }
2827 else
2828 {
2829 //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
2830 }
2831 }
2832
2833 for (size_t fac = 0; fac < factories.size(); ++fac)
2834 {
2835 delete factories[fac];
2836 }
2837 factories.clear();
2838 }
2839
2840 string bam_output_fname;
2841 string sam_header_fname;
2842 string reads_fname;
2843
2844 vector<string> segmap_fnames;
2845 vector<string> spliced_segmap_fnames;
2846
2847 std::set<Junction>* possible_juncs;
2848 std::set<Insertion>* possible_insertions;
2849 std::set<Fusion>* possible_fusions;
2850
2851 RefSequenceTable* rt;
2852
2853 uint64_t begin_id;
2854 uint64_t end_id;
2855 int64_t read_offset;
2856 vector<int64_t> seg_offsets;
2857 vector<int64_t> spliced_seg_offsets;
2858 };
2859
2860 void driver(const string& bam_output_fname,
2861 istream& ref_stream,
2862 vector<FILE*>& possible_juncs_files,
2863 vector<FILE*>& possible_insertions_files,
2864 vector<FILE*>& possible_deletions_files,
2865 vector<FILE*>& possible_fusions_files,
2866 vector<string>& spliced_segmap_fnames, //.bam files
2867 vector<string>& segmap_fnames, //.bam files
2868 const string& reads_fname)
2869 {
2870 if (!parallel)
2871 num_threads = 1;
2872
2873 if (segmap_fnames.size() == 0)
2874 {
2875 fprintf(stderr, "No hits to process, exiting\n");
2876 exit(0);
2877 }
2878
2879 RefSequenceTable rt(sam_header, true);
2880 fprintf (stderr, "Loading reference sequences...\n");
2881 get_seqs(ref_stream, rt, true);
2882 fprintf (stderr, " reference sequences loaded.\n");
2883
2884 fprintf(stderr, "Loading junctions...");
2885 std::set<Junction> possible_juncs;
2886
2887 for (size_t i = 0; i < possible_juncs_files.size(); ++i)
2888 {
2889 char buf[2048];
2890 while(!feof(possible_juncs_files[i]) &&
2891 fgets(buf, sizeof(buf), possible_juncs_files[i]))
2892 {
2893 char junc_ref_name[256];
2894 int left;
2895 int right;
2896 char orientation;
2897 int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
2898 if (ret != 4)
2899 continue;
2900 uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
2901 possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
2902 }
2903 }
2904 fprintf(stderr, "done\n");
2905
2906 fprintf(stderr, "Loading deletions...");
2907
2908 for (size_t i = 0; i < possible_deletions_files.size(); ++i)
2909 {
2910 char splice_buf[2048];
2911 FILE* deletions_file = possible_deletions_files[i];
2912 if(!deletions_file){
2913 continue;
2914 }
2915 while(fgets(splice_buf, 2048, deletions_file)){
2916 char* nl = strrchr(splice_buf, '\n');
2917 char* buf = splice_buf;
2918 if (nl) *nl = 0;
2919
2920 char* ref_name = get_token((char**)&buf, "\t");
2921 char* scan_left_coord = get_token((char**)&buf, "\t");
2922 char* scan_right_coord = get_token((char**)&buf, "\t");
2923
2924 if (!scan_left_coord || !scan_right_coord)
2925 {
2926 err_die("Error: malformed deletion coordinate record\n");
2927 }
2928
2929 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2930 uint32_t left_coord = atoi(scan_left_coord);
2931 uint32_t right_coord = atoi(scan_right_coord);
2932 possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
2933 }
2934 }
2935 fprintf(stderr, "done\n");
2936
2937 /*
2938 * Read the insertions from the list of insertion
2939 * files into a set
2940 */
2941 fprintf(stderr, "Loading insertions...");
2942 std::set<Insertion> possible_insertions;
2943 for (size_t i=0; i < possible_insertions_files.size(); ++i)
2944 {
2945 char splice_buf[2048];
2946 FILE* insertions_file = possible_insertions_files[i];
2947 if(!insertions_file){
2948 continue;
2949 }
2950 while(fgets(splice_buf, 2048, insertions_file)){
2951 char* nl = strrchr(splice_buf, '\n');
2952 char* buf = splice_buf;
2953 if (nl) *nl = 0;
2954
2955 char* ref_name = get_token((char**)&buf, "\t");
2956 char* scan_left_coord = get_token((char**)&buf, "\t");
2957 char* scan_right_coord = get_token((char**)&buf, "\t");
2958 char* scan_sequence = get_token((char**)&buf, "\t");
2959
2960 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
2961 {
2962 err_die("Error: malformed insertion coordinate record\n");
2963 }
2964
2965 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2966 uint32_t left_coord = atoi(scan_left_coord);
2967 std::string sequence(scan_sequence);
2968 possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
2969 }
2970 }
2971 fprintf(stderr, "done\n");
2972
2973 vector<uint64_t> read_ids;
2974 vector<vector<int64_t> > offsets;
2975 if (num_threads > 1)
2976 {
2977 vector<string> fnames;
2978 fnames.push_back(reads_fname);
2979 fnames.insert(fnames.end(), spliced_segmap_fnames.rbegin(), spliced_segmap_fnames.rend());
2980 fnames.insert(fnames.end(), segmap_fnames.rbegin(), segmap_fnames.rend());
2981 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
2982 if (!enough_data)
2983 num_threads = 1;
2984 }
2985
2986 std::set<Fusion> possible_fusions;
2987 if (fusion_search)
2988 {
2989 fprintf(stderr, "Loading fusions...");
2990 for (size_t i=0; i < possible_fusions_files.size(); ++i)
2991 {
2992 char splice_buf[2048];
2993 FILE* fusions_file = possible_fusions_files[i];
2994 if(!fusions_file){
2995 continue;
2996 }
2997 while(fgets(splice_buf, 2048, fusions_file)){
2998 char* nl = strrchr(splice_buf, '\n');
2999 char* buf = splice_buf;
3000 if (nl) *nl = 0;
3001
3002 char* ref_name1 = strsep((char**)&buf, "\t");
3003 char* scan_left_coord = strsep((char**)&buf, "\t");
3004 char* ref_name2 = strsep((char**)&buf, "\t");
3005 char* scan_right_coord = strsep((char**)&buf, "\t");
3006 char* scan_dir = strsep((char**)&buf, "\t");
3007
3008 if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
3009 {
3010 fprintf(stderr,"Error: malformed insertion coordinate record\n");
3011 exit(1);
3012 }
3013
3014 uint32_t ref_id1 = rt.get_id(ref_name1,NULL,0);
3015 uint32_t ref_id2 = rt.get_id(ref_name2,NULL,0);
3016 uint32_t left_coord = atoi(scan_left_coord);
3017 uint32_t right_coord = atoi(scan_right_coord);
3018 uint32_t dir = FUSION_FF;
3019 if (strcmp(scan_dir, "fr") == 0)
3020 dir = FUSION_FR;
3021 else if(strcmp(scan_dir, "rf") == 0)
3022 dir = FUSION_RF;
3023 else if (strcmp(scan_dir, "rr") == 0)
3024 dir = FUSION_RR;
3025
3026 possible_fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
3027 }
3028 }
3029 fprintf(stderr, "done\n");
3030 }
3031
3032 vector<boost::thread*> threads;
3033 for (int i = 0; i < num_threads; ++i)
3034 {
3035 JoinSegmentsWorker worker;
3036
3037 if (num_threads == 1)
3038 worker.bam_output_fname = bam_output_fname;
3039 else
3040 {
3041 string filename_base = bam_output_fname.substr(0, bam_output_fname.length() - 4);
3042 char filename[1024] = {0};
3043 sprintf(filename, "%s%d.bam", filename_base.c_str(), i);
3044 worker.bam_output_fname = filename;
3045 }
3046
3047 worker.sam_header_fname = sam_header;
3048 worker.reads_fname = reads_fname;
3049 worker.segmap_fnames = segmap_fnames;
3050 worker.spliced_segmap_fnames = spliced_segmap_fnames;
3051 worker.possible_juncs = &possible_juncs;
3052 worker.possible_insertions = &possible_insertions;
3053 worker.possible_fusions = &possible_fusions;
3054 worker.rt = &rt;
3055 if (i == 0)
3056 {
3057 worker.begin_id = 0;
3058 worker.seg_offsets = vector<int64_t>(segmap_fnames.size(), 0);
3059 worker.spliced_seg_offsets = vector<int64_t>(spliced_segmap_fnames.size(), 0);
3060 worker.read_offset = 0;
3061 }
3062 else
3063 {
3064 worker.begin_id = read_ids[i-1];
3065 worker.seg_offsets.insert(worker.seg_offsets.end(),
3066 offsets[i-1].rbegin(),
3067 offsets[i-1].rbegin() + segmap_fnames.size());
3068 worker.spliced_seg_offsets.insert(worker.spliced_seg_offsets.end(),
3069 offsets[i-1].rbegin() + segmap_fnames.size(),
3070 offsets[i-1].rend() - 1);
3071 worker.read_offset = offsets[i-1][0];
3072 }
3073
3074 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
3075
3076 if (num_threads > 1)
3077 threads.push_back(new boost::thread(worker));
3078 else
3079 worker();
3080 }
3081
3082 for (size_t i = 0; i < threads.size(); ++i)
3083 {
3084 threads[i]->join();
3085 delete threads[i];
3086 threads[i] = NULL;
3087 }
3088 threads.clear();
3089
3090 } //driver
3091
3092 int main(int argc, char** argv)
3093 {
3094 fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
3095 fprintf(stderr, "--------------------------------------------\n");
3096
3097 int parse_ret = parse_options(argc, argv, print_usage);
3098 if (parse_ret)
3099 return parse_ret;
3100
3101 if(optind >= argc)
3102 {
3103 print_usage();
3104 return 1;
3105 }
3106
3107 string ref_file_name = argv[optind++];
3108
3109 if(optind >= argc)
3110 {
3111 print_usage();
3112 return 1;
3113 }
3114
3115
3116 string reads_file_name = argv[optind++];
3117
3118 if(optind >= argc)
3119 {
3120 print_usage();
3121 return 1;
3122 }
3123
3124 string juncs_file_list = argv[optind++];
3125
3126 if(optind >= argc)
3127 {
3128 print_usage();
3129 return 1;
3130 }
3131
3132 string insertions_file_list = argv[optind++];
3133 if(optind >= argc)
3134 {
3135 print_usage();
3136 return 1;
3137 }
3138
3139 string deletions_file_list = argv[optind++];
3140
3141 if(optind >= argc)
3142 {
3143 print_usage();
3144 return 1;
3145 }
3146
3147 string fusions_file_list = argv[optind++];
3148
3149 if(optind >= argc)
3150 {
3151 print_usage();
3152 return 1;
3153 }
3154
3155 string bam_output_fname = argv[optind++];
3156
3157 if(optind >= argc)
3158 {
3159 print_usage();
3160 return 1;
3161 }
3162
3163 string segmap_file_list = argv[optind++];
3164 string spliced_segmap_flist;
3165 if(optind < argc)
3166 {
3167 spliced_segmap_flist = argv[optind++];
3168 }
3169
3170 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
3171 if (!ref_stream.good())
3172 err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
3173
3174 checkSamHeader();
3175
3176 //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
3177 vector<string> segmap_file_names;
3178 tokenize(segmap_file_list, ",",segmap_file_names);
3179
3180 vector<string> juncs_file_names;
3181 vector<FILE*> juncs_files;
3182 tokenize(juncs_file_list, ",",juncs_file_names);
3183 for (size_t i = 0; i < juncs_file_names.size(); ++i) {
3184 //fprintf(stderr, "Opening %s for reading\n",
3185 // juncs_file_names[i].c_str());
3186 FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
3187 if (juncs_file == NULL) {
3188 fprintf(stderr, "Warning: cannot open %s for reading\n",
3189 juncs_file_names[i].c_str());
3190 continue;
3191 }
3192 juncs_files.push_back(juncs_file);
3193 }
3194
3195 /*
3196 * Read in the deletion file names
3197 */
3198 vector<string> deletions_file_names;
3199 vector<FILE*> deletions_files;
3200 tokenize(deletions_file_list, ",",deletions_file_names);
3201 for (size_t i = 0; i < deletions_file_names.size(); ++i)
3202 {
3203 //fprintf(stderr, "Opening %s for reading\n",
3204 // deletions_file_names[i].c_str());
3205 FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
3206 if (deletions_file == NULL) {
3207 fprintf(stderr, "Warning: cannot open %s for reading\n",
3208 deletions_file_names[i].c_str());
3209 continue;
3210 }
3211 deletions_files.push_back(deletions_file);
3212 }
3213
3214 /*
3215 * Read in the list of filenames that contain
3216 * insertion coordinates
3217 */
3218
3219 vector<string> insertions_file_names;
3220 vector<FILE*> insertions_files;
3221 tokenize(insertions_file_list, ",",insertions_file_names);
3222 for (size_t i = 0; i < insertions_file_names.size(); ++i)
3223 {
3224 //fprintf(stderr, "Opening %s for reading\n",
3225 // insertions_file_names[i].c_str());
3226 FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
3227 if (insertions_file == NULL)
3228 {
3229 fprintf(stderr, "Warning: cannot open %s for reading\n",
3230 insertions_file_names[i].c_str());
3231 continue;
3232 }
3233 insertions_files.push_back(insertions_file);
3234 }
3235
3236 vector<string> spliced_segmap_file_names;
3237 vector<FZPipe> spliced_segmap_files;
3238 string unzcmd;
3239 tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
3240
3241 vector<string> fusions_file_names;
3242 vector<FILE*> fusions_files;
3243 tokenize(fusions_file_list, ",",fusions_file_names);
3244 for (size_t i = 0; i < fusions_file_names.size(); ++i)
3245 {
3246 fprintf(stderr, "Opening %s for reading\n",
3247 fusions_file_names[i].c_str());
3248 FILE* fusions_file = fopen(fusions_file_names[i].c_str(), "r");
3249 if (fusions_file == NULL)
3250 {
3251 fprintf(stderr, "Warning: cannot open %s for reading\n",
3252 fusions_file_names[i].c_str());
3253 continue;
3254 }
3255 fusions_files.push_back(fusions_file);
3256 }
3257
3258
3259 driver(bam_output_fname,
3260 ref_stream,
3261 juncs_files,
3262 insertions_files,
3263 deletions_files,
3264 fusions_files,
3265 spliced_segmap_file_names,
3266 segmap_file_names,
3267 reads_file_name);
3268
3269 return 0;
3270 }