ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/fusions.cpp
Revision: 200
Committed: Fri Mar 9 22:25:25 2012 UTC (12 years, 5 months ago) by gpertea
File size: 27584 byte(s)
Log Message:
Line File contents
1 /*
2 * fusions.cpp
3 * TopHat
4 */
5
6 #ifdef HAVE_CONFIG_H
7 #include <config.h>
8 #else
9 #define PACKAGE_VERSION "INTERNAL"
10 #define SVN_REVISION "XXX"
11 #endif
12
13
14 #include <cassert>
15 #include <cstdio>
16 #include <cstring>
17 #include <vector>
18 #include <string>
19 #include <map>
20 #include <algorithm>
21 #include <set>
22 #include <stdexcept>
23 #include <iostream>
24 #include <fstream>
25 #include <seqan/sequence.h>
26 #include <seqan/find.h>
27 #include <seqan/file.h>
28 #include <seqan/modifier.h>
29 #include <seqan/align.h>
30 #include <getopt.h>
31
32 #include "common.h"
33 #include "bwt_map.h"
34 #include "junctions.h"
35 #include "fusions.h"
36 #include "fragments.h"
37 #include "wiggles.h"
38 #include "tokenize.h"
39 #include "reads.h"
40
41 #include "inserts.h"
42
43 // daehwan - replace this with that of SeqAn
44 int difference(const string& first, const string& second)
45 {
46 int len = seqan::length(first);
47 if (len != (int)seqan::length(second))
48 return 0;
49
50 int min_value = 10000;
51 short value1[1024] = {0,};
52 short value2[1024] = {0,};
53
54 short* curr = value1;
55 short* prev = value2;
56
57 for (int j = 0; j < len; ++j)
58 {
59 for (int i = 0; i < len; ++i)
60 {
61 int value = 10000;
62 int match = first[i] == second[j] ? 0 : 1;
63
64 // right
65 if (i == 0)
66 value = j * 2 + match;
67 else if (j > 0)
68 value = prev[i] + 2;
69
70 int temp_value = 10000;
71
72 // down
73 if (j == 0)
74 temp_value = i * 2 + match;
75 else if (i > 0)
76 temp_value = curr[i-1] + 2;
77
78 if (temp_value < value)
79 value = temp_value;
80
81 // match
82 if (i > 0 && j > 0)
83 temp_value = prev[i-1] + match;
84
85 if (temp_value < value)
86 value = temp_value;
87
88 curr[i] = value;
89
90 if ((i == len - 1 || j == len - 1) && value < min_value)
91 min_value = value;
92 }
93
94 short* temp = prev;
95 prev = curr;
96 curr = temp;
97 }
98
99 return min_value;
100 }
101
102
103 /**
104 * Add fusions from an alignment to an FusionSet.
105 * This will look for fusion in the alignment specified by bh.
106 * @param bh The bowtie hit to be used to specify alignment infromation.
107 * @param fusions The FusionSet that will be updated with the insertion information from teh alignment.
108 */
109 void fusions_from_alignment(const BowtieHit& bh,
110 FusionSet& fusions,
111 RefSequenceTable& rt,
112 bool update_stat)
113 {
114 RefSequenceTable::Sequence* ref_str1 = rt.get_seq(bh.ref_id());
115 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(bh.ref_id2());
116
117 if (!ref_str1 || !ref_str2)
118 return;
119
120 vector<Fusion> new_fusions;
121 fusions_from_spliced_hit(bh, new_fusions);
122
123 for(size_t i = 0; i < new_fusions.size(); ++i)
124 {
125 Fusion fusion = new_fusions[i];
126 const vector<CigarOp>& cigars = bh.cigar();
127
128 /*
129 * Assume read is in the same direction as fusion.
130 */
131
132 // Find the position of Fusion.
133 size_t fusion_pos = 0;
134 for (; fusion_pos < cigars.size(); ++fusion_pos)
135 {
136 CigarOpCode opcode = cigars[fusion_pos].opcode;
137 if (opcode == FUSION_FF || opcode == FUSION_FR || opcode == FUSION_RF || opcode == FUSION_RR)
138 break;
139 }
140
141 if (fusion_pos <= 0 || fusion_pos + 1 >= cigars.size())
142 continue;
143
144 // For left bases,
145 size_t left_pos = 0;
146 for (int j = (int)fusion_pos - 1; j >= 0; --j)
147 {
148 const CigarOp& cigar = cigars[j];
149 switch (cigar.opcode)
150 {
151 case MATCH:
152 case mATCH:
153 case REF_SKIP:
154 case rEF_SKIP:
155 case DEL:
156 case dEL:
157 left_pos += cigar.length;
158 break;
159 default:
160 break;
161 }
162 }
163
164 // For right bases,
165 size_t right_pos = 0;
166 for (size_t j = fusion_pos + 1; j < cigars.size(); ++j)
167 {
168 const CigarOp& cigar = cigars[j];
169 switch (cigar.opcode)
170 {
171 case MATCH:
172 case mATCH:
173 case REF_SKIP:
174 case rEF_SKIP:
175 case DEL:
176 case dEL:
177 right_pos += cigar.length;
178 break;
179 default:
180 break;
181 }
182 }
183
184 if (left_pos < fusion_anchor_length || right_pos < fusion_anchor_length)
185 continue;
186
187 FusionSet::iterator itr = fusions.find(fusion);
188 if (itr != fusions.end())
189 {
190 itr->second.count += 1;
191 }
192 else
193 {
194 assert(fusion.refid1 != 0xFFFFFFFF);
195 FusionStat fusionStat;
196 fusionStat.count = 1;
197 fusions[fusion] = fusionStat;
198
199 itr = fusions.find(fusion);
200
201 if (!update_stat)
202 {
203 /*
204 * make a reversed fusion.
205 * this is necessary to detect reads that contradict the fusion.
206 */
207
208 FusionStat fusionStat_rev;
209 fusionStat_rev.count = 1;
210 fusionStat_rev.reversed = true;
211
212 Fusion fusion_rev(fusion.refid2, fusion.refid1, fusion.right, fusion.left, fusion.dir);
213 fusions[fusion_rev] = fusionStat_rev;
214 }
215 }
216
217 if (update_stat)
218 {
219 if (itr->second.chr1_seq.length() <= 0)
220 {
221 size_t len = 100;
222 size_t half_len = len / 2;
223 size_t increase = 20;
224
225 if (fusion.left >= half_len && fusion.left + half_len <= seqan::length(*ref_str1) &&
226 fusion.right >= half_len && fusion.right + half_len <= seqan::length(*ref_str2))
227 {
228 seqan::Dna5String left, right;
229
230 if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR)
231 {
232 left = seqan::infix(*ref_str1, fusion.left - half_len, fusion.left + half_len);
233 seqan::reverseComplement(left);
234 }
235 else
236 left = seqan::infix(*ref_str1, fusion.left - half_len + 1, fusion.left + half_len + 1);
237
238 if (fusion.dir == FUSION_FR || fusion.dir == FUSION_RR)
239 {
240 right = seqan::infix(*ref_str2, fusion.right - half_len + 1, fusion.right + half_len + 1);
241 seqan::reverseComplement(right);
242 }
243 else
244 right = seqan::infix(*ref_str2, fusion.right - half_len, fusion.right + half_len);
245
246 itr->second.chr1_seq = DnaString_to_string(left);
247 itr->second.chr2_seq = DnaString_to_string(right);
248
249 for (size_t j = 0; j < 5; ++j)
250 {
251 size_t pos = (5 - j - 1) * increase / 2;
252 const string& left_sub = itr->second.chr1_seq.substr(pos, (j+1) * increase);
253 const string& right_sub = itr->second.chr2_seq.substr(pos, (j+1) * increase);
254
255 itr->second.diffs.push_back(difference(left_sub, right_sub));
256 }
257 }
258 }
259
260 assert (bh.ref_id() == itr->first.refid1);
261
262 itr->second.left_ext = max((size_t)itr->second.left_ext, left_pos);
263 itr->second.right_ext = max((size_t)itr->second.right_ext, right_pos);
264
265 for (size_t k = 0; k < left_pos && k < itr->second.left_bases.size(); ++k)
266 {
267 ++(itr->second.left_bases[k]);
268 }
269
270 for (size_t k = 0; k < right_pos && k < itr->second.right_bases.size(); ++k)
271 {
272 ++(itr->second.right_bases[k]);
273 }
274 }
275 }
276 }
277
278 void unsupport_fusions(const BowtieHit& bh, FusionSet& fusions, const FusionSet& fusions_ref)
279 {
280 if (bh.fusion_opcode() != FUSION_NOTHING || bh.is_spliced() || bh.read_len() < 40)
281 return;
282
283 FusionSet::const_iterator lb, ub;
284
285 uint32_t left = bh.left() + 20;
286 uint32_t right = bh.right() - 20;
287
288 lb = fusions_ref.upper_bound(Fusion(0u, 0u, left, 0));
289 ub = fusions_ref.lower_bound(Fusion(0xffffffffu, 0xffffffffu, right, 0xffffffffu));
290 while (lb != ub && lb != fusions_ref.end())
291 {
292 if (lb->first.refid1 == bh.ref_id())
293 {
294 // daehwan
295 #if 0
296 // MCF-7 RPS6KB1 17:57970443-58027925:1 TMEM49 17:57784863-57917950:1
297 if ((lb->first.left == 57992061 && lb->first.right == 57917126) ||
298 (lb->first.left == 57917126 && lb->first.right == 57992061))
299 {
300 const char* dir_str = "ff";
301 if (lb->first.dir == FUSION_FR)
302 dir_str = "fr";
303 else if (lb->first.dir == FUSION_RF)
304 dir_str = "rf";
305
306 cout << "fusion: " << lb->first.left << "-" << lb->first.right << endl;
307 cout << dir_str << endl;
308 cout << bh.insert_id() << ": " << bh.left() << "-" << bh.right() << "\t";
309 cout << print_cigar(bh.cigar()) << " " << (int)bh.edit_dist() << endl;
310 cout << bh.seq() << endl;
311 cout << bh.qual() << endl;
312 cout << endl;
313 }
314 #endif
315
316 FusionSet::iterator itr;
317 if (lb->second.reversed)
318 itr = fusions.find(Fusion(lb->first.refid2, lb->first.refid1, lb->first.right, lb->first.left, lb->first.dir));
319 else
320 itr = fusions.find(lb->first);
321
322 if (itr == fusions.end())
323 {
324 FusionStat fusionStat;
325 fusionStat.unsupport_count = 1;
326 fusions[lb->first] = fusionStat;
327 }
328 else
329 ++(itr->second.unsupport_count);
330 }
331
332 ++lb;
333 }
334 }
335
336 /**
337 */
338 void print_fusions(FILE* fusions_out, FusionSet& fusions, RefSequenceTable& ref_sequences)
339 {
340 // fprintf(fusions_out, "track name=fusions description=\"TopHat fusions\"\n");
341
342 vector<Fusion> vFusion;
343 for (FusionSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
344 {
345 vFusion.push_back(itr->first);
346 }
347 sort(vFusion.begin(), vFusion.end());
348
349 for (size_t i = 0; i < vFusion.size(); ++i)
350 {
351 FusionSet::iterator itr = fusions.find(vFusion[i]);
352 int counts = itr->second.count;
353 if (counts <= 0 || itr->second.reversed)
354 continue;
355
356 const char* dir = "";
357 if (itr->first.dir == FUSION_FF)
358 dir = "ff";
359 else if(itr->first.dir == FUSION_FR)
360 dir = "fr";
361 else if(itr->first.dir == FUSION_RF)
362 dir = "rf";
363 else
364 dir = "rr";
365
366 assert (itr->second.left_bases.size() == itr->second.right_bases.size());
367
368 float symm = 0.0f;
369 for (uint32_t i = 0; i < itr->second.left_bases.size(); ++i)
370 {
371 float term = ((int)itr->second.left_bases[i] - (int)itr->second.right_bases[i]) / (float)counts;
372 symm += (term * term);
373 }
374
375 fprintf(fusions_out, "%s-%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.6f",
376 ref_sequences.get_name(itr->first.refid1),
377 ref_sequences.get_name(itr->first.refid2),
378 itr->first.left,
379 itr->first.right,
380 dir,
381 counts,
382 itr->second.pair_count,
383 itr->second.pair_count_fusion,
384 itr->second.unsupport_count,
385 itr->second.left_ext,
386 itr->second.right_ext,
387 symm);
388
389 fprintf(fusions_out, "\t@\t");
390
391 for (uint32_t i = 0; i < itr->second.diffs.size(); ++i)
392 {
393 fprintf(fusions_out, "%d ", itr->second.diffs[i]);
394 }
395
396 fprintf(fusions_out, "\t@\t");
397
398 uint32_t half_length = itr->second.chr1_seq.length() / 2;
399 fprintf(fusions_out, "%s %s\t@\t", itr->second.chr1_seq.substr(0, half_length).c_str(), itr->second.chr1_seq.substr(half_length).c_str());
400 fprintf(fusions_out, "%s %s\t@\t", itr->second.chr2_seq.substr(0, half_length).c_str(), itr->second.chr2_seq.substr(half_length).c_str());
401
402 for (uint32_t i = 0; i < itr->second.left_bases.size(); ++i)
403 {
404 fprintf(fusions_out, "%d ", itr->second.left_bases[i]);
405 }
406
407 fprintf(fusions_out, "\t@\t");
408
409 for (uint32_t i = 0; i < itr->second.right_bases.size(); ++i)
410 {
411 fprintf(fusions_out, "%d ", itr->second.right_bases[i]);
412 }
413
414 fprintf(fusions_out, "\t@\t");
415
416 sort(itr->second.vPairSupport.begin(), itr->second.vPairSupport.end());
417 for (uint32_t i = 0; i < min((size_t)200, itr->second.vPairSupport.size()); ++i)
418 {
419 fprintf(fusions_out, "%d:%d ", itr->second.vPairSupport[i].ldist, itr->second.vPairSupport[i].rdist);
420 }
421
422 fprintf(fusions_out, "\n");
423 }
424 }
425
426 /**
427 * Extract a list of fusions from a bowtie hit.
428 * Given a bowtie hit, extract a vector of insertions.
429 * @param bh The bowtie hit to use for alignment information.
430 * @param insertions Used to store the resultant vector of insertions.
431 */
432 void fusions_from_spliced_hit(const BowtieHit& bh, vector<Fusion>& fusions, bool auto_sort)
433 {
434 const vector<CigarOp>& cigar = bh.cigar();
435 unsigned int positionInGenome = bh.left();
436
437 for(size_t c = 0; c < cigar.size(); ++c)
438 {
439 Fusion fusion;
440 switch(cigar[c].opcode)
441 {
442 case REF_SKIP:
443 case MATCH:
444 case DEL:
445 positionInGenome += cigar[c].length;
446 break;
447 case rEF_SKIP:
448 case mATCH:
449 case dEL:
450 positionInGenome -= cigar[c].length;
451 break;
452 case FUSION_FF:
453 case FUSION_FR:
454 case FUSION_RF:
455 case FUSION_RR:
456 fusion.dir = cigar[c].opcode;
457 if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR)
458 positionInGenome = positionInGenome + 1;
459 else
460 positionInGenome = positionInGenome - 1;
461
462 if (bh.ref_id() < bh.ref_id2() ||
463 (bh.ref_id() == bh.ref_id2() && positionInGenome < cigar[c].length) ||
464 !auto_sort)
465 {
466 fusion.refid1 = bh.ref_id();
467 fusion.refid2 = bh.ref_id2();
468 fusion.left = positionInGenome;
469 fusion.right = cigar[c].length;
470 }
471 else
472 {
473 assert (auto_sort);
474 fusion.refid1 = bh.ref_id2();
475 fusion.refid2 = bh.ref_id();
476 fusion.left = cigar[c].length;
477 fusion.right = positionInGenome;
478 }
479
480 fusions.push_back(fusion);
481 break;
482 default:
483 break;
484 }
485 }
486 }
487
488 void pair_support(const vector<pair<BowtieHit, BowtieHit> >& best_hits, FusionSet& fusions, FusionSet& fusions_ref)
489 {
490 if (best_hits.size() > fusion_multipairs)
491 return;
492
493 for (size_t i = 0; i < best_hits.size(); ++i)
494 {
495 const BowtieHit& lh = best_hits[i].first;
496 const BowtieHit& rh = best_hits[i].second;
497
498 bool left_fusionSpanned = lh.fusion_opcode() != FUSION_NOTHING;
499 bool right_fusionSpanned = rh.fusion_opcode() != FUSION_NOTHING;
500 if (left_fusionSpanned && right_fusionSpanned)
501 continue;
502
503 bool fusionSpanned = left_fusionSpanned || right_fusionSpanned;
504 bool fusion_leftSide = false;
505
506 uint32_t ref_id1 = lh.ref_id2();
507 uint32_t ref_id2 = rh.ref_id();
508
509 int dir = FUSION_FF;
510 if (fusionSpanned)
511 {
512 if (left_fusionSpanned)
513 dir = lh.fusion_opcode();
514 else
515 dir = rh.fusion_opcode();
516 }
517 else
518 {
519 if (!lh.antisense_align() && !rh.antisense_align())
520 dir = FUSION_FR;
521 else if (lh.antisense_align() && rh.antisense_align())
522 dir = FUSION_RF;
523 else
524 {
525 if (lh.ref_id() == rh.ref_id())
526 {
527 if ((lh.antisense_align() && lh.left() > rh.left()) ||
528 (!lh.antisense_align() && lh.left() < rh.left()))
529 dir = FUSION_FF;
530 else
531 dir = FUSION_RR;
532 }
533 else
534 {
535 if ((lh.antisense_align() && lh.ref_id() > rh.ref_id()) ||
536 (!lh.antisense_align() && lh.ref_id() < rh.ref_id()))
537 dir = FUSION_FF;
538 else
539 dir = FUSION_RR;
540 }
541 }
542 }
543
544 FusionSet::iterator lb, ub;
545 bool unsupport = false;
546
547 // int inner_dist = max_report_intron_length * 2;
548
549 int inner_dist = 10000;
550 int outer_dist = 10000 * 2;
551 int max_dist = 10000 * 2;
552 int left1 = 0, left2 = 0, right1 = 0, right2 = 0;
553
554 if (fusionSpanned)
555 {
556 vector<Fusion> new_fusions;
557 if (left_fusionSpanned)
558 fusions_from_spliced_hit(lh, new_fusions);
559 else
560 fusions_from_spliced_hit(rh, new_fusions);
561
562 Fusion& fusion = new_fusions[0];
563 dir = fusion.dir;
564 ref_id1 = fusion.refid1;
565 ref_id2 = fusion.refid2;
566
567 if (left_fusionSpanned && lh.ref_id() != ref_id2 && lh.ref_id2() != ref_id2)
568 unsupport = true;
569
570 if (right_fusionSpanned && rh.ref_id() != ref_id1 && rh.ref_id2() != ref_id1)
571 unsupport = true;
572
573 int temp1, temp2;
574 const BowtieHit* fusionHit;
575 const BowtieHit* otherHit;
576
577 if (left_fusionSpanned)
578 {
579 fusionHit = &lh;
580 otherHit = &rh;
581 }
582 else
583 {
584 fusionHit = &rh;
585 otherHit = &lh;
586 }
587
588 // check the normal hit (otherHit) is on the left hand side of the fusion.
589 if (ref_id1 == ref_id2)
590 {
591 if (dir == FUSION_FF || dir == FUSION_FR)
592 fusion_leftSide = otherHit->left() < fusionHit->left();
593 else if (dir == FUSION_RF)
594 fusion_leftSide = otherHit->left() < fusionHit->right();
595 else
596 fusion_leftSide = otherHit->right() > fusionHit->left();
597 }
598 else
599 fusion_leftSide = fusionHit->ref_id() == otherHit->ref_id();
600
601 // *****
602 // daehwan - make sure what '+' and '-' really mean for FF, FR, RF, RR cases
603 // *****
604 if ((dir != FUSION_RF && dir != FUSION_RR) && fusionHit->antisense_align() && (!fusion_leftSide || otherHit->antisense_align()))
605 unsupport = true;
606
607 if ((dir != FUSION_FR && dir != FUSION_RR) && !fusionHit->antisense_align() && (fusion_leftSide || !otherHit->antisense_align()))
608 unsupport = true;
609
610 if ((dir == FUSION_FR || dir == FUSION_RR) && !fusionHit->antisense_align() && (fusion_leftSide || otherHit->antisense_align()))
611 unsupport = true;
612
613 if ((dir == FUSION_RF || dir == FUSION_RR) && fusionHit->antisense_align() && (!fusion_leftSide || !otherHit->antisense_align()))
614 unsupport = true;
615
616 temp1 = otherHit->left();
617 temp2 = otherHit->right();
618
619 if ((fusion_leftSide && dir == FUSION_RF) || (!fusion_leftSide && dir != FUSION_FR))
620 {
621 temp2 = temp1 + inner_dist;
622 if (temp1 > outer_dist)
623 temp1 = temp1 - outer_dist;
624 else
625 temp1 = 0;
626 }
627 else
628 {
629 if (temp2 >= inner_dist)
630 temp1 = temp2 - inner_dist;
631 else
632 temp1 = 0;
633
634 temp2 = temp2 + outer_dist;
635 }
636
637 if (fusion_leftSide)
638 {
639 left1 = temp1;
640 left2 = temp2;
641 }
642 else
643 {
644 right1 = temp1;
645 right2 = temp2;
646 }
647
648 lb = fusions_ref.find(fusion);
649 ub = fusions_ref.end();
650
651 // daehwan - debug
652 #if 0
653 if (fusion.left == 6994359 && fusion.right == 17581683)
654 {
655 cout << "daehwan - test - pair_with_fusion: " << lh.insert_id() << endl;
656 cout << "edit dist: " << (int)lh.edit_dist() << ", " << (int)rh.edit_dist() << endl;
657 const char* dir_str = "ff";
658 if (dir == FUSION_FR)
659 dir_str = "fr";
660 else if (dir == FUSION_RF)
661 dir_str = "rf";
662 else if (dir == FUSION_RR)
663 dir_str = "rr";
664
665 cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl;
666 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
667 cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl;
668 cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl;
669 cout << rh.ref_id2() << ": " << print_cigar(rh.cigar()) << endl;
670 cout << "found: " << (lb == ub ? "no" : "yes") << endl;
671 cout << "unsupport: " << (unsupport ? "yes" : "no") << endl;
672 cout << "fusion_left: " << (fusion_leftSide ? "yes" : "no") << endl;
673 cout << endl;
674 }
675 #endif
676 }
677 else
678 {
679 if (dir == FUSION_FF)
680 {
681 if (lh.antisense_align())
682 {
683 if (rh.right() >= inner_dist)
684 right1 = rh.right() - inner_dist;
685 right2 = right1 + outer_dist;
686
687 left2 = lh.left() + inner_dist;
688 if (left2 > outer_dist)
689 left1 = left2 - outer_dist;
690 }
691 else
692 {
693 if (lh.right() >= inner_dist)
694 left1 = lh.right() - inner_dist;
695 left2 = left1 + outer_dist;
696
697 right2 = rh.left() + inner_dist;
698 if (right2 > outer_dist)
699 right1 = right2 - outer_dist;
700 }
701 }
702 else if (dir == FUSION_FR)
703 {
704 if (lh.right() >= inner_dist)
705 left1 = lh.right() - inner_dist;
706 left2 = left1 + outer_dist;
707
708 if (rh.right() >= inner_dist)
709 right1 = rh.right() - inner_dist;
710 right2 = right1 + outer_dist;
711 }
712 else if (dir == FUSION_RF)
713 {
714 left2 = lh.left() + inner_dist;
715 right2 = rh.left() + inner_dist;
716
717 if (left2 > outer_dist)
718 left1 = left2 - outer_dist;
719
720 if (right2 > outer_dist)
721 right1 = right2 - outer_dist;
722 }
723 else // if (dir == FUSION_RR)
724 {
725 if (lh.antisense_align())
726 {
727 left2 = lh.left() + inner_dist;
728 if (left2 > outer_dist)
729 left1 = left2 - outer_dist;
730
731 if (rh.right() >= inner_dist)
732 right1 = rh.right() - inner_dist;
733 right2 = right1 + outer_dist;
734 }
735 else
736 {
737 if (lh.right() >= inner_dist)
738 left1 = lh.right() - inner_dist;
739 left2 = left1 + outer_dist;
740
741 right2 = rh.left() + inner_dist;
742 if (right2 > outer_dist)
743 right1 = right2 - outer_dist;
744 }
745 }
746
747 // daehwan - debug
748 #if 0
749 if (fusion.left == 6994359 && fusion.right == 17581683)
750 {
751 const char* dir_str = "ff";
752 if (dir == FUSION_FR)
753 dir_str = "fr";
754 else if (dir == FUSION_RF)
755 dir_str = "rf";
756 else if (dir == FUSION_RR)
757 dir_str = "rr";
758
759 cout << "paired-end from two chromosomes" << endl;
760 cout << "insert id: " << lh.insert_id() << endl;
761 cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl;
762 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
763 cout << lh.ref_id2() << " " << print_cigar(lh.cigar()) << endl;
764 cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl;
765 cout << rh.ref_id2() << " " << print_cigar(rh.cigar()) << endl;
766 cout << "left: " << left1 << "-" << left2 << endl;
767 cout << "right: " << right1 << "-" << right2 << endl;
768 cout << endl;
769 }
770 #endif
771
772 if (ref_id1 > ref_id2 || (ref_id1 == ref_id2 && lh.left() > rh.left()))
773 {
774 uint32_t temp = ref_id1;
775 ref_id1 = ref_id2;
776 ref_id2 = temp;
777
778 temp = left1;
779 left1 = right1;
780 right1 = temp;
781
782 temp = left2;
783 left2 = right2;
784 right2 = temp;
785 }
786
787 lb = fusions_ref.upper_bound(Fusion(ref_id1, ref_id2, left1, right1));
788 ub = fusions_ref.lower_bound(Fusion(ref_id1, ref_id2, left2, right2));
789
790 // daehwan
791 #if 0
792 static const uint32_t chr_id1 = RefSequenceTable::hash_string("chr2");
793 static const uint32_t chr_id2 = RefSequenceTable::hash_string("chr3");
794
795 if ((lh.ref_id() == chr_id1 && rh.ref_id() == chr_id2) ||
796 (lh.ref_id() == chr_id2 && rh.ref_id() == chr_id1))
797 {
798 // KPL-4 BSG 19:571325-583492:1 NFIX 19:13106584-13209610:1
799 // const uint32_t left1 = 571325, right1 = 583492, left2 = 13106584, right2 = 13209610;
800
801 // SK-BR-3 DHX35 20:37590942-37668366:1 ITCH 20:32951041-33099198:1
802 // const uint32_t left1 = 37590942, right1 = 37668366, left2 = 32951041, right2 = 33099198;
803
804 // SK-BR-3 NFS1 20:34220262-34287287:-1 PREX1 20:47240790-47444420:-1
805 // const uint32_t left1 = 34220262, right1 = 34287287, left2 = 47240790, right2 = 47444420;
806
807 // VCaP TIA1 2:70436576-70475792:-1 DIRC2 3:122513642-122599986:1
808 uint32_t left1 = 70436576, right1 = 70475792, left2 = 122513642, right2 = 122599986;
809 if (lh.ref_id() != chr_id1)
810 {
811 uint32_t temp = left1;
812 left1 = left2;
813 left2 = temp;
814
815 temp = right1;
816 right1 = right2;
817 right2 = temp;
818 }
819
820 if ((lh.left() >= left1 && lh.left() <= right1 && rh.left() >= left2 && rh.left() <= right2) ||
821 (lh.left() >= left2 && lh.left() <= right2 && rh.left() >= left1 && rh.left() <= right1))
822 {
823 for (size_t t = 0; t < left.size(); ++t)
824 {
825 const BowtieHit& lh = left[t];
826 const BowtieHit& rh = right[t];
827
828 const char* dir_str = "ff";
829 if (dir == FUSION_FR)
830 dir_str = "fr";
831 else if (dir == FUSION_RF)
832 dir_str = "rf";
833 else if (dir == FUSION_RR)
834 dir_str = "rr";
835
836 cout << "paired-end from two chromosomes" << endl;
837 cout << "insert id: " << lh.insert_id() << endl;
838 cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl;
839 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
840 cout << lh.ref_id2() << " " << print_cigar(lh.cigar()) << endl;
841 cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl;
842 cout << rh.ref_id2() << " " << print_cigar(rh.cigar()) << endl;
843 cout << "left: " << left1 << "-" << left2 << endl;
844 cout << "right: " << right1 << "-" << right2 << endl;
845 cout << endl;
846 }
847 }
848 }
849 #endif
850 }
851
852 while (lb != ub && lb != fusions_ref.end())
853 {
854 if (lb->first.dir == dir &&
855 lb->first.refid1 == ref_id1 &&
856 lb->first.refid2 == ref_id2 &&
857 ((!fusionSpanned && lb->first.right >= right1 && lb->first.right <= right2) || fusionSpanned) &&
858 !lb->second.reversed)
859 {
860 int dist = 0, left_dist = 0, right_dist = 0;
861
862 // daehwan - check this out
863 if (!fusionSpanned || fusion_leftSide)
864 {
865 if (dir == FUSION_RF || dir == FUSION_RR)
866 left_dist = (left2 - inner_dist) - (int)lb->first.left;
867 else
868 left_dist = (int)lb->first.left - (left1 + inner_dist);
869 }
870
871 if (!fusionSpanned || !fusion_leftSide)
872 {
873 if (dir == FUSION_FR || dir == FUSION_RR)
874 right_dist = (int)lb->first.right - (right1 + inner_dist);
875 else
876 right_dist = (right2 - inner_dist) - (int)lb->first.right;
877 }
878
879 dist = abs(left_dist) + abs(right_dist);
880
881 // daehwan - fix this later
882 if (dist < 0 && fusionSpanned)
883 unsupport = true;
884 bool pass = dist < max_dist;
885
886 // daehwan
887 #if 0
888 // if(!fusionSpanned)
889 // if (pass && !unsupport)
890 if (lb->first.left == 6994359 && lb->first.right == 17581683)
891 {
892 const char* dir_str = "ff";
893 if (dir == FUSION_FR)
894 dir_str = "fr";
895 else if (dir == FUSION_RF)
896 dir_str = "rf";
897 else if (dir == FUSION_RR)
898 dir_str = "rr";
899
900 cout << dir_str << endl;
901 cout << "dist: " << dist << endl;
902 cout << lb->first.refid1 << " " << lb->first.refid2 << endl;
903 cout << lb->first.left << "-" << lb->first.right << endl;
904 cout << "unsupport: " << (unsupport ? "yes" : "no") << endl;
905 cout << "pass: " << (pass ? "yes" : "no") << endl;
906 cout << "ids: " << lh.insert_id() << " : " << rh.insert_id() << endl;
907 cout << endl << endl;
908 }
909 #endif
910
911 FusionSet::iterator itr = fusions.find(lb->first);
912 if (itr != fusions.end())
913 {
914 if (unsupport)
915 ++(itr->second.unsupport_count_pair);
916 else if (pass)
917 {
918 if (fusionSpanned)
919 ++(itr->second.pair_count_fusion);
920 else
921 {
922 itr->second.vPairSupport.push_back(FusionPairSupport(left_dist, right_dist));
923 ++(itr->second.pair_count);
924
925 if (itr->second.vPairSupport.size() >= 300)
926 {
927 sort(itr->second.vPairSupport.begin(), itr->second.vPairSupport.end());
928 itr->second.vPairSupport.erase(itr->second.vPairSupport.begin() + 200, itr->second.vPairSupport.end());
929 }
930 }
931 }
932 }
933 else
934 {
935 FusionStat fusionStat;
936 if (unsupport)
937 fusionStat.unsupport_count_pair = 1;
938 else if (pass)
939 {
940 if (fusionSpanned)
941 fusionStat.pair_count_fusion = 1;
942 else
943 {
944 fusionStat.vPairSupport.push_back(FusionPairSupport(left_dist, right_dist));
945 fusionStat.pair_count = 1;
946 }
947 }
948
949 fusions[lb->first] = fusionStat;
950 }
951 }
952
953 if (fusionSpanned)
954 break;
955
956 ++lb;
957 }
958 }
959 }
960
961 void merge_with(FusionSimpleSet& fusions, const FusionSimpleSet& other_fusions)
962 {
963 for (FusionSimpleSet::const_iterator other_itr = other_fusions.begin(); other_itr != other_fusions.end(); ++other_itr)
964 {
965 FusionSimpleSet::iterator itr = fusions.find(other_itr->first);
966 if (itr != fusions.end())
967 {
968 FusionSimpleStat& curr = itr->second;
969 curr.merge_with(other_itr->second);
970 }
971 else
972 {
973 fusions[other_itr->first] = other_itr->second;
974 }
975 }
976 }
977
978 void merge_with(FusionSet& fusions, const FusionSet& other_fusions)
979 {
980 for (FusionSet::const_iterator other_itr = other_fusions.begin(); other_itr != other_fusions.end(); ++other_itr)
981 {
982 FusionSet::iterator itr = fusions.find(other_itr->first);
983 if (itr != fusions.end())
984 {
985 FusionStat& curr = itr->second;
986 curr.merge_with(other_itr->second);
987 }
988 else
989 {
990 fusions[other_itr->first] = other_itr->second;
991 }
992 }
993 }

Properties

Name Value
svn:executable *