ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 260
Committed: Tue Jul 24 20:10:50 2012 UTC (12 years, 1 month ago) by gpertea
File size: 79413 byte(s)
Log Message:
wip tophat.py - adding resume feature

Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include <boost/thread.hpp>
34 #include <boost/random/mersenne_twister.hpp>
35
36 #include "common.h"
37 #include "utils.h"
38 #include "bwt_map.h"
39 #include "junctions.h"
40 #include "insertions.h"
41 #include "deletions.h"
42 #include "fusions.h"
43 #include "align_status.h"
44 #include "fragments.h"
45 #include "wiggles.h"
46 #include "tokenize.h"
47 #include "reads.h"
48 #include "coverage.h"
49 #include "inserts.h"
50 #include "bam_merge.h"
51
52 using namespace std;
53 using namespace seqan;
54 using std::set;
55
56 static const JunctionSet empty_junctions;
57 static const InsertionSet empty_insertions;
58 static const DeletionSet empty_deletions;
59 static const FusionSet empty_fusions;
60 static const Coverage empty_coverage;
61
62 // daehwan - this is redundancy, which should be removed.
63 void get_seqs(istream& ref_stream,
64 RefSequenceTable& rt,
65 bool keep_seqs = true)
66 {
67 while(ref_stream.good() && !ref_stream.eof())
68 {
69 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
70 string name;
71 readMeta(ref_stream, name, Fasta());
72 string::size_type space_pos = name.find_first_of(" \t\r");
73 if (space_pos != string::npos)
74 {
75 name.resize(space_pos);
76 }
77 fprintf(stderr, "\tLoading %s...", name.c_str());
78 seqan::read(ref_stream, *ref_str, Fasta());
79 fprintf(stderr, "done\n");
80
81 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
82 }
83 }
84
85 struct cmp_read_alignment
86 {
87 bool operator() (const BowtieHit& l, const BowtieHit& r) const
88 {
89 return l.alignment_score() > r.alignment_score();
90 }
91 };
92
93 struct cmp_read_equal
94 {
95 bool operator() (const BowtieHit& l, const BowtieHit& r) const
96 {
97 if (l.insert_id() != r.insert_id() ||
98 l.ref_id() != r.ref_id() ||
99 l.ref_id2() != r.ref_id2() ||
100 l.left() != r.left() ||
101 l.right() != r.right() ||
102 l.antisense_align() != r.antisense_align() ||
103 l.mismatches() != r.mismatches() ||
104 l.edit_dist() != r.edit_dist() ||
105 l.cigar().size() != r.cigar().size())
106 return false;
107
108 return true;
109 }
110 };
111
112 void read_best_alignments(const HitsForRead& hits_for_read,
113 HitsForRead& best_hits,
114 const JunctionSet& gtf_junctions,
115 const JunctionSet& junctions = empty_junctions,
116 const InsertionSet& insertions = empty_insertions,
117 const DeletionSet& deletions = empty_deletions,
118 const FusionSet& fusions = empty_fusions,
119 const Coverage& coverage = empty_coverage,
120 bool final_report = false,
121 boost::random::mt19937* rng = NULL)
122 {
123 const vector<BowtieHit>& hits = hits_for_read.hits;
124 if (hits.size() >= max_multihits * 5)
125 return;
126
127 for (size_t i = 0; i < hits.size(); ++i)
128 {
129 if (hits[i].mismatches() > read_mismatches ||
130 hits[i].gap_length() > read_gap_length ||
131 hits[i].edit_dist() > read_edit_dist)
132 continue;
133
134 BowtieHit hit = hits[i];
135 AlignStatus align_status(hit, gtf_junctions,
136 junctions, insertions, deletions, fusions, coverage);
137 hit.alignment_score(align_status._alignment_score);
138
139 if (report_secondary_alignments || !final_report)
140 {
141 best_hits.hits.push_back(hit);
142 }
143 else
144 {
145 // Is the new status better than the current best one?
146 if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0]))
147 {
148 best_hits.hits.clear();
149 best_hits.hits.push_back(hit);
150 }
151 else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good?
152 {
153 best_hits.hits.push_back(hit);
154 }
155 }
156 }
157
158 // due to indel alignments, there may be alignments with the same location
159 std::sort(best_hits.hits.begin(), best_hits.hits.end());
160 vector<BowtieHit>::iterator new_end = std::unique(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_equal());
161 best_hits.hits.erase(new_end, best_hits.hits.end());
162
163 if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0)
164 {
165 sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment());
166 }
167
168 if (final_report)
169 {
170 if (suppress_hits && best_hits.hits.size() > max_multihits)
171 best_hits.hits.clear();
172
173 if (best_hits.hits.size() > max_multihits)
174 {
175 // there may be several alignments with the same alignment scores,
176 // all of which we can not include because of this limit.
177 // let's pick up some of them randomly up to this many max multihits.
178 vector<size_t> tie_indexes;
179 int tie_alignment_score = best_hits.hits[max_multihits - 1].alignment_score();
180 int count_better_alignments = 0;
181 for (size_t i = 0; i < best_hits.hits.size(); ++i)
182 {
183 int temp_alignment_score = best_hits.hits[i].alignment_score();
184 if (temp_alignment_score == tie_alignment_score)
185 tie_indexes.push_back(i);
186 else if (temp_alignment_score < tie_alignment_score)
187 break;
188 else
189 ++count_better_alignments;
190 }
191
192 while (count_better_alignments + tie_indexes.size() > max_multihits)
193 {
194 int random_index = (*rng)() % tie_indexes.size();
195 tie_indexes.erase(tie_indexes.begin() + random_index);
196 }
197
198 for (size_t i = 0; i < tie_indexes.size(); ++i)
199 {
200 if (count_better_alignments + i != tie_indexes[i])
201 best_hits.hits[count_better_alignments + i] = best_hits.hits[tie_indexes[i]];
202 }
203
204 best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
205 }
206 }
207 }
208
209 bool set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, const JunctionSet& junctions, InsertAlignmentGrade& grade)
210 {
211 bool fusion = false;
212 bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
213 bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
214 if (left_fusion && right_fusion)
215 return false;
216
217 fusion = left_fusion || right_fusion;
218 if (!fusion && lh.ref_id() != rh.ref_id())
219 fusion = true;
220
221 if (!fusion && lh.ref_id() == rh.ref_id())
222 {
223 if (lh.antisense_align() == rh.antisense_align())
224 fusion = true;
225 else
226 {
227 int inner_dist = 0;
228 if (lh.antisense_align())
229 // used rh.left() instead of rh.right() for the cases,
230 // where reads overlap with each other or reads span introns
231 inner_dist = lh.left() - rh.left();
232 else
233 inner_dist = rh.left() - lh.left();
234
235 if (inner_dist < 0 || inner_dist > (int)fusion_min_dist)
236 fusion = true;
237 }
238 }
239
240 // a read contains its partner, in which case the paired mapping will be ignored.
241 if (!fusion)
242 {
243 if (lh.left() <= rh.left() && lh.right() >= rh.right() && lh.right() - lh.left() > rh.right() - rh.left())
244 return false;
245 else if (rh.left() <= lh.left() && rh.right() >= lh.right() && rh.right() - rh.left() > lh.right() - lh.left())
246 return false;
247 }
248
249 grade = InsertAlignmentGrade(lh, rh, junctions, fusion);
250
251 return true;
252 }
253
254 struct cmp_pair_alignment
255 {
256 cmp_pair_alignment(const JunctionSet& junctions) :
257 _junctions(&junctions) {}
258
259 bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
260 {
261 InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, *_junctions, gl);
262 InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, *_junctions, gr);
263
264 bool better = gr < gl;
265 bool worse = gl < gr;
266
267 if (better && !worse)
268 return true;
269 else
270 return false;
271 }
272
273 const JunctionSet* _junctions;
274 };
275
276 struct cmp_pair_equal
277 {
278 bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
279 {
280 if (!cmp_read_equal()(l.first, r.first) || !cmp_read_equal()(l.second, r.second))
281 return false;
282
283 return true;
284 }
285 };
286
287 struct cmp_pair_less
288 {
289 bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
290 {
291 if (l.first < r.first)
292 return true;
293 else if (r.first < l.first)
294 return false;
295
296 if (l.second < r.second)
297 return true;
298 else if (r.second < l.second)
299 return false;
300
301 return false;
302 }
303 };
304
305 void pair_best_alignments(const HitsForRead& left_hits,
306 const HitsForRead& right_hits,
307 InsertAlignmentGrade& best_grade,
308 vector<pair<BowtieHit, BowtieHit> >& best_hits,
309 const JunctionSet& gtf_junctions,
310 const JunctionSet& junctions = empty_junctions,
311 const InsertionSet& insertions = empty_insertions,
312 const DeletionSet& deletions = empty_deletions,
313 const FusionSet& fusions = empty_fusions,
314 const Coverage& coverage = empty_coverage,
315 bool final_report = false,
316 boost::random::mt19937* rng = NULL)
317 {
318 const vector<BowtieHit>& left = left_hits.hits;
319 const vector<BowtieHit>& right = right_hits.hits;
320
321 if (left.size() >= max_multihits * 5 || right.size() >= max_multihits * 5)
322 return;
323
324 for (size_t i = 0; i < left.size(); ++i)
325 {
326 if (left[i].mismatches() > read_mismatches ||
327 left[i].gap_length() > read_gap_length ||
328 left[i].edit_dist() > read_edit_dist)
329 continue;
330
331 BowtieHit lh = left[i];
332 AlignStatus align_status(lh, gtf_junctions,
333 junctions, insertions, deletions, fusions, coverage);
334 lh.alignment_score(align_status._alignment_score);
335
336 for (size_t j = 0; j < right.size(); ++j)
337 {
338 if (right[j].mismatches() > read_mismatches ||
339 right[j].gap_length() > read_gap_length ||
340 right[j].edit_dist() > read_edit_dist)
341 continue;
342
343 BowtieHit rh = right[j];
344 AlignStatus align_status(rh, gtf_junctions,
345 junctions, insertions, deletions, fusions, coverage);
346 rh.alignment_score(align_status._alignment_score);
347 InsertAlignmentGrade g;
348 bool allowed;
349 allowed = set_insert_alignment_grade(lh, rh, final_report ? junctions : gtf_junctions, g);
350
351 // daehwan - for debugging purposes
352 #if 0
353 if (lh.insert_id() == 325708 && !g.is_fusion() && false)
354 {
355 fprintf(stderr, "lh %d:%d %s score: %d (from %d) NM: %d\n",
356 lh.ref_id(), lh.left(), print_cigar(lh.cigar()).c_str(),
357 lh.alignment_score(), left[i].alignment_score(), lh.edit_dist());
358 fprintf(stderr, "rh %d:%d %s score: %d (from %d) NM: %d\n",
359 rh.ref_id(), rh.left(), print_cigar(rh.cigar()).c_str(),
360 rh.alignment_score(), right[j].alignment_score(), rh.edit_dist());
361 fprintf(stderr, "combined score: %d is_fusion(%d)\n", g.align_score(), g.is_fusion());
362 }
363 #endif
364
365 if (!allowed) continue;
366 if (!fusion_search && !report_discordant_pair_alignments && g.is_fusion()) continue;
367
368 if (report_secondary_alignments || !final_report)
369 {
370 best_hits.push_back(make_pair(lh, rh));
371 }
372 else
373 {
374 // Is the new status better than the current best one?
375 if (best_grade < g)
376 {
377 best_hits.clear();
378 best_grade = g;
379 best_hits.push_back(make_pair(lh, rh));
380 }
381 else if (!(g < best_grade))
382 {
383 best_hits.push_back(make_pair(lh, rh));
384 }
385 }
386 }
387 }
388
389 std::sort(best_hits.begin(), best_hits.end(), cmp_pair_less());
390
391 // daehwan - for debugging purposes
392 #if 0
393 if (best_hits.size() > 0 && best_hits[0].first.insert_id() == 37239044)
394 {
395 for (size_t i = 0; i < best_hits.size(); ++i)
396 {
397 const BowtieHit& lh = best_hits[i].first;
398 const BowtieHit& rh = best_hits[i].second;
399
400 fprintf(stderr, "%d %d:%d %s %d:%d %s\n",
401 i,
402 lh.ref_id(), lh.left(), print_cigar(lh.cigar()).c_str(),
403 rh.ref_id(), rh.left(), print_cigar(rh.cigar()).c_str());
404 }
405
406 fprintf(stderr, "\n\n\n");
407 }
408 #endif
409
410 vector<pair<BowtieHit, BowtieHit> >::iterator new_end = std::unique(best_hits.begin(), best_hits.end(), cmp_pair_equal());
411 best_hits.erase(new_end, best_hits.end());
412
413 // daehwan - for debugging purposes
414 #if 0
415 if (best_hits.size() > 0 && best_hits[0].first.insert_id() == 37239044)
416 {
417 for (size_t i = 0; i < best_hits.size(); ++i)
418 {
419 const BowtieHit& lh = best_hits[i].first;
420 const BowtieHit& rh = best_hits[i].second;
421
422 fprintf(stderr, "%d %d:%d %s %d:%d %s\n",
423 i,
424 lh.ref_id(), lh.left(), print_cigar(lh.cigar()).c_str(),
425 rh.ref_id(), rh.left(), print_cigar(rh.cigar()).c_str());
426 }
427
428 fprintf(stderr, "\n\n\n");
429 }
430 #endif
431
432
433 if ((report_secondary_alignments || !final_report) && best_hits.size() > 0)
434 {
435 cmp_pair_alignment cmp(final_report ? junctions : gtf_junctions);
436 sort(best_hits.begin(), best_hits.end(), cmp);
437 set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, final_report ? junctions : gtf_junctions, best_grade);
438 }
439
440 if (final_report)
441 {
442 if (suppress_hits && best_hits.size() > max_multihits)
443 best_hits.clear();
444
445 if (best_hits.size() > max_multihits)
446 {
447 vector<size_t> tie_indexes;
448 InsertAlignmentGrade temp_grade;
449 set_insert_alignment_grade(best_hits[max_multihits - 1].first, best_hits[max_multihits - 1].second, junctions, temp_grade);
450 int tie_alignment_score = temp_grade.align_score();
451 int count_better_alignments = 0;
452
453 for (size_t i = 0; i < best_hits.size(); ++i)
454 {
455 set_insert_alignment_grade(best_hits[i].first, best_hits[i].second, junctions, temp_grade);
456 int temp_alignment_score = temp_grade.align_score();
457 if (temp_alignment_score == tie_alignment_score)
458 tie_indexes.push_back(i);
459 else if (temp_alignment_score < tie_alignment_score)
460 break;
461 else
462 ++count_better_alignments;
463 }
464
465 while (count_better_alignments + tie_indexes.size() > max_multihits)
466 {
467 int random_index = (*rng)() % tie_indexes.size();
468 tie_indexes.erase(tie_indexes.begin() + random_index);
469 }
470
471 for (size_t i = 0; i < tie_indexes.size(); ++i)
472 {
473 if (count_better_alignments + i != tie_indexes[i])
474 best_hits[count_better_alignments + i] = best_hits[tie_indexes[i]];
475 }
476
477 best_hits.erase(best_hits.begin() + max_multihits, best_hits.end());
478 }
479 }
480
481 best_grade.num_alignments = best_hits.size();
482 }
483
484 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
485
486 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
487 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
488 int num_hits, const BowtieHit* next_hit, int hitIndex) {
489 bool XS_found = false;
490 if (sam_toks.size()>11) {
491
492 for (size_t i=11;i<sam_toks.size();++i) {
493 if (sam_toks[i].find("NH:i:")==string::npos &&
494 sam_toks[i].find("XS:i:")==string::npos)
495 auxdata.push_back(sam_toks[i]);
496
497 if (sam_toks[i].find("XS:A:")!=string::npos)
498 XS_found = true;
499 }
500 }
501 string aux("NH:i:");
502 str_appendInt(aux, num_hits);
503 auxdata.push_back(aux);
504 if (next_hit) {
505 const char* nh_ref_name = "=";
506 nh_ref_name = rt.get_name(next_hit->ref_id());
507 assert (nh_ref_name != NULL);
508 bool same_contig=(next_hit->ref_id()==bh.ref_id());
509 aux="CC:Z:";
510 aux+= (same_contig ? "=" : nh_ref_name);
511 auxdata.push_back(aux);
512 aux="CP:i:";
513 int nh_gpos=next_hit->left() + 1;
514 str_appendInt(aux, nh_gpos);
515 auxdata.push_back(aux);
516 } //has next_hit
517 // FIXME: this code is still a bit brittle, because it contains no
518 // consistency check that the mates are on opposite strands - a current protocol
519 // requirement, and that the strand indicated by the alignment is consistent
520 // with the orientation of the splices (though that should be handled upstream).
521 if (!XS_found) {
522 const string xs_f("XS:A:+");
523 const string xs_r("XS:A:-");
524 if (library_type == FR_FIRSTSTRAND) {
525 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
526 if (bh.antisense_align())
527 auxdata.push_back(xs_f);
528 else
529 auxdata.push_back(xs_r);
530 }
531 else {
532 if (bh.antisense_align())
533 auxdata.push_back(xs_r);
534 else
535 auxdata.push_back(xs_f);
536 }
537 }
538 else if (library_type == FR_SECONDSTRAND) {
539 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
540 if (bh.antisense_align())
541 auxdata.push_back(xs_r);
542 else
543 auxdata.push_back(xs_f);
544 }
545 else
546 {
547 if (bh.antisense_align())
548 auxdata.push_back(xs_f);
549 else
550 auxdata.push_back(xs_r);
551 }
552 }
553 }
554 if (hitIndex >= 0)
555 {
556 string aux("HI:i:");
557 str_appendInt(aux, hitIndex);
558 auxdata.push_back(aux);
559 }
560 }
561
562 bool rewrite_sam_record(GBamWriter& bam_writer,
563 const RefSequenceTable& rt,
564 const BowtieHit& bh,
565 const char* bwt_buf,
566 const char* read_alt_name,
567 FragmentType insert_side,
568 int num_hits,
569 const BowtieHit* next_hit,
570 bool primary,
571 int hitIndex)
572 {
573 // Rewrite this hit, filling in the alt name, mate mapping
574 // and setting the pair flag
575 vector<string> sam_toks;
576 tokenize(bwt_buf, "\t", sam_toks);
577
578 string ref_name = sam_toks[2], ref_name2 = "";
579 char cigar1[1024] = {0}, cigar2[1024] = {0};
580 string left_seq, right_seq, left_qual, right_qual;
581 int left1 = -1, left2 = -1;
582 bool fusion_alignment = false;
583 size_t XF_index = 0;
584 for (size_t i = 11; i < sam_toks.size(); ++i)
585 {
586 string& tok = sam_toks[i];
587 if (strncmp(tok.c_str(), "XF", 2) == 0)
588 {
589 fusion_alignment = true;
590 XF_index = i;
591
592 vector<string> tuple_fields;
593 tokenize(tok.c_str(), " ", tuple_fields);
594 vector<string> contigs;
595 tokenize(tuple_fields[1].c_str(), "-", contigs);
596 if (contigs.size() >= 2)
597 {
598 ref_name = contigs[0];
599 ref_name2 = contigs[1];
600 }
601
602 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
603 cigar1, cigar2, left_seq, right_seq,
604 left_qual, right_qual, left1, left2);
605
606 break;
607 }
608
609 else if (strncmp(tok.c_str(), "AS", 2) == 0)
610 {
611 char AS_score[128] = {0};
612 sprintf(AS_score, "AS:i:%d", min(0, bh.alignment_score()));
613 tok = AS_score;
614 }
615 }
616
617 string qname(read_alt_name);
618 size_t slash_pos=qname.rfind('/');
619 if (slash_pos!=string::npos)
620 qname.resize(slash_pos);
621 //read_alt_name as QNAME
622 int flag=atoi(sam_toks[1].c_str()); //FLAG
623 if (insert_side != FRAG_UNPAIRED) {
624 //flag = atoi(sam_toks[1].c_str());
625 // mark this as a singleton mate
626 flag |= 0x0001;
627 if (insert_side == FRAG_LEFT)
628 flag |= 0x0040;
629 else if (insert_side == FRAG_RIGHT)
630 flag |= 0x0080;
631 flag |= 0x0008;
632 //char flag_buf[64];
633 //sprintf(flag_buf, "%d", flag);
634 //sam_toks[t] = flag_buf;
635 }
636 if (!primary)
637 flag |= 0x100;
638
639 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
640 int mapQ = 50;
641 if (num_hits > 1) {
642 double err_prob = 1 - (1.0 / num_hits);
643 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
644 }
645 int tlen =atoi(sam_toks[8].c_str()); //TLEN
646 int mate_pos=atoi(sam_toks[7].c_str());
647
648 string rg_aux = "";
649 if (!sam_readgroup_id.empty())
650 rg_aux = string("RG:Z:") + sam_readgroup_id;
651
652 GBamRecord* bamrec=NULL;
653 if (fusion_alignment) {
654 vector<string> auxdata;
655 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
656 if (rg_aux != "")
657 auxdata.push_back(rg_aux);
658 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
659 cigar1, sam_toks[6].c_str(), mate_pos,
660 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
661 bam_writer.write(bamrec);
662 delete bamrec;
663
664 auxdata.clear();
665 sam_toks[XF_index][5] = '2';
666 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
667 if (rg_aux != "")
668 auxdata.push_back(rg_aux);
669 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
670 cigar2, sam_toks[6].c_str(), mate_pos,
671 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
672 bam_writer.write(bamrec);
673 delete bamrec;
674 } else {
675 vector<string> auxdata;
676 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
677 if (rg_aux != "")
678 auxdata.push_back(rg_aux);
679 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
680 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
681 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
682 bam_writer.write(bamrec);
683 delete bamrec;
684 }
685
686 return true;
687 }
688
689 bool rewrite_sam_record(GBamWriter& bam_writer,
690 const RefSequenceTable& rt,
691 const BowtieHit& bh,
692 const char* bwt_buf,
693 const char* read_alt_name,
694 const InsertAlignmentGrade& grade,
695 FragmentType insert_side,
696 const BowtieHit* partner,
697 int num_hits,
698 const BowtieHit* next_hit,
699 bool primary,
700 int hitIndex)
701 {
702 // Rewrite this hit, filling in the alt name, mate mapping
703 // and setting the pair flag
704 vector<string> sam_toks;
705 tokenize(bwt_buf, "\t", sam_toks);
706 string qname(read_alt_name);
707 size_t slash_pos=qname.rfind('/');
708 if (slash_pos!=string::npos)
709 qname.resize(slash_pos);
710 //read_alt_name as QNAME
711 int flag = atoi(sam_toks[1].c_str());
712 // 0x0010 (strand of query) is assumed to be set correctly
713 // to begin with
714 flag |= 0x0001; //it must be paired
715 if (insert_side == FRAG_LEFT)
716 flag |= 0x0040;
717 else if (insert_side == FRAG_RIGHT)
718 flag |= 0x0080;
719 if (!primary)
720 flag |= 0x100;
721
722 string ref_name = sam_toks[2], ref_name2 = "";
723 char cigar1[1024] = {0}, cigar2[1024] = {0};
724 string left_seq, right_seq, left_qual, right_qual;
725 int left1 = -1, left2 = -1;
726 bool fusion_alignment = false;
727 size_t XF_tok_idx = 11;
728 for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
729 {
730 string& tok = sam_toks[XF_tok_idx];
731 if (strncmp(tok.c_str(), "XF", 2) == 0)
732 {
733 fusion_alignment = true;
734
735 vector<string> tuple_fields;
736 tokenize(tok.c_str(), " ", tuple_fields);
737 vector<string> contigs;
738 tokenize(tuple_fields[1].c_str(), "-", contigs);
739 if (contigs.size() >= 2)
740 {
741 ref_name = contigs[0];
742 ref_name2 = contigs[1];
743 }
744
745 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
746 cigar1, cigar2, left_seq, right_seq,
747 left_qual, right_qual, left1, left2);
748
749 break;
750 }
751
752 else if (strncmp(tok.c_str(), "AS", 2) == 0)
753 {
754 char AS_score[128] = {0};
755 sprintf(AS_score, "AS:i:%d", min(0, bh.alignment_score()));
756 tok = AS_score;
757 }
758 }
759
760 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
761 int mapQ = 50;
762 if (grade.num_alignments > 1) {
763 double err_prob = 1 - (1.0 / grade.num_alignments);
764 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
765 }
766 int tlen=0; //TLEN
767 int mate_pos=atoi(sam_toks[7].c_str());
768 string mate_contig = "*", mate_contig2 = "*";
769 if (partner) {
770 if (partner->ref_id() == bh.ref_id()) {
771 mate_contig = "="; //same chromosome
772 //TLEN:
773 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
774 partner->left() - bh.right();
775 }
776
777 else { //partner on different chromosome/contig
778 mate_contig = rt.get_name(partner->ref_id());
779 }
780 mate_pos = partner->left() + 1;
781 if (grade.happy())
782 flag |= 0x0002;
783 if (partner->antisense_align())
784 flag |= 0x0020;
785
786 if (fusion_alignment)
787 {
788 if (partner->ref_id() == bh.ref_id2())
789 mate_contig2 = "=";
790 else
791 mate_contig2 = rt.get_name(partner->ref_id());
792 }
793
794 if (partner->fusion_opcode() != FUSION_NOTHING)
795 {
796 char partner_pos[1024];
797 sprintf(partner_pos, "XP:Z:%s-%s %d", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1);
798 sam_toks.push_back(partner_pos);
799 }
800 }
801 else {
802 mate_pos = 0;
803 flag |= 0x0008;
804 }
805
806 string rg_aux = "";
807 if (!sam_readgroup_id.empty())
808 rg_aux = string("RG:Z:") + sam_readgroup_id;
809
810 GBamRecord* bamrec=NULL;
811 if (fusion_alignment) {
812 vector<string> auxdata;
813 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
814 if (rg_aux != "")
815 auxdata.push_back(rg_aux);
816 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
817 cigar1, mate_contig.c_str(), mate_pos,
818 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
819 bam_writer.write(bamrec);
820 delete bamrec;
821
822 auxdata.clear();
823 sam_toks[XF_tok_idx][5] = '2';
824 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
825 if (rg_aux != "")
826 auxdata.push_back(rg_aux);
827 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
828 cigar2, mate_contig2.c_str(), mate_pos,
829 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
830 bam_writer.write(bamrec);
831 delete bamrec;
832 } else {
833 vector<string> auxdata;
834 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
835 if (rg_aux != "")
836 auxdata.push_back(rg_aux);
837 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
838 sam_toks[5].c_str(), mate_contig.c_str(), mate_pos,
839 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
840 bam_writer.write(bamrec);
841 delete bamrec;
842 }
843
844 return true;
845 }
846
847 struct lex_hit_sort
848 {
849 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
850 : _rt(rt), _hits(hits)
851 {}
852
853 bool operator()(const uint32_t& l, const uint32_t& r) const
854 {
855 const BowtieHit& lhs = _hits.hits[l];
856 const BowtieHit& rhs = _hits.hits[r];
857
858 uint32_t l_id = lhs.ref_id();
859 uint32_t r_id = rhs.ref_id();
860 if (l_id != r_id)
861 return l_id < r_id;
862
863 return lhs.left() < rhs.left();
864 }
865
866 const RefSequenceTable& _rt;
867 const HitsForRead& _hits;
868 };
869
870 void print_sam_for_single(const RefSequenceTable& rt,
871 const HitsForRead& hits,
872 FragmentType frag_type,
873 const string& alt_name,
874 GBamWriter& bam_writer,
875 boost::random::mt19937& rng)
876 {
877 //assert(!read.alt_name.empty());
878 if (hits.hits.empty())
879 return;
880
881 lex_hit_sort s(rt, hits);
882 vector<uint32_t> index_vector;
883
884 size_t i;
885 for (i = 0; i < hits.hits.size(); ++i)
886 index_vector.push_back(i);
887
888 sort(index_vector.begin(), index_vector.end(), s);
889
890 size_t primaryHit = 0;
891 if (!report_secondary_alignments)
892 primaryHit = rng() % hits.hits.size();
893
894 bool multipleHits = (hits.hits.size() > 1);
895 for (i = 0; i < hits.hits.size(); ++i)
896 {
897 size_t index = index_vector[i];
898 bool primary = (index == primaryHit);
899 const BowtieHit& bh = hits.hits[index];
900 rewrite_sam_record(bam_writer, rt,
901 bh,
902 bh.hitfile_rec().c_str(),
903 alt_name.c_str(),
904 frag_type,
905 hits.hits.size(),
906 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
907 primary,
908 (multipleHits? i: -1));
909 }
910 }
911
912 void print_sam_for_pair(const RefSequenceTable& rt,
913 const vector<pair<BowtieHit, BowtieHit> >& best_hits,
914 const InsertAlignmentGrade& grade,
915 GBamWriter& bam_writer,
916 const string& left_alt_name,
917 const string& right_alt_name,
918 boost::random::mt19937& rng,
919 uint64_t begin_id = 0,
920 uint64_t end_id = std::numeric_limits<uint64_t>::max())
921 {
922 Read left_read;
923 Read right_read;
924 if (best_hits.empty())
925 return;
926
927 size_t i;
928 HitsForRead right_hits;
929 for (i = 0; i < best_hits.size(); ++i)
930 right_hits.hits.push_back(best_hits[i].second);
931
932 size_t primaryHit = 0;
933 vector<uint32_t> index_vector;
934 lex_hit_sort s(rt, right_hits);
935 for (i = 0; i < right_hits.hits.size(); ++i)
936 index_vector.push_back(i);
937
938 sort(index_vector.begin(), index_vector.end(), s);
939
940 if (!report_secondary_alignments)
941 primaryHit = rng() % right_hits.hits.size();
942
943 bool multipleHits = (best_hits.size() > 1);
944 for (i = 0; i < best_hits.size(); ++i)
945 {
946 size_t index = index_vector[i];
947 bool primary = (index == primaryHit);
948 const BowtieHit& right_bh = best_hits[index].second;
949 const BowtieHit& left_bh = best_hits[index].first;
950
951 rewrite_sam_record(bam_writer, rt,
952 right_bh,
953 right_bh.hitfile_rec().c_str(),
954 right_alt_name.c_str(),
955 grade,
956 FRAG_RIGHT,
957 &left_bh,
958 best_hits.size(),
959 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL,
960 primary,
961 (multipleHits? i: -1));
962 rewrite_sam_record(bam_writer, rt,
963 left_bh,
964 left_bh.hitfile_rec().c_str(),
965 left_alt_name.c_str(),
966 grade,
967 FRAG_LEFT,
968 &right_bh,
969 best_hits.size(),
970 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL,
971 primary,
972 (multipleHits? i: -1));
973 }
974 }
975
976 /**
977 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
978 * @param hits hits The alignments for a particular read
979 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
980 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
981 */
982 void update_insertions_and_deletions(const HitsForRead& hits,
983 InsertionSet& insertions,
984 DeletionSet& deletions)
985 {
986 for (size_t i = 0; i < hits.hits.size(); ++i)
987 {
988 const BowtieHit& bh = hits.hits[i];
989 insertions_from_alignment(bh, insertions);
990 deletions_from_alignment(bh, deletions);
991 }
992 }
993
994 void update_coverage(const HitsForRead& hits,
995 Coverage& coverage)
996 {
997 for (size_t i = 0; i < hits.hits.size(); ++i)
998 {
999 const BowtieHit& hit = hits.hits[i];
1000 const vector<CigarOp>& cigar = hit.cigar();
1001 unsigned int positionInGenome = hit.left();
1002 RefID ref_id = hit.ref_id();
1003
1004 for(size_t c = 0; c < cigar.size(); ++c)
1005 {
1006 int opcode = cigar[c].opcode;
1007 int length = cigar[c].length;
1008 switch(opcode)
1009 {
1010 case REF_SKIP:
1011 case MATCH:
1012 case DEL:
1013 if (opcode == MATCH)
1014 coverage.add_coverage(ref_id, positionInGenome, length);
1015
1016 positionInGenome += length;
1017 break;
1018 case rEF_SKIP:
1019 case mATCH:
1020 case dEL:
1021 positionInGenome -= length;
1022
1023 if (opcode == mATCH)
1024 coverage.add_coverage(ref_id, positionInGenome + 1, length);
1025 break;
1026 case FUSION_FF:
1027 case FUSION_FR:
1028 case FUSION_RF:
1029 case FUSION_RR:
1030 positionInGenome = length;
1031 ref_id = hit.ref_id2();
1032 break;
1033 default:
1034 break;
1035 }
1036 }
1037 }
1038 }
1039
1040
1041 void update_fusions(const HitsForRead& hits,
1042 RefSequenceTable& rt,
1043 FusionSet& fusions,
1044 const FusionSet& fusions_ref = empty_fusions)
1045 {
1046 if (!fusion_search)
1047 return;
1048
1049 if (hits.hits.size() > fusion_multireads)
1050 return;
1051
1052 bool update_stat = fusions_ref.size() > 0;
1053 for (size_t i = 0; i < hits.hits.size(); ++i)
1054 {
1055 const BowtieHit& bh = hits.hits[i];
1056
1057 if (bh.edit_dist() > fusion_read_mismatches)
1058 continue;
1059
1060 fusions_from_alignment(bh, fusions, rt, update_stat);
1061
1062 if (update_stat)
1063 unsupport_fusions(bh, fusions, fusions_ref);
1064 }
1065 }
1066
1067 void update_junctions(const HitsForRead& hits,
1068 JunctionSet& junctions)
1069 {
1070 for (size_t i = 0; i < hits.hits.size(); ++i)
1071 {
1072 const BowtieHit& bh = hits.hits[i];
1073 junctions_from_alignment(bh, junctions);
1074 }
1075 }
1076
1077 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
1078 // resets the stream when finished.
1079 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits)
1080 {
1081 HitsForRead remaining;
1082 remaining.insert_id = hits.insert_id;
1083
1084 for (size_t i = 0; i < hits.hits.size(); ++i)
1085 {
1086 BowtieHit& bh = hits.hits[i];
1087 if (bh.mismatches() > read_mismatches ||
1088 bh.gap_length() > read_gap_length ||
1089 bh.edit_dist() > read_edit_dist)
1090 continue;
1091
1092 bool filter_hit = false;
1093 if (!bh.contiguous())
1094 {
1095 JunctionSet bh_juncs;
1096 junctions_from_alignment(bh, bh_juncs);
1097 for (JunctionSet::iterator itr = bh_juncs.begin();
1098 itr != bh_juncs.end();
1099 itr++)
1100 {
1101 const Junction& j = itr->first;
1102 JunctionSet::const_iterator target = junctions.find(j);
1103 if (target == junctions.end() || !target->second.accepted)
1104 {
1105 filter_hit = true;
1106 break;
1107 }
1108 }
1109 }
1110 if (!filter_hit)
1111 remaining.hits.push_back(bh);
1112 }
1113 hits = remaining;
1114 }
1115
1116 void realign_reads(HitsForRead& hits,
1117 const RefSequenceTable& rt,
1118 const JunctionSet& junctions,
1119 const JunctionSet& rev_junctions,
1120 const InsertionSet& insertions,
1121 const DeletionSet& deletions,
1122 const DeletionSet& rev_deletions,
1123 const FusionSet& fusions)
1124 {
1125 if (color)
1126 return;
1127
1128 vector<BowtieHit> additional_hits;
1129 for (size_t i = 0; i < hits.hits.size(); ++i)
1130 {
1131 BowtieHit& bh = hits.hits[i];
1132 if (fusion_search && bh.fusion_opcode() != FUSION_NOTHING)
1133 return;
1134
1135 const vector<CigarOp>& cigars = bh.cigar();
1136 int pos = bh.left();
1137 int refid = bh.ref_id();
1138 for (size_t j = 0; j < cigars.size(); ++j)
1139 {
1140 const CigarOp& op = cigars[j];
1141 if (j == 0 || j == cigars.size() - 1)
1142 {
1143 // let's do this for MATCH case only,
1144 if (op.opcode == MATCH || op.opcode == mATCH)
1145 {
1146 int left1, left2;
1147 if (op.opcode == MATCH)
1148 {
1149 left1 = pos;
1150 left2 = pos + op.length - 1;
1151 }
1152 else
1153 {
1154 left1 = pos - op.length + 1;
1155 left2 = pos;
1156 }
1157
1158 {
1159 static const size_t max_temp_juncs = 3;
1160
1161 JunctionSet::const_iterator lb, ub;
1162 JunctionSet temp_junctions;
1163 if (j == 0)
1164 {
1165 lb = rev_junctions.upper_bound(Junction(refid, left1, 0, true));
1166 ub = rev_junctions.lower_bound(Junction(refid, left2, left2, false));
1167 while (lb != ub && lb != rev_junctions.end())
1168 {
1169 Junction temp_junction = lb->first;
1170 temp_junction.left = lb->first.right;
1171 temp_junction.right = lb->first.left;
1172 temp_junctions[temp_junction] = lb->second;
1173 ++lb;
1174
1175 if (temp_junctions.size() > max_temp_juncs)
1176 break;
1177 }
1178 }
1179
1180 if (j == cigars.size() - 1)
1181 {
1182 int common_right = left2 + max_report_intron_length;
1183 lb = junctions.upper_bound(Junction(refid, left1, common_right, true));
1184 ub = junctions.lower_bound(Junction(refid, left2, common_right, false));
1185 while (lb != ub && lb != junctions.end())
1186 {
1187 temp_junctions[lb->first] = lb->second;
1188 ++lb;
1189
1190 if (temp_junctions.size() > max_temp_juncs)
1191 break;
1192 }
1193 }
1194
1195 if (temp_junctions.size() > max_temp_juncs)
1196 continue;
1197
1198 JunctionSet::const_iterator junc_iter = temp_junctions.begin();
1199 for (; junc_iter != temp_junctions.end(); ++junc_iter)
1200 {
1201 Junction junc = junc_iter->first;
1202
1203 /*
1204 fprintf(stderr, "%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n",
1205 bh.insert_id(), bh.left(), bh.right(),
1206 print_cigar(bh.cigar()).c_str(),
1207 bh.alignment_score(), bh.edit_dist(),
1208 junc.left, junc.right);
1209 //*/
1210
1211 int new_left = bh.left();
1212 int intron_length = junc.right - junc.left - 1;
1213 vector<CigarOp> new_cigars;
1214 bool anchored = false;
1215 if (j == 0 && bh.left() > (int)junc.left)
1216 {
1217 new_left -= intron_length;
1218 int before_match_length = junc.left - new_left + 1;;
1219 int after_match_length = op.length - before_match_length;
1220
1221 if (before_match_length > 0 && after_match_length > 0)
1222 {
1223 anchored = true;
1224 new_cigars.push_back(CigarOp(MATCH, before_match_length));
1225 new_cigars.push_back(CigarOp(REF_SKIP, intron_length));
1226 new_cigars.push_back(CigarOp(MATCH, after_match_length));
1227
1228 new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end());
1229 }
1230 }
1231 else if (j == cigars.size() - 1 && pos < (int)junc.left)
1232 {
1233 new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1);
1234
1235 int before_match_length = junc.left - pos + 1;
1236 int after_match_length = op.length - before_match_length;
1237
1238 if (before_match_length > 0 && after_match_length > 0)
1239 {
1240 anchored = true;
1241 new_cigars.push_back(CigarOp(MATCH, before_match_length));
1242 new_cigars.push_back(CigarOp(REF_SKIP, intron_length));
1243 new_cigars.push_back(CigarOp(MATCH, after_match_length));
1244 }
1245 }
1246
1247 if (!anchored)
1248 continue;
1249
1250 BowtieHit new_bh(bh.ref_id(),
1251 bh.ref_id2(),
1252 bh.insert_id(),
1253 new_left,
1254 new_cigars,
1255 bh.antisense_align(),
1256 junc.antisense,
1257 0, /* mismatches - needs to be recalculated */
1258 0, /* edit_dist - needs to be recalculated */
1259 0, /* splice_mms - needs to be recalculated */
1260 false);
1261
1262 new_bh.seq(bh.seq());
1263 new_bh.qual(bh.qual());
1264
1265 const RefSequenceTable::Sequence* ref_str = rt.get_seq(bh.ref_id());
1266
1267 if (new_left >= 0 && new_bh.right() <= (int)length(*ref_str))
1268 {
1269 vector<string> aux_fields;
1270 bowtie_sam_extra(new_bh, rt, aux_fields);
1271
1272 vector<string>::const_iterator aux_iter = aux_fields.begin();
1273 for (; aux_iter != aux_fields.end(); ++aux_iter)
1274 {
1275 const string& aux_field = *aux_iter;
1276 if (strncmp(aux_field.c_str(), "AS", 2) == 0)
1277 {
1278 int alignment_score = atoi(aux_field.c_str() + 5);
1279 new_bh.alignment_score(alignment_score);
1280 }
1281 else if (strncmp(aux_field.c_str(), "XM", 2) == 0)
1282 {
1283 int XM_value = atoi(aux_field.c_str() + 5);
1284 new_bh.mismatches(XM_value);
1285 new_bh.edit_dist(XM_value + gap_length(new_cigars));
1286 }
1287 }
1288
1289 vector<string> sam_toks;
1290 tokenize(bh.hitfile_rec().c_str(), "\t", sam_toks);
1291
1292 char coord[20] = {0,};
1293 sprintf(coord, "%d", new_bh.left() + 1);
1294 sam_toks[3] = coord;
1295 sam_toks[5] = print_cigar(new_bh.cigar());
1296 for (size_t a = 11; a < sam_toks.size(); ++a)
1297 {
1298 string& sam_tok = sam_toks[a];
1299 for (size_t b = 0; b < aux_fields.size(); ++b)
1300 {
1301 const string& aux_tok = aux_fields[b];
1302 if (strncmp(sam_tok.c_str(), aux_tok.c_str(), 5) == 0)
1303 {
1304 sam_tok = aux_tok;
1305 break;
1306 }
1307 }
1308 }
1309
1310 if (!bh.is_spliced())
1311 {
1312 if (junc.antisense)
1313 sam_toks.push_back("XS:A:-");
1314 else
1315 sam_toks.push_back("XS:A:+");
1316 }
1317
1318
1319 string new_rec = "";
1320 for (size_t d = 0; d < sam_toks.size(); ++d)
1321 {
1322 new_rec += sam_toks[d];
1323 if (d < sam_toks.size() - 1)
1324 new_rec += "\t";
1325 }
1326
1327 new_bh.hitfile_rec(new_rec);
1328
1329 if (new_bh.edit_dist() <= bh.edit_dist())
1330 additional_hits.push_back(new_bh);
1331
1332 /*
1333 fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n",
1334 new_bh.insert_id(), new_bh.left(), new_bh.right(),
1335 print_cigar(new_bh.cigar()).c_str(),
1336 new_bh.alignment_score(), new_bh.edit_dist(),
1337 junc.left, junc.right);
1338 //*/
1339 }
1340 }
1341 }
1342
1343 #if 0
1344 {
1345 DeletionSet::const_iterator lb, ub;
1346 bool use_rev_deletions = (j == 0);
1347 const DeletionSet& curr_deletions = (use_rev_deletions ? rev_deletions : deletions);
1348 if (use_rev_deletions)
1349 {
1350 lb = curr_deletions.upper_bound(Deletion(refid, left1, 0, true));
1351 ub = curr_deletions.lower_bound(Deletion(refid, left2, left2, false));
1352 }
1353 else
1354 {
1355 int common_right = left2 + 100;
1356 lb = curr_deletions.upper_bound(Deletion(refid, left1, common_right, true));
1357 ub = curr_deletions.lower_bound(Deletion(refid, left2, common_right, false));
1358 }
1359
1360 while (lb != curr_deletions.end() && lb != ub)
1361 {
1362 Deletion del = lb->first;
1363 if (use_rev_deletions)
1364 {
1365 int temp = del.left;
1366 del.left = del.right;
1367 del.right = temp;
1368 }
1369
1370 // daehwan - for debuggin purposes
1371 /*
1372 fprintf(stderr, "(type%d) %d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n",
1373 !use_rev_junctions,
1374 bh.insert_id(), bh.left(), bh.right(),
1375 print_cigar(bh.cigar()).c_str(),
1376 bh.alignment_score(), bh.edit_dist(),
1377 junc.left, junc.right);
1378 */
1379
1380 int del_length = del.right - del.left - 1;
1381 int new_left = bh.left();
1382 if (j == 0)
1383 new_left -= del_length;
1384
1385 vector<CigarOp> new_cigars;
1386 if (j == 0)
1387 {
1388 int before_match_length = del.left - new_left + 1;;
1389 int after_match_length = op.length - before_match_length;
1390
1391 if (before_match_length > 0)
1392 new_cigars.push_back(CigarOp(MATCH, before_match_length));
1393 new_cigars.push_back(CigarOp(DEL, del_length));
1394 if (after_match_length > 0)
1395 new_cigars.push_back(CigarOp(MATCH, after_match_length));
1396
1397 new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end());
1398 }
1399 else
1400 {
1401 new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1);
1402
1403 int before_match_length = del.left - pos + 1;
1404 int after_match_length = op.length - before_match_length;
1405
1406 if (before_match_length > 0)
1407 new_cigars.push_back(CigarOp(MATCH, before_match_length));
1408 new_cigars.push_back(CigarOp(DEL, del_length));
1409 if (after_match_length > 0)
1410 new_cigars.push_back(CigarOp(MATCH, after_match_length));
1411 }
1412
1413 BowtieHit new_bh(bh.ref_id(),
1414 bh.ref_id2(),
1415 bh.insert_id(),
1416 new_left,
1417 new_cigars,
1418 bh.antisense_align(),
1419 bh.antisense_splice(),
1420 0, /* edit_dist - needs to be recalculated */
1421 0, /* splice_mms - needs to be recalculated */
1422 false);
1423
1424 new_bh.seq(bh.seq());
1425 new_bh.qual(bh.qual());
1426
1427 vector<string> aux_fields;
1428 bowtie_sam_extra(new_bh, rt, aux_fields);
1429
1430 vector<string>::const_iterator aux_iter = aux_fields.begin();
1431 for (; aux_iter != aux_fields.end(); ++aux_iter)
1432 {
1433 const string& aux_field = *aux_iter;
1434 if (strncmp(aux_field.c_str(), "AS", 2) == 0)
1435 {
1436 int alignment_score = atoi(aux_field.c_str() + 5);
1437 new_bh.alignment_score(alignment_score);
1438 }
1439 else if (strncmp(aux_field.c_str(), "XM", 2) == 0)
1440 {
1441 int XM_value = atoi(aux_field.c_str() + 5);
1442 new_bh.edit_dist(XM_value);
1443 }
1444 }
1445
1446 vector<string> sam_toks;
1447 tokenize(bh.hitfile_rec().c_str(), "\t", sam_toks);
1448
1449 char coord[20] = {0,};
1450 sprintf(coord, "%d", new_bh.left() + 1);
1451 sam_toks[3] = coord;
1452 sam_toks[5] = print_cigar(new_bh.cigar());
1453 for (size_t a = 11; a < sam_toks.size(); ++a)
1454 {
1455 string& sam_tok = sam_toks[a];
1456 for (size_t b = 0; b < aux_fields.size(); ++b)
1457 {
1458 const string& aux_tok = aux_fields[b];
1459 if (strncmp(sam_tok.c_str(), aux_tok.c_str(), 5) == 0)
1460 {
1461 sam_tok = aux_tok;
1462 break;
1463 }
1464 }
1465 }
1466
1467 string new_rec = "";
1468 for (size_t d = 0; d < sam_toks.size(); ++d)
1469 {
1470 new_rec += sam_toks[d];
1471 if (d < sam_toks.size() - 1)
1472 new_rec += "\t";
1473 }
1474
1475 new_bh.hitfile_rec(new_rec);
1476
1477 if (new_bh.edit_dist() <= bh.edit_dist())
1478 additional_hits.push_back(new_bh);
1479
1480 /*
1481 fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n",
1482 new_bh.insert_id(), new_bh.left(), new_bh.right(),
1483 print_cigar(new_bh.cigar()).c_str(),
1484 new_bh.alignment_score(), new_bh.edit_dist(),
1485 junc.left, junc.right);
1486 */
1487
1488 ++lb;
1489 }
1490 }
1491
1492 {
1493 InsertionSet::const_iterator lb, ub;
1494 lb = insertions.upper_bound(Insertion(refid, left1, ""));
1495 ub = insertions.lower_bound(Insertion(refid, left2, ""));
1496
1497 while (lb != insertions.end() && lb != ub)
1498 {
1499 Insertion ins = lb->first;
1500
1501 // daehwan - for debugging purposes
1502 /*
1503 fprintf(stderr, "(type%d) %d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n",
1504 !use_rev_junctions,
1505 bh.insert_id(), bh.left(), bh.right(),
1506 print_cigar(bh.cigar()).c_str(),
1507 bh.alignment_score(), bh.edit_dist(),
1508 junc.left, junc.right);
1509 */
1510
1511 int ins_length = ins.sequence.length();
1512 int new_left = bh.left();
1513 if (j == 0)
1514 new_left -= ins_length;
1515
1516 vector<CigarOp> new_cigars;
1517 if (j == 0)
1518 {
1519 int before_match_length = ins.left - new_left + 1;;
1520 int after_match_length = op.length - before_match_length - ins_length;
1521
1522 if (before_match_length > 0)
1523 new_cigars.push_back(CigarOp(MATCH, before_match_length));
1524 new_cigars.push_back(CigarOp(INS, ins_length));
1525 if (after_match_length > 0)
1526 new_cigars.push_back(CigarOp(MATCH, after_match_length));
1527
1528 new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end());
1529 }
1530 else
1531 {
1532 new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1);
1533
1534 int before_match_length = ins.left - pos + 1;
1535 int after_match_length = op.length - before_match_length - ins_length;
1536
1537 if (before_match_length > 0)
1538 new_cigars.push_back(CigarOp(MATCH, before_match_length));
1539 new_cigars.push_back(CigarOp(INS, ins_length));
1540 if (after_match_length > 0)
1541 new_cigars.push_back(CigarOp(MATCH, after_match_length));
1542 }
1543
1544 BowtieHit new_bh(bh.ref_id(),
1545 bh.ref_id2(),
1546 bh.insert_id(),
1547 new_left,
1548 new_cigars,
1549 bh.antisense_align(),
1550 bh.antisense_splice(),
1551 0, /* edit_dist - needs to be recalculated */
1552 0, /* splice_mms - needs to be recalculated */
1553 false);
1554
1555 new_bh.seq(bh.seq());
1556 new_bh.qual(bh.qual());
1557
1558 vector<string> aux_fields;
1559 bowtie_sam_extra(new_bh, rt, aux_fields);
1560
1561 vector<string>::const_iterator aux_iter = aux_fields.begin();
1562 for (; aux_iter != aux_fields.end(); ++aux_iter)
1563 {
1564 const string& aux_field = *aux_iter;
1565 if (strncmp(aux_field.c_str(), "AS", 2) == 0)
1566 {
1567 int alignment_score = atoi(aux_field.c_str() + 5);
1568 new_bh.alignment_score(alignment_score);
1569 }
1570 else if (strncmp(aux_field.c_str(), "XM", 2) == 0)
1571 {
1572 int XM_value = atoi(aux_field.c_str() + 5);
1573 new_bh.edit_dist(XM_value);
1574 }
1575 }
1576
1577 /*
1578 fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n",
1579 new_bh.insert_id(), new_bh.left(), new_bh.right(),
1580 print_cigar(new_bh.cigar()).c_str(),
1581 new_bh.alignment_score(), new_bh.edit_dist(),
1582 junc.left, junc.right);
1583 */
1584
1585 ++lb;
1586 }
1587 }
1588 #endif
1589 }
1590 }
1591
1592 switch(op.opcode)
1593 {
1594 case REF_SKIP:
1595 pos += op.length;
1596 break;
1597 case rEF_SKIP:
1598 pos -= op.length;
1599 break;
1600 case MATCH:
1601 case DEL:
1602 pos += op.length;
1603 break;
1604 case mATCH:
1605 case dEL:
1606 pos -= op.length;
1607 break;
1608 case FUSION_FF:
1609 case FUSION_FR:
1610 case FUSION_RF:
1611 case FUSION_RR:
1612 pos = op.length;
1613 refid = bh.ref_id2();
1614 break;
1615 default:
1616 break;
1617 }
1618 }
1619 }
1620
1621 hits.hits.insert(hits.hits.end(), additional_hits.begin(), additional_hits.end());
1622
1623 std::sort(hits.hits.begin(), hits.hits.end());
1624 vector<BowtieHit>::iterator new_end = std::unique(hits.hits.begin(), hits.hits.end());
1625 hits.hits.erase(new_end, hits.hits.end());
1626 }
1627
1628 class MultipleBAMReader
1629 {
1630 public:
1631 MultipleBAMReader(ReadTable& it,
1632 RefSequenceTable& rt,
1633 const vector<string>& fnames,
1634 long begin_id,
1635 long end_id) :
1636 _it(it),
1637 _rt(rt),
1638 _begin_id(begin_id),
1639 _end_id(end_id),
1640 _bam_hit_factory(it, rt),
1641 _bam_merge(NULL)
1642 {
1643 // calculate file offsets
1644 vector<int64_t> offsets;
1645 for (size_t i = 0; i < fnames.size(); ++i)
1646 {
1647 const string& fname = fnames[i];
1648
1649 vector<string> temp_fnames;
1650 if (fname.substr(fname.length() - 4) == ".bam")
1651 {
1652 temp_fnames.push_back(fname);
1653 }
1654 else
1655 {
1656 for (size_t j = 0; j < (size_t)num_threads; ++j)
1657 {
1658 char suffix[128];
1659 sprintf(suffix, "%lu.bam", j);
1660 string temp_fname = fname + suffix;
1661 temp_fnames.push_back(temp_fname);
1662 }
1663 }
1664
1665 for (size_t j = 0; j < temp_fnames.size(); ++j)
1666 {
1667 ifstream reads_index_file((temp_fnames[j] + ".index").c_str());
1668 if (!reads_index_file.is_open())
1669 continue;
1670
1671 _fnames.push_back(temp_fnames[j]);
1672
1673 int64_t last_offset = 0;
1674 uint64_t last_read_id = 0;
1675 string line;
1676 while (getline(reads_index_file, line))
1677 {
1678 istringstream istream(line);
1679 uint64_t read_id;
1680 int64_t offset;
1681
1682 istream >> read_id >> offset;
1683 if (read_id > _begin_id && last_read_id <= _begin_id)
1684 {
1685 offsets.push_back(last_offset);
1686
1687 #if 0
1688 fprintf(stderr, "bet %lu and %lu - %s %lu %ld\n",
1689 _begin_id, _end_id, temp_fnames[j].c_str(), last_offset, last_read_id);
1690 #endif
1691 }
1692
1693 last_offset = offset;
1694 last_read_id = read_id;
1695 }
1696
1697 if (last_read_id >= _end_id)
1698 break;
1699 }
1700 }
1701
1702 _bam_merge = new BamMerge(_fnames, offsets);
1703 _bam_hit_factory.set_sam_header(_bam_merge->get_sam_header());
1704 }
1705
1706 ~MultipleBAMReader()
1707 {
1708 if (_bam_merge)
1709 delete _bam_merge;
1710 }
1711
1712 bool next_read_hits(HitsForRead& hits)
1713 {
1714 hits.insert_id = 0;
1715
1716 if (!_bam_merge)
1717 return false;
1718
1719 vector<CBamLine> bam_lines;
1720 while (true)
1721 {
1722 if (!_bam_merge->next_bam_lines(bam_lines))
1723 return false;
1724
1725 if (bam_lines.size() <= 0)
1726 return false;
1727
1728 uint64_t read_id = bam_lines[0].read_id;
1729 if (read_id >= _begin_id && read_id < _end_id)
1730 {
1731 hits.hits.clear();
1732 for (size_t i = 0; i < bam_lines.size(); ++i)
1733 {
1734 CBamLine& bam_line = bam_lines[i];
1735 BowtieHit bh;
1736
1737 char seq[MAX_READ_LEN + 1] = {0};
1738 char qual[MAX_READ_LEN + 1] = {0};
1739
1740 bool success = _bam_hit_factory.get_hit_from_buf((const char*)bam_line.b, bh, true, NULL, NULL, seq, qual);
1741 if (success)
1742 {
1743 bh.seq(seq);
1744 bh.qual(qual);
1745
1746 char* sam_line = bam_format1(_bam_merge->get_sam_header(), bam_line.b);
1747 bh.hitfile_rec(sam_line);
1748 free(sam_line);
1749
1750 hits.insert_id = bh.insert_id();
1751 hits.hits.push_back(bh);
1752 }
1753
1754 bam_line.b_free();
1755 }
1756 bam_lines.clear();
1757
1758 if (hits.hits.size() > 0)
1759 return true;
1760 }
1761
1762 for (size_t i = 0; i < bam_lines.size(); ++i)
1763 bam_lines[i].b_free();
1764
1765 bam_lines.clear();
1766
1767 if (read_id >= _end_id)
1768 break;
1769 }
1770
1771 return false;
1772 }
1773
1774 private:
1775 ReadTable& _it;
1776 RefSequenceTable& _rt;
1777 vector<string> _fnames;
1778 uint64_t _begin_id;
1779 uint64_t _end_id;
1780
1781 BAMHitFactory _bam_hit_factory;
1782 BamMerge* _bam_merge;
1783 };
1784
1785
1786 // events include splice junction, indels, and fusion points.
1787 struct ConsensusEventsWorker
1788 {
1789 void operator()()
1790 {
1791 ReadTable it;
1792 MultipleBAMReader l_hs(it, *rt, left_map_fnames, begin_id, end_id);
1793 MultipleBAMReader r_hs(it, *rt, right_map_fnames, begin_id, end_id);
1794
1795 HitsForRead curr_left_hit_group;
1796 HitsForRead curr_right_hit_group;
1797
1798 l_hs.next_read_hits(curr_left_hit_group);
1799 r_hs.next_read_hits(curr_right_hit_group);
1800
1801 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1802 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1803
1804 // While we still have unreported hits...
1805 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
1806 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
1807 {
1808 // Chew up left singletons
1809 while (curr_left_obs_order < curr_right_obs_order &&
1810 curr_left_obs_order < end_id &&
1811 curr_left_obs_order != VMAXINT32)
1812 {
1813 HitsForRead best_hits;
1814 best_hits.insert_id = curr_left_obs_order;
1815
1816 // Process hits for left singleton, select best alignments
1817 read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions);
1818 update_coverage(best_hits, *coverage);
1819 update_junctions(best_hits, *junctions);
1820 update_insertions_and_deletions(best_hits, *insertions, *deletions);
1821 update_fusions(best_hits, *rt, *fusions);
1822
1823 // Get next hit group
1824 l_hs.next_read_hits(curr_left_hit_group);
1825 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1826 }
1827
1828 // Chew up right singletons
1829 while (curr_left_obs_order > curr_right_obs_order &&
1830 curr_right_obs_order < end_id &&
1831 curr_right_obs_order != VMAXINT32)
1832 {
1833 HitsForRead best_hits;
1834 best_hits.insert_id = curr_right_obs_order;
1835 if (curr_right_obs_order >= begin_id)
1836 {
1837 // Process hit for right singleton, select best alignments
1838 read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions);
1839 update_coverage(best_hits, *coverage);
1840 update_junctions(best_hits, *junctions);
1841 update_insertions_and_deletions(best_hits, *insertions, *deletions);
1842 update_fusions(best_hits, *rt, *fusions);
1843 }
1844
1845 // Get next hit group
1846 r_hs.next_read_hits(curr_right_hit_group);
1847 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1848 }
1849
1850 // Since we have both left hits and right hits for this insert,
1851 // Find the best pairing and print both
1852 while (curr_left_obs_order == curr_right_obs_order &&
1853 curr_left_obs_order < end_id &&
1854 curr_left_obs_order != VMAXINT32)
1855 {
1856 vector<pair<BowtieHit, BowtieHit> > best_hits;
1857
1858 InsertAlignmentGrade grade;
1859 pair_best_alignments(curr_left_hit_group,
1860 curr_right_hit_group,
1861 grade,
1862 best_hits,
1863 *gtf_junctions);
1864
1865 HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
1866 HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
1867
1868 if (best_hits.size() > 0)
1869 {
1870 for (size_t i = 0; i < best_hits.size(); ++i)
1871 {
1872 best_left_hit_group.hits.push_back(best_hits[i].first);
1873 best_right_hit_group.hits.push_back(best_hits[i].second);
1874 }
1875 }
1876 else
1877 {
1878 best_left_hit_group.hits = curr_left_hit_group.hits;
1879 best_right_hit_group.hits = curr_right_hit_group.hits;
1880 }
1881
1882 update_coverage(best_left_hit_group, *coverage);
1883 update_junctions(best_left_hit_group, *junctions);
1884 update_insertions_and_deletions(best_left_hit_group, *insertions, *deletions);
1885 update_fusions(best_left_hit_group, *rt, *fusions);
1886
1887 update_coverage(best_right_hit_group, *coverage);
1888 update_junctions(best_right_hit_group, *junctions);
1889 update_insertions_and_deletions(best_right_hit_group, *insertions, *deletions);
1890 update_fusions(best_right_hit_group, *rt, *fusions);
1891
1892 l_hs.next_read_hits(curr_left_hit_group);
1893 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1894
1895 r_hs.next_read_hits(curr_right_hit_group);
1896 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1897 }
1898 }
1899 }
1900
1901 vector<string> left_map_fnames;
1902 vector<string> right_map_fnames;
1903
1904 RefSequenceTable* rt;
1905
1906 JunctionSet* gtf_junctions;
1907
1908 uint64_t begin_id;
1909 uint64_t end_id;
1910 int64_t left_map_offset;
1911 int64_t right_map_offset;
1912
1913 JunctionSet* junctions;
1914 InsertionSet* insertions;
1915 DeletionSet* deletions;
1916 FusionSet* fusions;
1917
1918 Coverage* coverage;
1919 };
1920
1921 struct ReportWorker
1922 {
1923 void write_singleton_alignments(uint64_t curr_obs_order,
1924 HitsForRead& curr_hit_group,
1925 ReadStream& reads_file,
1926 GBamWriter& bam_writer,
1927 GBamWriter& um_out,
1928 FragmentType fragment_type)
1929 {
1930 HitsForRead best_hits;
1931 best_hits.insert_id = curr_obs_order;
1932
1933 realign_reads(curr_hit_group, *rt, *junctions, *rev_junctions,
1934 *insertions, *deletions, *rev_deletions, *fusions);
1935
1936 exclude_hits_on_filtered_junctions(*junctions, curr_hit_group);
1937
1938 // Process hits for singleton, select best alignments
1939 const bool final_report = true;
1940 read_best_alignments(curr_hit_group, best_hits, *gtf_junctions,
1941 *junctions, *insertions, *deletions, *fusions, *coverage,
1942 final_report, &rng);
1943
1944 if (best_hits.hits.size() > 0)
1945 {
1946 Read read;
1947 bool got_read = reads_file.getRead(curr_obs_order, read,
1948 reads_format, false, begin_id, end_id,
1949 &um_out, false);
1950 if (got_read)
1951 {
1952 update_junctions(best_hits, *final_junctions);
1953 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1954 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1955
1956 print_sam_for_single(*rt,
1957 best_hits,
1958 fragment_type,
1959 read.alt_name,
1960 bam_writer,
1961 rng);
1962 }
1963 }
1964 }
1965
1966 void operator()()
1967 {
1968 rng.seed(1);
1969
1970 ReadTable it;
1971 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
1972
1973 ReadStream left_reads_file(left_reads_fname);
1974 if (left_reads_file.file() == NULL)
1975 err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
1976 if (left_reads_file.isBam()) {
1977 left_reads_file.use_alt_name();
1978 left_reads_file.ignoreQC();
1979 }
1980 if (left_reads_offset > 0)
1981 left_reads_file.seek(left_reads_offset);
1982
1983 GBamWriter* left_um_out=new GBamWriter(left_um_fname.c_str(), sam_header.c_str());
1984 GBamWriter* right_um_out=NULL;
1985
1986 ReadStream right_reads_file(right_reads_fname);
1987 if (right_reads_offset > 0)
1988 right_reads_file.seek(right_reads_offset);
1989
1990 if (!right_reads_fname.empty())
1991 {
1992 if (right_reads_file.isBam()) {
1993 right_reads_file.use_alt_name();
1994 right_reads_file.ignoreQC();
1995 right_um_out=new GBamWriter(right_um_fname.c_str(), sam_header.c_str());
1996 }
1997 }
1998
1999 MultipleBAMReader left_hs(it, *rt, left_map_fnames, begin_id, end_id);
2000 MultipleBAMReader right_hs(it, *rt, right_map_fnames, begin_id, end_id);
2001
2002 HitsForRead curr_left_hit_group;
2003 HitsForRead curr_right_hit_group;
2004
2005 left_hs.next_read_hits(curr_left_hit_group);
2006 right_hs.next_read_hits(curr_right_hit_group);
2007
2008 uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
2009 uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
2010
2011 const bool final_report = true;
2012
2013 // While we still have unreported hits...
2014 Read l_read;
2015 Read r_read;
2016 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
2017 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
2018 {
2019 // Chew up left singletons (pairs with right reads unmapped)
2020 while (curr_left_obs_order < curr_right_obs_order &&
2021 curr_left_obs_order < end_id &&
2022 curr_left_obs_order != VMAXINT32)
2023 {
2024 write_singleton_alignments(curr_left_obs_order,
2025 curr_left_hit_group,
2026 left_reads_file,
2027 bam_writer,
2028 *left_um_out,
2029 right_map_fnames.empty() ? FRAG_UNPAIRED : FRAG_LEFT);
2030
2031 // Get next hit group
2032 left_hs.next_read_hits(curr_left_hit_group);
2033 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
2034 } //left singletons
2035
2036 // Chew up right singletons
2037 while (curr_left_obs_order > curr_right_obs_order &&
2038 curr_right_obs_order < end_id &&
2039 curr_right_obs_order != VMAXINT32)
2040 {
2041 write_singleton_alignments(curr_right_obs_order,
2042 curr_right_hit_group,
2043 right_reads_file,
2044 bam_writer,
2045 *right_um_out,
2046 FRAG_RIGHT);
2047
2048 // Get next hit group
2049 right_hs.next_read_hits(curr_right_hit_group);
2050 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
2051 }
2052
2053 // Since we have both left hits and right hits for this insert,
2054 // Find the best pairing and print both
2055 while (curr_left_obs_order == curr_right_obs_order &&
2056 curr_left_obs_order < end_id &&
2057 curr_left_obs_order != VMAXINT32)
2058 {
2059 realign_reads(curr_left_hit_group, *rt, *junctions, *rev_junctions,
2060 *insertions, *deletions, *rev_deletions, *fusions);
2061 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
2062
2063 realign_reads(curr_right_hit_group, *rt, *junctions, *rev_junctions,
2064 *insertions, *deletions, *rev_deletions, *fusions);
2065 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
2066
2067 vector<pair<BowtieHit, BowtieHit> > best_hits;
2068
2069 bool paired_alignments = curr_left_hit_group.hits.size() > 0 && curr_right_hit_group.hits.size() > 0;
2070 InsertAlignmentGrade grade;
2071
2072 if (paired_alignments)
2073 {
2074 pair_best_alignments(curr_left_hit_group,
2075 curr_right_hit_group,
2076 grade, best_hits, *gtf_junctions,
2077 *junctions, *insertions, *deletions, *fusions, *coverage,
2078 final_report, &rng);
2079
2080 if (report_mixed_alignments)
2081 {
2082 if (best_hits.size() <= 0 ||
2083 (grade.fusion && !fusion_search && !report_discordant_pair_alignments))
2084 paired_alignments = false;
2085 }
2086 }
2087
2088 if (paired_alignments)
2089 {
2090 HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
2091 HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
2092
2093 for (size_t i = 0; i < best_hits.size(); ++i)
2094 {
2095 best_left_hit_group.hits.push_back(best_hits[i].first);
2096 best_right_hit_group.hits.push_back(best_hits[i].second);
2097 }
2098
2099 if (best_hits.size() > 0)
2100 {
2101 bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), l_read,
2102 reads_format, false, begin_id, end_id,
2103 left_um_out, false);
2104
2105 bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), r_read,
2106 reads_format, false, begin_id, end_id,
2107 right_um_out, false);
2108
2109 if (got_left_read && got_right_read)
2110 {
2111 update_junctions(best_left_hit_group, *final_junctions);
2112 update_insertions_and_deletions(best_left_hit_group, *final_insertions, *final_deletions);
2113 update_fusions(best_left_hit_group, *rt, *final_fusions, *fusions);
2114
2115 update_junctions(best_right_hit_group, *final_junctions);
2116 update_insertions_and_deletions(best_right_hit_group, *final_insertions, *final_deletions);
2117 update_fusions(best_right_hit_group, *rt, *final_fusions, *fusions);
2118
2119 pair_support(best_hits, *final_fusions, *fusions);
2120
2121 print_sam_for_pair(*rt,
2122 best_hits,
2123 grade,
2124 bam_writer,
2125 l_read.alt_name,
2126 r_read.alt_name,
2127 rng,
2128 begin_id,
2129 end_id);
2130 }
2131 }
2132 }
2133 else
2134 {
2135 if (curr_left_hit_group.hits.size() > 0)
2136 {
2137 write_singleton_alignments(curr_left_obs_order,
2138 curr_left_hit_group,
2139 left_reads_file,
2140 bam_writer,
2141 *left_um_out,
2142 (right_map_fnames.empty() ? FRAG_UNPAIRED : FRAG_LEFT));
2143 }
2144
2145 if (curr_right_hit_group.hits.size() > 0)
2146 { //only right read mapped
2147 //write it in the mapped file with the #MAPPED# flag
2148 write_singleton_alignments(curr_right_obs_order,
2149 curr_right_hit_group,
2150 right_reads_file,
2151 bam_writer,
2152 *right_um_out,
2153 FRAG_RIGHT);
2154 }
2155 }
2156
2157 left_hs.next_read_hits(curr_left_hit_group);
2158 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
2159
2160 right_hs.next_read_hits(curr_right_hit_group);
2161 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
2162 }
2163 } //while we still have unreported hits..
2164 //print the remaining unmapped reads at the end of each reads' stream
2165
2166 left_reads_file.getRead(VMAXINT32, l_read,
2167 reads_format, false, begin_id, end_id,
2168 left_um_out, false);
2169 if (right_reads_file.file())
2170 right_reads_file.getRead(VMAXINT32, r_read,
2171 reads_format, false, begin_id, end_id,
2172 right_um_out, false);
2173
2174 // pclose (pipe close), which waits for a process to end, seems to conflict with boost::thread::join somehow,
2175 // resulting in deadlock like behavior.
2176 delete left_um_out;
2177 delete right_um_out;
2178 }
2179
2180 string bam_output_fname;
2181 string sam_header_fname;
2182
2183 string left_reads_fname;
2184 vector<string> left_map_fnames;
2185 string right_reads_fname;
2186 vector<string> right_map_fnames;
2187
2188 string left_um_fname;
2189 string right_um_fname;
2190
2191 JunctionSet* gtf_junctions;
2192
2193 JunctionSet* junctions;
2194 JunctionSet* rev_junctions;
2195 InsertionSet* insertions;
2196 DeletionSet* deletions;
2197 DeletionSet* rev_deletions;
2198 FusionSet* fusions;
2199 Coverage* coverage;
2200
2201 JunctionSet* final_junctions;
2202 InsertionSet* final_insertions;
2203 DeletionSet* final_deletions;
2204 FusionSet* final_fusions;
2205
2206 RefSequenceTable* rt;
2207
2208 uint64_t begin_id;
2209 uint64_t end_id;
2210 int64_t left_reads_offset;
2211 int64_t left_map_offset;
2212 int64_t right_reads_offset;
2213 int64_t right_map_offset;
2214
2215 boost::random::mt19937 rng;
2216 };
2217
2218 void driver(const string& bam_output_fname,
2219 istream& ref_stream,
2220 const vector<string>& left_map_fnames,
2221 const string& left_reads_fname,
2222 const vector<string>& right_map_fnames,
2223 const string& right_reads_fname,
2224 FILE* junctions_out,
2225 FILE* insertions_out,
2226 FILE* deletions_out,
2227 FILE* fusions_out)
2228 {
2229 if (!parallel)
2230 num_threads = 1;
2231
2232 RefSequenceTable rt(sam_header, true);
2233 get_seqs(ref_stream, rt, true);
2234
2235 srandom(1);
2236
2237 JunctionSet gtf_junctions;
2238 if (!gtf_juncs.empty())
2239 {
2240 char splice_buf[4096];
2241 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
2242 if (splice_coords)
2243 {
2244 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
2245 {
2246 char* nl = strrchr(splice_buf, '\n');
2247 char* buf = splice_buf;
2248 if (nl) *nl = 0;
2249
2250 char* ref_name = get_token((char**)&buf, "\t");
2251 char* scan_left_coord = get_token((char**)&buf, "\t");
2252 char* scan_right_coord = get_token((char**)&buf, "\t");
2253 char* orientation = get_token((char**)&buf, "\t");
2254
2255 if (!scan_left_coord || !scan_right_coord || !orientation)
2256 {
2257 fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
2258 gtf_juncs.c_str(), buf);
2259 exit(1);
2260 }
2261
2262 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
2263 uint32_t left_coord = atoi(scan_left_coord);
2264 uint32_t right_coord = atoi(scan_right_coord);
2265 bool antisense = *orientation == '-';
2266
2267 JunctionStats junction_stat;
2268 junction_stat.gtf_match = true;
2269 junction_stat.accepted = true;
2270
2271 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), junction_stat));
2272 }
2273 }
2274 fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
2275 }
2276
2277 vector<uint64_t> read_ids;
2278 vector<vector<int64_t> > offsets;
2279 if (num_threads > 1)
2280 {
2281 vector<string> fnames;
2282 if (right_map_fnames.size() > 0)
2283 {
2284 fnames.push_back(right_reads_fname);
2285 fnames.push_back(right_map_fnames.back());
2286 }
2287 fnames.push_back(left_reads_fname);
2288 fnames.push_back(left_map_fnames.back());
2289 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
2290 if (!enough_data)
2291 num_threads = 1;
2292 }
2293
2294 JunctionSet vjunctions[num_threads];
2295 InsertionSet vinsertions[num_threads];
2296 DeletionSet vdeletions[num_threads];
2297 FusionSet vfusions[num_threads];
2298 Coverage vcoverages[num_threads];
2299
2300 vector<boost::thread*> threads;
2301 for (int i = 0; i < num_threads; ++i)
2302 {
2303 ConsensusEventsWorker worker;
2304
2305 worker.left_map_fnames = left_map_fnames;
2306 worker.right_map_fnames = right_map_fnames;
2307 worker.rt = &rt;
2308 worker.gtf_junctions = &gtf_junctions;
2309
2310 worker.junctions = &vjunctions[i];
2311 worker.insertions = &vinsertions[i];
2312 worker.deletions = &vdeletions[i];
2313 worker.fusions = &vfusions[i];
2314 worker.coverage = &vcoverages[i];
2315
2316 worker.right_map_offset = 0;
2317
2318 if (i == 0)
2319 {
2320 worker.begin_id = 0;
2321 worker.left_map_offset = 0;
2322 }
2323 else
2324 {
2325 size_t offsets_size = offsets[i-1].size();
2326
2327 worker.begin_id = read_ids[i-1];
2328 worker.left_map_offset = offsets[i-1].back();
2329 if (offsets_size == 4)
2330 worker.right_map_offset = offsets[i-1][1];
2331 }
2332
2333 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
2334
2335 if (num_threads > 1 && i + 1 < num_threads)
2336 threads.push_back(new boost::thread(worker));
2337 else
2338 worker();
2339 }
2340
2341 for (size_t i = 0; i < threads.size(); ++i)
2342 {
2343 threads[i]->join();
2344 delete threads[i];
2345 threads[i] = NULL;
2346 }
2347 threads.clear();
2348
2349 JunctionSet& junctions = vjunctions[0];
2350 InsertionSet& insertions = vinsertions[0];
2351 DeletionSet& deletions = vdeletions[0];
2352 FusionSet& fusions = vfusions[0];
2353 Coverage& coverage = vcoverages[0];
2354 for (int i = 1; i < num_threads; ++i)
2355 {
2356 merge_with(junctions, vjunctions[i]);
2357 vjunctions[i].clear();
2358
2359 merge_with(insertions, vinsertions[i]);
2360 vinsertions[i].clear();
2361
2362 merge_with(deletions, vdeletions[i]);
2363 vdeletions[i].clear();
2364
2365 merge_with(fusions, vfusions[i]);
2366 vfusions[i].clear();
2367
2368 coverage.merge_with(vcoverages[i]);
2369 vcoverages[i].clear();
2370 }
2371
2372 merge_with(junctions, gtf_junctions);
2373 coverage.calculate_coverage();
2374
2375 JunctionSet rev_junctions;
2376 JunctionSet::const_iterator junction_iter = junctions.begin();
2377 for (; junction_iter != junctions.end(); ++junction_iter)
2378 {
2379 const Junction& junction = junction_iter->first;
2380 Junction rev_junction = junction;
2381 rev_junction.left = junction.right;
2382 rev_junction.right = junction.left;
2383 rev_junctions[rev_junction] = junction_iter->second;
2384 }
2385
2386 DeletionSet rev_deletions;
2387 #if 0
2388 DeletionSet::const_iterator deletion_iter = deletions.begin();
2389 for (; deletion_iter != deletions.end(); ++deletion_iter)
2390 {
2391 const Deletion& deletion = deletion_iter->first;
2392 Deletion rev_deletion = deletion;
2393 rev_deletion.left = deletion.right;
2394 rev_deletion.right = deletion.left;
2395 rev_deletions[rev_deletion] = deletion_iter->second;
2396 }
2397 #endif
2398
2399 size_t num_unfiltered_juncs = junctions.size();
2400 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
2401
2402 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
2403 filter_junctions(junctions, gtf_junctions);
2404
2405 //size_t num_juncs_after_filter = junctions.size();
2406 //fprintf(stderr, "Filtered %lu junctions\n",
2407 // num_unfiltered_juncs - num_juncs_after_filter);
2408
2409 /*
2410 size_t small_overhangs = 0;
2411 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
2412 {
2413 if (i->second.accepted &&
2414 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
2415 {
2416 small_overhangs++;
2417 }
2418 }
2419
2420 if (small_overhangs >0)
2421 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
2422 */
2423
2424 JunctionSet vfinal_junctions[num_threads];
2425 InsertionSet vfinal_insertions[num_threads];
2426 DeletionSet vfinal_deletions[num_threads];
2427 FusionSet vfinal_fusions[num_threads];
2428
2429 for (int i = 0; i < num_threads; ++i)
2430 {
2431 ReportWorker worker;
2432
2433 worker.sam_header_fname = sam_header;
2434 char filename[1024] = {0};
2435 sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
2436 worker.bam_output_fname = filename;
2437 string tmpoutdir = getFdir(worker.bam_output_fname);
2438 worker.left_um_fname = tmpoutdir;
2439 sprintf(filename, "unmapped_left_%d.bam", i);
2440 worker.left_um_fname+=filename;
2441
2442 if (right_reads_fname != "")
2443 {
2444 sprintf(filename, "unmapped_right_%d.bam", i);
2445 worker.right_um_fname = tmpoutdir;
2446 worker.right_um_fname += filename;
2447 }
2448
2449 worker.left_reads_fname = left_reads_fname;
2450 worker.left_map_fnames = left_map_fnames;
2451 worker.right_reads_fname = right_reads_fname;
2452 worker.right_map_fnames = right_map_fnames;
2453
2454 worker.gtf_junctions = &gtf_junctions;
2455 worker.junctions = &junctions;
2456 worker.rev_junctions = &rev_junctions;
2457 worker.insertions = &insertions;
2458 worker.deletions = &deletions;
2459 worker.rev_deletions = &rev_deletions;
2460 worker.fusions = &fusions;
2461 worker.coverage = &coverage;
2462
2463 worker.final_junctions = &vfinal_junctions[i];
2464 worker.final_insertions = &vfinal_insertions[i];
2465 worker.final_deletions = &vfinal_deletions[i];
2466 worker.final_fusions = &vfinal_fusions[i];
2467 worker.rt = &rt;
2468
2469 worker.right_reads_offset = 0;
2470 worker.right_map_offset = 0;
2471
2472 if (i == 0)
2473 {
2474 worker.begin_id = 0;
2475 worker.left_reads_offset = 0;
2476 worker.left_map_offset = 0;
2477 }
2478 else
2479 {
2480 size_t offsets_size = offsets[i-1].size();
2481
2482 worker.begin_id = read_ids[i-1];
2483 worker.left_reads_offset = offsets[i-1][offsets_size - 2];
2484 worker.left_map_offset = offsets[i-1].back();
2485 if (offsets_size == 4)
2486 {
2487 worker.right_reads_offset = offsets[i-1][0];
2488 worker.right_map_offset = offsets[i-1][1];
2489 }
2490 }
2491
2492 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
2493
2494 if (num_threads > 1 && i + 1 < num_threads)
2495 threads.push_back(new boost::thread(worker));
2496 else
2497 worker();
2498 }
2499
2500 for (size_t i = 0; i < threads.size(); ++i)
2501 {
2502 threads[i]->join();
2503 delete threads[i];
2504 threads[i] = NULL;
2505 }
2506 threads.clear();
2507
2508 JunctionSet& final_junctions = vfinal_junctions[0];
2509 InsertionSet& final_insertions = vfinal_insertions[0];
2510 DeletionSet& final_deletions = vfinal_deletions[0];
2511 FusionSet& final_fusions = vfinal_fusions[0];
2512 for (int i = 1; i < num_threads; ++i)
2513 {
2514 merge_with(final_junctions, vfinal_junctions[i]);
2515 vfinal_junctions[i].clear();
2516
2517 merge_with(final_insertions, vfinal_insertions[i]);
2518 vfinal_insertions[i].clear();
2519
2520 merge_with(final_deletions, vfinal_deletions[i]);
2521 vfinal_deletions[i].clear();
2522
2523 merge_with(final_fusions, vfinal_fusions[i]);
2524 vfinal_fusions[i].clear();
2525 }
2526
2527 fprintf (stderr, "Reporting final accepted alignments...");
2528 fprintf (stderr, "done.\n");
2529
2530 //small_overhangs = 0;
2531 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
2532 {
2533 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
2534 {
2535 final_junctions.erase(i++);
2536 }
2537 else
2538 {
2539 ++i;
2540 }
2541 }
2542
2543 // if (small_overhangs > 0)
2544 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
2545
2546 fprintf (stderr, "Printing junction BED track...");
2547 print_junctions(junctions_out, final_junctions, rt);
2548 fprintf (stderr, "done\n");
2549
2550 fprintf (stderr, "Printing insertions...");
2551 print_insertions(insertions_out, final_insertions,rt);
2552 fclose(insertions_out);
2553 fprintf (stderr, "done\n");
2554
2555 fprintf (stderr, "Printing deletions...");
2556 print_deletions(deletions_out, final_deletions, rt);
2557 fclose(deletions_out);
2558 fprintf (stderr, "done\n");
2559
2560 if (fusion_search)
2561 {
2562 fprintf (stderr, "Printing fusions...");
2563 print_fusions(fusions_out, final_fusions, rt);
2564 fclose(fusions_out);
2565 fprintf (stderr, "done\n");
2566 }
2567
2568 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
2569 }
2570
2571 void print_usage()
2572 {
2573 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
2574
2575 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
2576 }
2577
2578 int main(int argc, char** argv)
2579 {
2580 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
2581 fprintf(stderr, "---------------------------------------\n");
2582
2583 reads_format = FASTQ;
2584
2585 int parse_ret = parse_options(argc, argv, print_usage);
2586 if (parse_ret)
2587 return parse_ret;
2588
2589 if(optind >= argc)
2590 {
2591 print_usage();
2592 return 1;
2593 }
2594
2595 string ref_file_name = argv[optind++];
2596
2597 if(optind >= argc)
2598 {
2599 print_usage();
2600 return 1;
2601 }
2602
2603 string junctions_file_name = argv[optind++];
2604
2605 if(optind >= argc)
2606 {
2607 print_usage();
2608 return 1;
2609 }
2610
2611 string insertions_file_name = argv[optind++];
2612 if(optind >= argc)
2613 {
2614 print_usage();
2615 return 1;
2616 }
2617
2618 string deletions_file_name = argv[optind++];
2619 if(optind >= argc)
2620 {
2621 print_usage();
2622 return 1;
2623 }
2624
2625 string fusions_file_name = argv[optind++];
2626 if(optind >= argc)
2627 {
2628 print_usage();
2629 return 1;
2630 }
2631
2632 string accepted_hits_file_name = argv[optind++];
2633 if(optind >= argc)
2634 {
2635 print_usage();
2636 return 1;
2637 }
2638
2639 string left_map_filename_list = argv[optind++];
2640 vector<string> left_map_filenames;
2641 tokenize(left_map_filename_list, ",", left_map_filenames);
2642
2643 if(optind >= argc)
2644 {
2645 print_usage();
2646 return 1;
2647 }
2648
2649 string left_reads_filename = argv[optind++];
2650 string unzcmd=getUnpackCmd(left_reads_filename, false);
2651 string right_map_filename_list;
2652 vector<string> right_map_filenames;
2653 string right_reads_filename;
2654
2655 if (optind < argc)
2656 {
2657 right_map_filename_list = argv[optind++];
2658 tokenize(right_map_filename_list, ",", right_map_filenames);
2659 if(optind >= argc) {
2660 print_usage();
2661 return 1;
2662 }
2663 right_reads_filename=argv[optind++];
2664 }
2665
2666 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
2667 if (!ref_stream.good())
2668 {
2669 fprintf(stderr, "Error: cannot open %s for reading\n",
2670 ref_file_name.c_str());
2671 exit(1);
2672 }
2673
2674 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
2675 if (junctions_file == NULL)
2676 {
2677 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
2678 junctions_file_name.c_str());
2679 exit(1);
2680 }
2681
2682 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
2683 if (insertions_file == NULL)
2684 {
2685 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
2686 insertions_file_name.c_str());
2687 exit(1);
2688 }
2689
2690 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
2691 if (deletions_file == NULL)
2692 {
2693 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
2694 deletions_file_name.c_str());
2695 exit(1);
2696 }
2697
2698 FILE* fusions_file = NULL;
2699 if (fusion_search)
2700 {
2701 fusions_file = fopen(fusions_file_name.c_str(), "w");
2702 if (fusions_file == NULL)
2703 {
2704 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
2705 fusions_file_name.c_str());
2706 exit(1);
2707 }
2708 }
2709
2710 driver(accepted_hits_file_name,
2711 ref_stream,
2712 left_map_filenames,
2713 left_reads_filename,
2714 right_map_filenames,
2715 right_reads_filename,
2716 junctions_file,
2717 insertions_file,
2718 deletions_file,
2719 fusions_file);
2720
2721 return 0;
2722 }