ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 236
Committed: Fri May 4 22:20:02 2012 UTC (12 years, 4 months ago) by gpertea
File size: 55842 byte(s)
Log Message:
sync, wip prep_reads

Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include <boost/thread.hpp>
34 #include <boost/random/mersenne_twister.hpp>
35
36 #include "common.h"
37 #include "utils.h"
38 #include "bwt_map.h"
39 #include "junctions.h"
40 #include "insertions.h"
41 #include "deletions.h"
42 #include "fusions.h"
43 #include "align_status.h"
44 #include "fragments.h"
45 #include "wiggles.h"
46 #include "tokenize.h"
47 #include "reads.h"
48 #include "coverage.h"
49
50
51 #include "inserts.h"
52
53 using namespace std;
54 using namespace seqan;
55 using std::set;
56
57 static const JunctionSet empty_junctions;
58 static const InsertionSet empty_insertions;
59 static const DeletionSet empty_deletions;
60 static const FusionSet empty_fusions;
61 static const Coverage empty_coverage;
62
63 // daehwan - this is redundancy, which should be removed.
64 void get_seqs(istream& ref_stream,
65 RefSequenceTable& rt,
66 bool keep_seqs = true)
67 {
68 while(ref_stream.good() && !ref_stream.eof())
69 {
70 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
71 string name;
72 readMeta(ref_stream, name, Fasta());
73 string::size_type space_pos = name.find_first_of(" \t\r");
74 if (space_pos != string::npos)
75 {
76 name.resize(space_pos);
77 }
78 seqan::read(ref_stream, *ref_str, Fasta());
79
80 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
81 }
82 }
83
84 struct cmp_read_alignment
85 {
86 bool operator() (const BowtieHit& l, const BowtieHit& r) const
87 {
88 return l.alignment_score() > r.alignment_score();
89 }
90 };
91
92 void read_best_alignments(const HitsForRead& hits_for_read,
93 HitsForRead& best_hits,
94 const JunctionSet& gtf_junctions,
95 const JunctionSet& junctions = empty_junctions,
96 const InsertionSet& insertions = empty_insertions,
97 const DeletionSet& deletions = empty_deletions,
98 const FusionSet& fusions = empty_fusions,
99 const Coverage& coverage = empty_coverage,
100 bool final_report = false)
101 {
102 const vector<BowtieHit>& hits = hits_for_read.hits;
103
104 if (hits.size() >= max_multihits * 5)
105 return;
106
107 for (size_t i = 0; i < hits.size(); ++i)
108 {
109 if (hits[i].edit_dist() > max_read_mismatches)
110 continue;
111
112 BowtieHit hit = hits[i];
113 AlignStatus align_status(hit, gtf_junctions,
114 junctions, insertions, deletions, fusions, coverage);
115 hit.alignment_score(align_status._alignment_score);
116
117 if (report_secondary_alignments || !final_report)
118 {
119 best_hits.hits.push_back(hit);
120 }
121 else
122 {
123 // Is the new status better than the current best one?
124 if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0]))
125 {
126 best_hits.hits.clear();
127 best_hits.hits.push_back(hit);
128 }
129 else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good?
130 {
131 best_hits.hits.push_back(hit);
132 }
133 }
134 }
135
136 if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0)
137 {
138 sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment());
139 }
140
141 if (final_report)
142 {
143 if (best_hits.hits.size() > max_multihits)
144 best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
145 }
146 }
147
148 bool set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade)
149 {
150 bool fusion = false;
151 bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
152 bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
153 if (left_fusion && right_fusion)
154 return false;
155
156 fusion = left_fusion || right_fusion;
157 if (!fusion && lh.ref_id() != rh.ref_id())
158 fusion = true;
159
160 if (!fusion && lh.ref_id() == rh.ref_id())
161 {
162 if (lh.antisense_align() == rh.antisense_align())
163 fusion = true;
164 else
165 {
166 int inner_dist = 0;
167 if (lh.antisense_align())
168 // used rh.left() instead of rh.right() for the cases,
169 // where reads overlap with each other or reads span introns
170 inner_dist = lh.left() - rh.left();
171 else
172 inner_dist = rh.left() - lh.left();
173
174 if (inner_dist < 0 || inner_dist > (int)fusion_min_dist)
175 fusion = true;
176 }
177 }
178
179 grade = InsertAlignmentGrade(lh, rh, fusion);
180
181 return true;
182 }
183
184 struct cmp_pair_alignment
185 {
186 cmp_pair_alignment(const JunctionSet& gtf_juncs) :
187 _gtf_juncs(&gtf_juncs) {}
188
189 bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
190 {
191 InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, gl);
192 InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, gr);
193
194 bool better = gr < gl;
195 bool worse = gl < gr;
196
197 if (better && !worse)
198 return true;
199 else
200 return false;
201 }
202
203 const JunctionSet* _gtf_juncs;
204 };
205
206 void pair_best_alignments(const HitsForRead& left_hits,
207 const HitsForRead& right_hits,
208 InsertAlignmentGrade& best_grade,
209 vector<pair<BowtieHit, BowtieHit> >& best_hits,
210 const JunctionSet& gtf_junctions,
211 const JunctionSet& junctions = empty_junctions,
212 const InsertionSet& insertions = empty_insertions,
213 const DeletionSet& deletions = empty_deletions,
214 const FusionSet& fusions = empty_fusions,
215 const Coverage& coverage = empty_coverage,
216 bool final_report = false)
217 {
218 const vector<BowtieHit>& left = left_hits.hits;
219 const vector<BowtieHit>& right = right_hits.hits;
220
221 if (left.size() >= max_multihits * 5 || right.size() >= max_multihits * 5)
222 return;
223
224 for (size_t i = 0; i < left.size(); ++i)
225 {
226 if (left[i].edit_dist() > max_read_mismatches) continue;
227
228 BowtieHit lh = left[i];
229 AlignStatus align_status(lh, gtf_junctions,
230 junctions, insertions, deletions, fusions, coverage);
231 lh.alignment_score(align_status._alignment_score);
232
233 for (size_t j = 0; j < right.size(); ++j)
234 {
235 if (right[j].edit_dist() > max_read_mismatches) continue;
236
237 BowtieHit rh = right[j];
238 AlignStatus align_status(rh, gtf_junctions,
239 junctions, insertions, deletions, fusions, coverage);
240 rh.alignment_score(align_status._alignment_score);
241 InsertAlignmentGrade g;
242 bool allowed = set_insert_alignment_grade(lh, rh, g);
243
244 // daehwan - for debugging purposes
245 if (lh.insert_id() == 325708 && !g.is_fusion() && false)
246 {
247 fprintf(stderr, "lh %d:%d %s score: %d (from %d) NM: %d\n",
248 lh.ref_id(), lh.left(), print_cigar(lh.cigar()).c_str(),
249 lh.alignment_score(), left[i].alignment_score(), lh.edit_dist());
250 fprintf(stderr, "rh %d:%d %s score: %d (from %d) NM: %d\n",
251 rh.ref_id(), rh.left(), print_cigar(rh.cigar()).c_str(),
252 rh.alignment_score(), right[j].alignment_score(), rh.edit_dist());
253 fprintf(stderr, "combined score: %d is_fusion(%d)\n", g.align_score(), g.is_fusion());
254 }
255
256 if (!allowed) continue;
257 if (!fusion_search && !report_discordant_pair_alignments && g.is_fusion()) continue;
258
259 if (report_secondary_alignments || !final_report)
260 {
261 best_hits.push_back(make_pair(lh, rh));
262 }
263 else
264 {
265 // Is the new status better than the current best one?
266 if (best_grade < g)
267 {
268 best_hits.clear();
269 best_grade = g;
270 best_hits.push_back(make_pair(lh, rh));
271 }
272 else if (!(g < best_grade))
273 {
274 best_hits.push_back(make_pair(lh, rh));
275 }
276 }
277 }
278 }
279
280 if ((report_secondary_alignments || !final_report) && best_hits.size() > 0)
281 {
282 cmp_pair_alignment cmp(gtf_junctions);
283 sort(best_hits.begin(), best_hits.end(), cmp);
284 set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade);
285 }
286
287 if (final_report)
288 {
289 if (best_hits.size() > max_multihits)
290 best_hits.erase(best_hits.begin() + max_multihits, best_hits.end());
291 }
292
293 best_grade.num_alignments = best_hits.size();
294 }
295
296 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
297
298 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
299 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
300 int num_hits, const BowtieHit* next_hit, int hitIndex) {
301 bool XS_found = false;
302 if (sam_toks.size()>11) {
303
304 for (size_t i=11;i<sam_toks.size();++i) {
305 if (sam_toks[i].find("NH:i:")==string::npos &&
306 sam_toks[i].find("XS:i:")==string::npos)
307 auxdata.push_back(sam_toks[i]);
308
309 if (sam_toks[i].find("XS:A:")!=string::npos)
310 XS_found = true;
311 }
312 }
313 string aux("NH:i:");
314 str_appendInt(aux, num_hits);
315 auxdata.push_back(aux);
316 if (next_hit) {
317 const char* nh_ref_name = "=";
318 nh_ref_name = rt.get_name(next_hit->ref_id());
319 assert (nh_ref_name != NULL);
320 bool same_contig=(next_hit->ref_id()==bh.ref_id());
321 aux="CC:Z:";
322 aux+= (same_contig ? "=" : nh_ref_name);
323 auxdata.push_back(aux);
324 aux="CP:i:";
325 int nh_gpos=next_hit->left() + 1;
326 str_appendInt(aux, nh_gpos);
327 auxdata.push_back(aux);
328 } //has next_hit
329 // FIXME: this code is still a bit brittle, because it contains no
330 // consistency check that the mates are on opposite strands - a current protocol
331 // requirement, and that the strand indicated by the alignment is consistent
332 // with the orientation of the splices (though that should be handled upstream).
333 if (!XS_found) {
334 const string xs_f("XS:A:+");
335 const string xs_r("XS:A:-");
336 if (library_type == FR_FIRSTSTRAND) {
337 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
338 if (bh.antisense_align())
339 auxdata.push_back(xs_f);
340 else
341 auxdata.push_back(xs_r);
342 }
343 else {
344 if (bh.antisense_align())
345 auxdata.push_back(xs_r);
346 else
347 auxdata.push_back(xs_f);
348 }
349 }
350 else if (library_type == FR_SECONDSTRAND) {
351 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
352 if (bh.antisense_align())
353 auxdata.push_back(xs_r);
354 else
355 auxdata.push_back(xs_f);
356 }
357 else
358 {
359 if (bh.antisense_align())
360 auxdata.push_back(xs_f);
361 else
362 auxdata.push_back(xs_r);
363 }
364 }
365 }
366 if (hitIndex >= 0)
367 {
368 string aux("HI:i:");
369 str_appendInt(aux, hitIndex);
370 auxdata.push_back(aux);
371 }
372 }
373
374 bool rewrite_sam_record(GBamWriter& bam_writer,
375 const RefSequenceTable& rt,
376 const BowtieHit& bh,
377 const char* bwt_buf,
378 const char* read_alt_name,
379 FragmentType insert_side,
380 int num_hits,
381 const BowtieHit* next_hit,
382 bool primary,
383 int hitIndex)
384 {
385 // Rewrite this hit, filling in the alt name, mate mapping
386 // and setting the pair flag
387 vector<string> sam_toks;
388 tokenize(bwt_buf, "\t", sam_toks);
389
390 string ref_name = sam_toks[2], ref_name2 = "";
391 char cigar1[1024] = {0}, cigar2[1024] = {0};
392 string left_seq, right_seq, left_qual, right_qual;
393 int left = -1, left1 = -1, left2 = -1;
394 bool fusion_alignment = false;
395 size_t XF_index = 0;
396 for (size_t i = 11; i < sam_toks.size(); ++i)
397 {
398 string& tok = sam_toks[i];
399 if (strncmp(tok.c_str(), "XF", 2) == 0)
400 {
401 fusion_alignment = true;
402 XF_index = i;
403
404 vector<string> tuple_fields;
405 tokenize(tok.c_str(), " ", tuple_fields);
406 vector<string> contigs;
407 tokenize(tuple_fields[1].c_str(), "-", contigs);
408 if (contigs.size() >= 2)
409 {
410 ref_name = contigs[0];
411 ref_name2 = contigs[1];
412 }
413
414 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
415 cigar1, cigar2, left_seq, right_seq,
416 left_qual, right_qual, left1, left2);
417
418 break;
419 }
420
421 else if (strncmp(tok.c_str(), "AS", 2) == 0)
422 {
423 char AS_score[128] = {0};
424 sprintf(AS_score, "AS:i:%d", bh.alignment_score());
425 tok = AS_score;
426 }
427 }
428
429 string qname(read_alt_name);
430 size_t slash_pos=qname.rfind('/');
431 if (slash_pos!=string::npos)
432 qname.resize(slash_pos);
433 //read_alt_name as QNAME
434 int flag=atoi(sam_toks[1].c_str()); //FLAG
435 if (insert_side != FRAG_UNPAIRED) {
436 //flag = atoi(sam_toks[1].c_str());
437 // mark this as a singleton mate
438 flag |= 0x0001;
439 if (insert_side == FRAG_LEFT)
440 flag |= 0x0040;
441 else if (insert_side == FRAG_RIGHT)
442 flag |= 0x0080;
443 flag |= 0x0008;
444 //char flag_buf[64];
445 //sprintf(flag_buf, "%d", flag);
446 //sam_toks[t] = flag_buf;
447 }
448 if (!primary)
449 flag |= 0x100;
450
451 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
452 int mapQ = 50;
453 if (num_hits > 1) {
454 double err_prob = 1 - (1.0 / num_hits);
455 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
456 }
457 int tlen =atoi(sam_toks[8].c_str()); //TLEN
458 int mate_pos=atoi(sam_toks[7].c_str());
459
460 GBamRecord* bamrec=NULL;
461 if (fusion_alignment) {
462 vector<string> auxdata;
463 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
464 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
465 cigar1, sam_toks[6].c_str(), mate_pos,
466 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
467 bam_writer.write(bamrec);
468 delete bamrec;
469
470 auxdata.clear();
471 sam_toks[XF_index][5] = '2';
472 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
473 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
474 cigar2, sam_toks[6].c_str(), mate_pos,
475 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
476 bam_writer.write(bamrec);
477 delete bamrec;
478 } else {
479 vector<string> auxdata;
480 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
481 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
482 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
483 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
484 bam_writer.write(bamrec);
485 delete bamrec;
486 }
487
488 return true;
489 }
490
491 bool rewrite_sam_record(GBamWriter& bam_writer,
492 const RefSequenceTable& rt,
493 const BowtieHit& bh,
494 const char* bwt_buf,
495 const char* read_alt_name,
496 const InsertAlignmentGrade& grade,
497 FragmentType insert_side,
498 const BowtieHit* partner,
499 int num_hits,
500 const BowtieHit* next_hit,
501 bool primary,
502 int hitIndex)
503 {
504 // Rewrite this hit, filling in the alt name, mate mapping
505 // and setting the pair flag
506 vector<string> sam_toks;
507 tokenize(bwt_buf, "\t", sam_toks);
508 string qname(read_alt_name);
509 size_t slash_pos=qname.rfind('/');
510 if (slash_pos!=string::npos)
511 qname.resize(slash_pos);
512 //read_alt_name as QNAME
513 int flag = atoi(sam_toks[1].c_str());
514 // 0x0010 (strand of query) is assumed to be set correctly
515 // to begin with
516 flag |= 0x0001; //it must be paired
517 if (insert_side == FRAG_LEFT)
518 flag |= 0x0040;
519 else if (insert_side == FRAG_RIGHT)
520 flag |= 0x0080;
521 if (!primary)
522 flag |= 0x100;
523
524 string ref_name = sam_toks[2], ref_name2 = "";
525 char cigar1[1024] = {0}, cigar2[1024] = {0};
526 string left_seq, right_seq, left_qual, right_qual;
527 int left = -1, left1 = -1, left2 = -1;
528 bool fusion_alignment = false;
529 size_t XF_tok_idx = 11;
530 for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
531 {
532 string& tok = sam_toks[XF_tok_idx];
533 if (strncmp(tok.c_str(), "XF", 2) == 0)
534 {
535 fusion_alignment = true;
536
537 vector<string> tuple_fields;
538 tokenize(tok.c_str(), " ", tuple_fields);
539 vector<string> contigs;
540 tokenize(tuple_fields[1].c_str(), "-", contigs);
541 if (contigs.size() >= 2)
542 {
543 ref_name = contigs[0];
544 ref_name2 = contigs[1];
545 }
546
547 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
548 cigar1, cigar2, left_seq, right_seq,
549 left_qual, right_qual, left1, left2);
550
551 break;
552 }
553
554 else if (strncmp(tok.c_str(), "AS", 2) == 0)
555 {
556 char AS_score[128] = {0};
557 sprintf(AS_score, "AS:i:%d", bh.alignment_score());
558 tok = AS_score;
559 }
560 }
561
562 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
563 int mapQ = 50;
564 if (grade.num_alignments > 1) {
565 double err_prob = 1 - (1.0 / grade.num_alignments);
566 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
567 }
568 int tlen=0; //TLEN
569 int mate_pos=atoi(sam_toks[7].c_str());
570 if (partner) {
571 if (partner->ref_id()==bh.ref_id()) {
572 sam_toks[6] = "="; //same chromosome
573 //TLEN:
574 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
575 partner->left() - bh.right();
576 }
577
578 else { //partner on different chromosome/contig
579 sam_toks[6] = rt.get_name(partner->ref_id());
580 }
581 mate_pos = partner->left() + 1;
582 if (grade.happy())
583 flag |= 0x0002;
584 if (partner->antisense_align())
585 flag |= 0x0020;
586
587 if (partner->fusion_opcode() != FUSION_NOTHING)
588 {
589 char partner_pos[1024];
590 sprintf(partner_pos, "XP:Z:%s-%s %d", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1);
591 sam_toks.push_back(partner_pos);
592 }
593 }
594 else {
595 sam_toks[6] = "*";
596 mate_pos = 0;
597 flag |= 0x0008;
598 }
599
600 GBamRecord* bamrec=NULL;
601 if (fusion_alignment) {
602 vector<string> auxdata;
603 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
604 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
605 cigar1, sam_toks[6].c_str(), mate_pos,
606 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
607 bam_writer.write(bamrec);
608 delete bamrec;
609
610 auxdata.clear();
611 sam_toks[XF_tok_idx][5] = '2';
612 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
613 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
614 cigar2, sam_toks[6].c_str(), mate_pos,
615 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
616 bam_writer.write(bamrec);
617 delete bamrec;
618 } else {
619 vector<string> auxdata;
620 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
621 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
622 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
623 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
624 bam_writer.write(bamrec);
625 delete bamrec;
626 }
627
628 return true;
629 }
630
631 struct lex_hit_sort
632 {
633 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
634 : _rt(rt), _hits(hits)
635 {}
636
637 bool operator()(const uint32_t& l, const uint32_t& r) const
638 {
639 const BowtieHit& lhs = _hits.hits[l];
640 const BowtieHit& rhs = _hits.hits[r];
641
642 uint32_t l_id = lhs.ref_id();
643 uint32_t r_id = rhs.ref_id();
644 if (l_id != r_id)
645 return l_id < r_id;
646
647 return lhs.left() < rhs.left();
648 }
649
650 const RefSequenceTable& _rt;
651 const HitsForRead& _hits;
652 };
653
654 void print_sam_for_single(const RefSequenceTable& rt,
655 const HitsForRead& hits,
656 FragmentType frag_type,
657 Read& read,
658 GBamWriter& bam_writer,
659 GBamWriter* um_out, //write unmapped reads here,
660 boost::random::mt19937& rng
661 )
662 {
663 assert(!read.alt_name.empty());
664 if (hits.hits.empty())
665 return;
666
667 lex_hit_sort s(rt, hits);
668 vector<uint32_t> index_vector;
669
670 size_t i;
671 for (i = 0; i < hits.hits.size(); ++i)
672 index_vector.push_back(i);
673
674 sort(index_vector.begin(), index_vector.end(), s);
675
676 size_t primaryHit = 0;
677 if (!report_secondary_alignments)
678 primaryHit = rng() % hits.hits.size();
679
680 bool multipleHits = (hits.hits.size() > 1);
681 for (i = 0; i < hits.hits.size(); ++i)
682 {
683 size_t index = index_vector[i];
684 bool primary = (index == primaryHit);
685 const BowtieHit& bh = hits.hits[index];
686 rewrite_sam_record(bam_writer, rt,
687 bh,
688 bh.hitfile_rec().c_str(),
689 read.alt_name.c_str(),
690 frag_type,
691 hits.hits.size(),
692 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
693 primary,
694 (multipleHits? i: -1));
695 }
696 }
697
698 void print_sam_for_pair(const RefSequenceTable& rt,
699 const vector<pair<BowtieHit, BowtieHit> >& best_hits,
700 const InsertAlignmentGrade& grade,
701 ReadStream& left_reads_file,
702 ReadStream& right_reads_file,
703 GBamWriter& bam_writer,
704 GBamWriter* left_um_out,
705 GBamWriter* right_um_out,
706 boost::random::mt19937& rng,
707 uint64_t begin_id = 0,
708 uint64_t end_id = std::numeric_limits<uint64_t>::max())
709 {
710 Read left_read;
711 Read right_read;
712 if (best_hits.empty())
713 return;
714
715 size_t i;
716 HitsForRead right_hits;
717 for (i = 0; i < best_hits.size(); ++i)
718 right_hits.hits.push_back(best_hits[i].second);
719
720 size_t primaryHit = 0;
721 vector<uint32_t> index_vector;
722 lex_hit_sort s(rt, right_hits);
723 for (i = 0; i < right_hits.hits.size(); ++i)
724 index_vector.push_back(i);
725
726 sort(index_vector.begin(), index_vector.end(), s);
727
728 if (!report_secondary_alignments)
729 primaryHit = rng() % right_hits.hits.size();
730
731 bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), left_read,
732 reads_format, false, begin_id, end_id,
733 left_um_out, false);
734
735 bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), right_read,
736 reads_format, false, begin_id, end_id,
737 right_um_out, false);
738
739 assert (got_left_read && got_right_read);
740 bool multipleHits = (best_hits.size() > 1);
741 for (i = 0; i < best_hits.size(); ++i)
742 {
743 size_t index = index_vector[i];
744 bool primary = (index == primaryHit);
745 const BowtieHit& right_bh = best_hits[index].second;
746 const BowtieHit& left_bh = best_hits[index].first;
747
748 rewrite_sam_record(bam_writer, rt,
749 right_bh,
750 right_bh.hitfile_rec().c_str(),
751 right_read.alt_name.c_str(),
752 grade,
753 FRAG_RIGHT,
754 &left_bh,
755 best_hits.size(),
756 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL,
757 primary,
758 (multipleHits? i: -1));
759 rewrite_sam_record(bam_writer, rt,
760 left_bh,
761 left_bh.hitfile_rec().c_str(),
762 left_read.alt_name.c_str(),
763 grade,
764 FRAG_LEFT,
765 &right_bh,
766 best_hits.size(),
767 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL,
768 primary,
769 (multipleHits? i: -1));
770 }
771 }
772
773 /**
774 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
775 * @param hits hits The alignments for a particular read
776 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
777 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
778 */
779 void update_insertions_and_deletions(const HitsForRead& hits,
780 InsertionSet& insertions,
781 DeletionSet& deletions)
782 {
783 for (size_t i = 0; i < hits.hits.size(); ++i)
784 {
785 const BowtieHit& bh = hits.hits[i];
786 insertions_from_alignment(bh, insertions);
787 deletions_from_alignment(bh, deletions);
788 }
789 }
790
791 void update_coverage(const HitsForRead& hits,
792 Coverage& coverage)
793 {
794 for (size_t i = 0; i < hits.hits.size(); ++i)
795 {
796 const BowtieHit& hit = hits.hits[i];
797 const vector<CigarOp>& cigar = hit.cigar();
798 unsigned int positionInGenome = hit.left();
799 RefID ref_id = hit.ref_id();
800
801 for(size_t c = 0; c < cigar.size(); ++c)
802 {
803 int opcode = cigar[c].opcode;
804 int length = cigar[c].length;
805 switch(opcode)
806 {
807 case REF_SKIP:
808 case MATCH:
809 case DEL:
810 if (opcode == MATCH)
811 coverage.add_coverage(ref_id, positionInGenome, length);
812
813 positionInGenome += length;
814 break;
815 case rEF_SKIP:
816 case mATCH:
817 case dEL:
818 positionInGenome -= length;
819
820 if (opcode == mATCH)
821 coverage.add_coverage(ref_id, positionInGenome + 1, length);
822 break;
823 case FUSION_FF:
824 case FUSION_FR:
825 case FUSION_RF:
826 case FUSION_RR:
827 positionInGenome = length;
828 ref_id = hit.ref_id2();
829 break;
830 default:
831 break;
832 }
833 }
834 }
835 }
836
837
838 void update_fusions(const HitsForRead& hits,
839 RefSequenceTable& rt,
840 FusionSet& fusions,
841 const FusionSet& fusions_ref = empty_fusions)
842 {
843 if (!fusion_search)
844 return;
845
846 if (hits.hits.size() > fusion_multireads)
847 return;
848
849 bool update_stat = fusions_ref.size() > 0;
850 for (size_t i = 0; i < hits.hits.size(); ++i)
851 {
852 const BowtieHit& bh = hits.hits[i];
853
854 if (bh.edit_dist() > fusion_read_mismatches)
855 continue;
856
857 fusions_from_alignment(bh, fusions, rt, update_stat);
858
859 if (update_stat)
860 unsupport_fusions(bh, fusions, fusions_ref);
861 }
862 }
863
864 void update_junctions(const HitsForRead& hits,
865 JunctionSet& junctions)
866 {
867 for (size_t i = 0; i < hits.hits.size(); ++i)
868 {
869 const BowtieHit& bh = hits.hits[i];
870 junctions_from_alignment(bh, junctions);
871 }
872 }
873
874 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
875 // resets the stream when finished.
876 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits)
877 {
878 HitsForRead remaining;
879 remaining.insert_id = hits.insert_id;
880
881 for (size_t i = 0; i < hits.hits.size(); ++i)
882 {
883 BowtieHit& bh = hits.hits[i];
884 if (bh.edit_dist() > max_read_mismatches)
885 continue;
886
887 bool filter_hit = false;
888 if (!bh.contiguous())
889 {
890 JunctionSet bh_juncs;
891 junctions_from_alignment(bh, bh_juncs);
892 for (JunctionSet::iterator itr = bh_juncs.begin();
893 itr != bh_juncs.end();
894 itr++)
895 {
896 const Junction& j = itr->first;
897 JunctionSet::const_iterator target = junctions.find(j);
898 if (target == junctions.end() || !target->second.accepted)
899 {
900 filter_hit = true;
901 break;
902 }
903 }
904 }
905 if (!filter_hit)
906 remaining.hits.push_back(bh);
907 }
908 hits = remaining;
909 }
910
911 // events include splice junction, indels, and fusion points.
912 struct ConsensusEventsWorker
913 {
914 void operator()()
915 {
916 ReadTable it;
917 vector<BAMHitFactory*> hit_factories;
918 hit_factories.push_back(new BAMHitFactory(it, *rt));
919 HitStream l_hs(left_map_fname, hit_factories.back(), false, true, true, true);
920 if (left_map_offset > 0)
921 l_hs.seek(left_map_offset);
922
923 hit_factories.push_back(new BAMHitFactory(it, *rt));
924 HitStream r_hs(right_map_fname, hit_factories.back(), false, true, true, true);
925 if (right_map_offset > 0)
926 r_hs.seek(right_map_offset);
927
928 HitsForRead curr_left_hit_group;
929 HitsForRead curr_right_hit_group;
930
931 l_hs.next_read_hits(curr_left_hit_group);
932 r_hs.next_read_hits(curr_right_hit_group);
933
934 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
935 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
936
937 // While we still have unreported hits...
938 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
939 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
940 {
941 // Chew up left singletons
942 while (curr_left_obs_order < curr_right_obs_order &&
943 curr_left_obs_order < end_id &&
944 curr_left_obs_order != VMAXINT32)
945 {
946 HitsForRead best_hits;
947 best_hits.insert_id = curr_left_obs_order;
948
949 // Process hits for left singleton, select best alignments
950 read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions);
951 update_coverage(best_hits, *coverage);
952 update_junctions(best_hits, *junctions);
953 update_insertions_and_deletions(best_hits, *insertions, *deletions);
954 update_fusions(best_hits, *rt, *fusions);
955
956 // Get next hit group
957 l_hs.next_read_hits(curr_left_hit_group);
958 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
959 }
960
961 // Chew up right singletons
962 while (curr_left_obs_order > curr_right_obs_order &&
963 curr_right_obs_order < end_id &&
964 curr_right_obs_order != VMAXINT32)
965 {
966 HitsForRead best_hits;
967 best_hits.insert_id = curr_right_obs_order;
968 if (curr_right_obs_order >= begin_id)
969 {
970 // Process hit for right singleton, select best alignments
971 read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions);
972 update_coverage(best_hits, *coverage);
973 update_junctions(best_hits, *junctions);
974 update_insertions_and_deletions(best_hits, *insertions, *deletions);
975 update_fusions(best_hits, *rt, *fusions);
976 }
977
978 // Get next hit group
979 r_hs.next_read_hits(curr_right_hit_group);
980 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
981 }
982
983 // Since we have both left hits and right hits for this insert,
984 // Find the best pairing and print both
985 while (curr_left_obs_order == curr_right_obs_order &&
986 curr_left_obs_order < end_id &&
987 curr_left_obs_order != VMAXINT32)
988 {
989 vector<pair<BowtieHit, BowtieHit> > best_hits;
990
991 InsertAlignmentGrade grade;
992 pair_best_alignments(curr_left_hit_group,
993 curr_right_hit_group,
994 grade,
995 best_hits,
996 *gtf_junctions);
997
998 HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
999 HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
1000
1001 for (size_t i = 0; i < best_hits.size(); ++i)
1002 {
1003 best_left_hit_group.hits.push_back(best_hits[i].first);
1004 best_right_hit_group.hits.push_back(best_hits[i].second);
1005 }
1006
1007 update_coverage(best_left_hit_group, *coverage);
1008 update_junctions(best_left_hit_group, *junctions);
1009 update_insertions_and_deletions(best_left_hit_group, *insertions, *deletions);
1010 update_fusions(best_left_hit_group, *rt, *fusions);
1011
1012 update_coverage(best_right_hit_group, *coverage);
1013 update_junctions(best_right_hit_group, *junctions);
1014 update_insertions_and_deletions(best_right_hit_group, *insertions, *deletions);
1015 update_fusions(best_right_hit_group, *rt, *fusions);
1016
1017 l_hs.next_read_hits(curr_left_hit_group);
1018 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1019
1020 r_hs.next_read_hits(curr_right_hit_group);
1021 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1022 }
1023 }
1024
1025 for (size_t i = 0; i < hit_factories.size(); ++i)
1026 delete hit_factories[i];
1027
1028 hit_factories.clear();
1029 }
1030
1031 string left_map_fname;
1032 string right_map_fname;
1033
1034 RefSequenceTable* rt;
1035
1036 JunctionSet* gtf_junctions;
1037
1038 uint64_t begin_id;
1039 uint64_t end_id;
1040 int64_t left_map_offset;
1041 int64_t right_map_offset;
1042
1043 JunctionSet* junctions;
1044 InsertionSet* insertions;
1045 DeletionSet* deletions;
1046 FusionSet* fusions;
1047
1048 Coverage* coverage;
1049 };
1050
1051 struct ReportWorker
1052 {
1053 void operator()()
1054 {
1055 rng.seed(1);
1056
1057 ReadTable it;
1058 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
1059
1060 ReadStream left_reads_file(left_reads_fname);
1061 if (left_reads_file.file() == NULL)
1062 err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
1063 if (left_reads_file.isBam()) {
1064 left_reads_file.use_alt_name();
1065 left_reads_file.ignoreQC();
1066 }
1067 if (left_reads_offset > 0)
1068 left_reads_file.seek(left_reads_offset);
1069
1070 //if (!zpacker.empty()) left_um_fname+=".z";
1071 GBamWriter* left_um_out=new GBamWriter(left_um_fname.c_str(), sam_header.c_str());
1072 GBamWriter* right_um_out=NULL;
1073
1074 ReadStream right_reads_file(right_reads_fname);
1075 if (right_reads_offset > 0)
1076 right_reads_file.seek(right_reads_offset);
1077
1078 //FZPipe right_um_out;
1079 if (!right_reads_fname.empty())
1080 {
1081 if (right_reads_file.isBam()) {
1082 right_reads_file.use_alt_name();
1083 right_reads_file.ignoreQC();
1084 right_um_out=new GBamWriter(right_um_fname.c_str(), sam_header.c_str());
1085 }
1086 //if (!zpacker.empty()) right_um_fname+=".z";
1087 //if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
1088 // err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
1089 }
1090
1091 vector<BAMHitFactory*> hit_factories;
1092 hit_factories.push_back(new BAMHitFactory(it, *rt));
1093 HitStream left_hs(left_map_fname, hit_factories.back(), false, true, true, true);
1094 if (left_map_offset > 0)
1095 left_hs.seek(left_map_offset);
1096
1097 hit_factories.push_back(new BAMHitFactory(it, *rt));
1098 HitStream right_hs(right_map_fname, hit_factories.back(), false, true, true, true);
1099 if (right_map_offset > 0)
1100 right_hs.seek(right_map_offset);
1101
1102 HitsForRead curr_left_hit_group;
1103 HitsForRead curr_right_hit_group;
1104
1105 left_hs.next_read_hits(curr_left_hit_group);
1106 right_hs.next_read_hits(curr_right_hit_group);
1107
1108 uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1109 uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1110
1111 const bool final_report = true;
1112
1113 // While we still have unreported hits...
1114 Read l_read;
1115 Read r_read;
1116 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
1117 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
1118 {
1119 // Chew up left singletons (pairs with right reads unmapped)
1120 while (curr_left_obs_order < curr_right_obs_order &&
1121 curr_left_obs_order < end_id &&
1122 curr_left_obs_order != VMAXINT32)
1123 {
1124 HitsForRead best_hits;
1125 best_hits.insert_id = curr_left_obs_order;
1126 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1127 reads_format, false, begin_id, end_id,
1128 left_um_out, false);
1129 //assert(got_read);
1130
1131 if (right_reads_file.file()) {
1132 got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1133 reads_format, false, begin_id, end_id,
1134 right_um_out, true);
1135 //assert(got_read);
1136 }
1137
1138 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1139
1140 // Process hits for left singleton, select best alignments
1141 read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions,
1142 *junctions, *insertions, *deletions, *fusions, *coverage,
1143 final_report);
1144
1145 if (best_hits.hits.size() > 0)
1146 {
1147 update_junctions(best_hits, *final_junctions);
1148 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1149 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1150
1151 print_sam_for_single(*rt,
1152 best_hits,
1153 (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
1154 l_read,
1155 bam_writer,
1156 left_um_out,
1157 rng);
1158 }
1159 else
1160 {
1161 }
1162
1163 // Get next hit group
1164 left_hs.next_read_hits(curr_left_hit_group);
1165 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1166 } //left singletons
1167
1168 // Chew up right singletons
1169 while (curr_left_obs_order > curr_right_obs_order &&
1170 curr_right_obs_order < end_id &&
1171 curr_right_obs_order != VMAXINT32)
1172 {
1173 HitsForRead best_hits;
1174 best_hits.insert_id = curr_right_obs_order;
1175
1176 if (curr_right_obs_order >= begin_id)
1177 {
1178 bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read,
1179 reads_format, false, begin_id, end_id,
1180 right_um_out, false);
1181
1182 //assert(got_read);
1183 /*
1184 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1185 r_read.seq.c_str(), r_read.qual.c_str());
1186 */
1187 got_read=left_reads_file.getRead(curr_right_obs_order, l_read,
1188 reads_format, false, begin_id, end_id,
1189 left_um_out, true);
1190 //assert(got_read);
1191
1192 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1193
1194 // Process hit for right singleton, select best alignments
1195 read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions,
1196 *junctions, *insertions, *deletions, *fusions, *coverage,
1197 final_report);
1198
1199 if (best_hits.hits.size() > 0)
1200 {
1201 update_junctions(best_hits, *final_junctions);
1202 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1203 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1204
1205 print_sam_for_single(*rt,
1206 best_hits,
1207 FRAG_RIGHT,
1208 r_read,
1209 bam_writer,
1210 right_um_out,
1211 rng);
1212 }
1213 else
1214 {
1215 }
1216 }
1217
1218 // Get next hit group
1219 right_hs.next_read_hits(curr_right_hit_group);
1220 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1221 }
1222
1223 // Since we have both left hits and right hits for this insert,
1224 // Find the best pairing and print both
1225 while (curr_left_obs_order == curr_right_obs_order &&
1226 curr_left_obs_order < end_id &&
1227 curr_left_obs_order != VMAXINT32)
1228 {
1229 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1230 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1231 if (curr_left_hit_group.hits.empty())
1232 { //only right read mapped
1233 //write it in the mapped file with the #MAPPED# flag
1234
1235 bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1236 reads_format, false, begin_id, end_id,
1237 right_um_out, false);
1238 //assert(got_read);
1239 /*
1240 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1241 r_read.seq.c_str(), r_read.qual.c_str());
1242 */
1243 HitsForRead right_best_hits;
1244 right_best_hits.insert_id = curr_right_obs_order;
1245
1246 read_best_alignments(curr_right_hit_group, right_best_hits, *gtf_junctions,
1247 *junctions, *insertions, *deletions, *fusions, *coverage,
1248 final_report);
1249
1250 if (right_best_hits.hits.size() > 0)
1251 {
1252 update_junctions(right_best_hits, *final_junctions);
1253 update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1254 update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1255
1256 print_sam_for_single(*rt,
1257 right_best_hits,
1258 FRAG_RIGHT,
1259 r_read,
1260 bam_writer,
1261 right_um_out,
1262 rng);
1263 }
1264 }
1265 else if (curr_right_hit_group.hits.empty())
1266 {
1267 HitsForRead left_best_hits;
1268 left_best_hits.insert_id = curr_left_obs_order;
1269 //only left read mapped
1270 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1271 reads_format, false, begin_id, end_id,
1272 left_um_out, false);
1273
1274 // Process hits for left singleton, select best alignments
1275 read_best_alignments(curr_left_hit_group, left_best_hits, *gtf_junctions,
1276 *junctions, *insertions, *deletions, *fusions, *coverage,
1277 final_report);
1278
1279 if (left_best_hits.hits.size() > 0)
1280 {
1281 update_junctions(left_best_hits, *final_junctions);
1282 update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1283 update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1284
1285 print_sam_for_single(*rt,
1286 left_best_hits,
1287 FRAG_LEFT,
1288 l_read,
1289 bam_writer,
1290 left_um_out,
1291 rng);
1292 }
1293 }
1294 else
1295 {
1296 vector<pair<BowtieHit, BowtieHit> > best_hits;
1297
1298 InsertAlignmentGrade grade;
1299 pair_best_alignments(curr_left_hit_group,
1300 curr_right_hit_group,
1301 grade, best_hits, *gtf_junctions,
1302 *junctions, *insertions, *deletions, *fusions, *coverage,
1303 final_report);
1304
1305 HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
1306 HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
1307
1308 for (size_t i = 0; i < best_hits.size(); ++i)
1309 {
1310 best_left_hit_group.hits.push_back(best_hits[i].first);
1311 best_right_hit_group.hits.push_back(best_hits[i].second);
1312 }
1313
1314 if (best_hits.size() > 0)
1315 {
1316 update_junctions(best_left_hit_group, *final_junctions);
1317 update_insertions_and_deletions(best_left_hit_group, *final_insertions, *final_deletions);
1318 update_fusions(best_left_hit_group, *rt, *final_fusions, *fusions);
1319
1320 update_junctions(best_right_hit_group, *final_junctions);
1321 update_insertions_and_deletions(best_right_hit_group, *final_insertions, *final_deletions);
1322 update_fusions(best_right_hit_group, *rt, *final_fusions, *fusions);
1323
1324 pair_support(best_hits, *final_fusions, *fusions);
1325
1326 print_sam_for_pair(*rt,
1327 best_hits,
1328 grade,
1329 left_reads_file,
1330 right_reads_file,
1331 bam_writer,
1332 left_um_out,
1333 right_um_out,
1334 rng,
1335 begin_id,
1336 end_id);
1337 }
1338 else
1339 {
1340 }
1341 }
1342
1343 left_hs.next_read_hits(curr_left_hit_group);
1344 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1345
1346 right_hs.next_read_hits(curr_right_hit_group);
1347 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1348 }
1349 } //while we still have unreported hits..
1350 //print the remaining unmapped reads at the end of each reads' stream
1351
1352 left_reads_file.getRead(VMAXINT32, l_read,
1353 reads_format, false, begin_id, end_id,
1354 left_um_out, false);
1355 if (right_reads_file.file())
1356 right_reads_file.getRead(VMAXINT32, r_read,
1357 reads_format, false, begin_id, end_id,
1358 right_um_out, false);
1359
1360
1361 // pclose (pipe close), which waits for a process to end, seems to conflict with boost::thread::join somehow,
1362 // resulting in deadlock like behavior.
1363 delete left_um_out;
1364 delete right_um_out;
1365
1366
1367 for (size_t i = 0; i < hit_factories.size(); ++i)
1368 delete hit_factories[i];
1369
1370 hit_factories.clear();
1371 }
1372
1373 string bam_output_fname;
1374 string sam_header_fname;
1375
1376 string left_reads_fname;
1377 string left_map_fname;
1378 string right_reads_fname;
1379 string right_map_fname;
1380
1381 string left_um_fname;
1382 string right_um_fname;
1383
1384 JunctionSet* gtf_junctions;
1385
1386 JunctionSet* junctions;
1387 InsertionSet* insertions;
1388 DeletionSet* deletions;
1389 FusionSet* fusions;
1390 Coverage* coverage;
1391
1392 JunctionSet* final_junctions;
1393 InsertionSet* final_insertions;
1394 DeletionSet* final_deletions;
1395 FusionSet* final_fusions;
1396
1397 RefSequenceTable* rt;
1398
1399 uint64_t begin_id;
1400 uint64_t end_id;
1401 int64_t left_reads_offset;
1402 int64_t left_map_offset;
1403 int64_t right_reads_offset;
1404 int64_t right_map_offset;
1405
1406 boost::random::mt19937 rng;
1407 };
1408
1409 void driver(const string& bam_output_fname,
1410 istream& ref_stream,
1411 const string& left_map_fname,
1412 const string& left_reads_fname,
1413 const string& right_map_fname,
1414 const string& right_reads_fname,
1415 FILE* junctions_out,
1416 FILE* insertions_out,
1417 FILE* deletions_out,
1418 FILE* fusions_out)
1419 {
1420 if (!parallel)
1421 num_threads = 1;
1422
1423 RefSequenceTable rt(sam_header, true);
1424
1425 if (fusion_search)
1426 get_seqs(ref_stream, rt, true);
1427
1428 srandom(1);
1429
1430 JunctionSet gtf_junctions;
1431 if (!gtf_juncs.empty())
1432 {
1433 char splice_buf[4096];
1434 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
1435 if (splice_coords)
1436 {
1437 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
1438 {
1439 char* nl = strrchr(splice_buf, '\n');
1440 char* buf = splice_buf;
1441 if (nl) *nl = 0;
1442
1443 char* ref_name = get_token((char**)&buf, "\t");
1444 char* scan_left_coord = get_token((char**)&buf, "\t");
1445 char* scan_right_coord = get_token((char**)&buf, "\t");
1446 char* orientation = get_token((char**)&buf, "\t");
1447
1448 if (!scan_left_coord || !scan_right_coord || !orientation)
1449 {
1450 fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
1451 gtf_juncs.c_str(), buf);
1452 exit(1);
1453 }
1454
1455 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
1456 uint32_t left_coord = atoi(scan_left_coord);
1457 uint32_t right_coord = atoi(scan_right_coord);
1458 bool antisense = *orientation == '-';
1459
1460 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
1461 }
1462 }
1463 fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
1464 }
1465
1466 vector<uint64_t> read_ids;
1467 vector<vector<int64_t> > offsets;
1468 if (num_threads > 1)
1469 {
1470 vector<string> fnames;
1471 if (right_map_fname != "")
1472 {
1473 fnames.push_back(right_reads_fname);
1474 fnames.push_back(right_map_fname);
1475 }
1476 fnames.push_back(left_reads_fname);
1477 fnames.push_back(left_map_fname);
1478 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
1479 if (!enough_data)
1480 num_threads = 1;
1481 }
1482
1483 JunctionSet vjunctions[num_threads];
1484 InsertionSet vinsertions[num_threads];
1485 DeletionSet vdeletions[num_threads];
1486 FusionSet vfusions[num_threads];
1487 Coverage vcoverages[num_threads];
1488
1489 vector<boost::thread*> threads;
1490 for (int i = 0; i < num_threads; ++i)
1491 {
1492 ConsensusEventsWorker worker;
1493
1494 worker.left_map_fname = left_map_fname;
1495 worker.right_map_fname = right_map_fname;
1496 worker.rt = &rt;
1497 worker.gtf_junctions = &gtf_junctions;
1498
1499 worker.junctions = &vjunctions[i];
1500 worker.insertions = &vinsertions[i];
1501 worker.deletions = &vdeletions[i];
1502 worker.fusions = &vfusions[i];
1503 worker.coverage = &vcoverages[i];
1504
1505 worker.right_map_offset = 0;
1506
1507 if (i == 0)
1508 {
1509 worker.begin_id = 0;
1510 worker.left_map_offset = 0;
1511 }
1512 else
1513 {
1514 size_t offsets_size = offsets[i-1].size();
1515
1516 worker.begin_id = read_ids[i-1];
1517 worker.left_map_offset = offsets[i-1].back();
1518 if (offsets_size == 4)
1519 worker.right_map_offset = offsets[i-1][1];
1520 }
1521
1522 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1523
1524 if (num_threads > 1)
1525 threads.push_back(new boost::thread(worker));
1526 else
1527 worker();
1528 }
1529
1530 for (size_t i = 0; i < threads.size(); ++i)
1531 {
1532 threads[i]->join();
1533 delete threads[i];
1534 threads[i] = NULL;
1535 }
1536 threads.clear();
1537
1538 JunctionSet& junctions = vjunctions[0];
1539 InsertionSet& insertions = vinsertions[0];
1540 DeletionSet& deletions = vdeletions[0];
1541 FusionSet& fusions = vfusions[0];
1542 Coverage& coverage = vcoverages[0];
1543 for (size_t i = 1; i < num_threads; ++i)
1544 {
1545 merge_with(junctions, vjunctions[i]);
1546 vjunctions[i].clear();
1547
1548 merge_with(insertions, vinsertions[i]);
1549 vinsertions[i].clear();
1550
1551 merge_with(deletions, vdeletions[i]);
1552 vdeletions[i].clear();
1553
1554 merge_with(fusions, vfusions[i]);
1555 vfusions[i].clear();
1556
1557 coverage.merge_with(vcoverages[i]);
1558 vcoverages[i].clear();
1559 }
1560
1561 coverage.calculate_coverage();
1562
1563 size_t num_unfiltered_juncs = junctions.size();
1564 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
1565
1566 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1567 filter_junctions(junctions, gtf_junctions);
1568
1569 //size_t num_juncs_after_filter = junctions.size();
1570 //fprintf(stderr, "Filtered %lu junctions\n",
1571 // num_unfiltered_juncs - num_juncs_after_filter);
1572
1573 /*
1574 size_t small_overhangs = 0;
1575 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1576 {
1577 if (i->second.accepted &&
1578 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1579 {
1580 small_overhangs++;
1581 }
1582 }
1583
1584 if (small_overhangs >0)
1585 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1586 */
1587
1588 JunctionSet vfinal_junctions[num_threads];
1589 InsertionSet vfinal_insertions[num_threads];
1590 DeletionSet vfinal_deletions[num_threads];
1591 FusionSet vfinal_fusions[num_threads];
1592
1593 for (int i = 0; i < num_threads; ++i)
1594 {
1595 ReportWorker worker;
1596
1597 worker.sam_header_fname = sam_header;
1598 char filename[1024] = {0};
1599 sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
1600 worker.bam_output_fname = filename;
1601 string tmpoutdir=getFdir(worker.bam_output_fname);
1602 worker.left_um_fname = tmpoutdir;
1603 sprintf(filename, "unmapped_left_%d.bam", i);
1604 worker.left_um_fname+=filename;
1605
1606 if (right_reads_fname != "")
1607 {
1608 sprintf(filename, "unmapped_right_%d.bam", i);
1609 worker.right_um_fname = tmpoutdir;
1610 worker.right_um_fname += filename;
1611 }
1612
1613 worker.left_reads_fname = left_reads_fname;
1614 worker.left_map_fname = left_map_fname;
1615 worker.right_reads_fname = right_reads_fname;
1616 worker.right_map_fname = right_map_fname;
1617
1618 worker.gtf_junctions = &gtf_junctions;
1619 worker.junctions = &junctions;
1620 worker.insertions = &insertions;
1621 worker.deletions = &deletions;
1622 worker.fusions = &fusions;
1623 worker.coverage = &coverage;
1624
1625 worker.final_junctions = &vfinal_junctions[i];
1626 worker.final_insertions = &vfinal_insertions[i];
1627 worker.final_deletions = &vfinal_deletions[i];
1628 worker.final_fusions = &vfinal_fusions[i];
1629 worker.rt = &rt;
1630
1631 worker.right_reads_offset = 0;
1632 worker.right_map_offset = 0;
1633
1634 if (i == 0)
1635 {
1636 worker.begin_id = 0;
1637 worker.left_reads_offset = 0;
1638 worker.left_map_offset = 0;
1639 }
1640 else
1641 {
1642 size_t offsets_size = offsets[i-1].size();
1643
1644 worker.begin_id = read_ids[i-1];
1645 worker.left_reads_offset = offsets[i-1][offsets_size - 2];
1646 worker.left_map_offset = offsets[i-1].back();
1647 if (offsets_size == 4)
1648 {
1649 worker.right_reads_offset = offsets[i-1][0];
1650 worker.right_map_offset = offsets[i-1][1];
1651 }
1652 }
1653
1654 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1655
1656 if (num_threads > 1)
1657 threads.push_back(new boost::thread(worker));
1658 else
1659 worker();
1660 }
1661
1662 for (size_t i = 0; i < threads.size(); ++i)
1663 {
1664 threads[i]->join();
1665 delete threads[i];
1666 threads[i] = NULL;
1667 }
1668 threads.clear();
1669
1670 JunctionSet final_junctions = vfinal_junctions[0];
1671 InsertionSet final_insertions = vfinal_insertions[0];
1672 DeletionSet final_deletions = vfinal_deletions[0];
1673 FusionSet final_fusions = vfinal_fusions[0];
1674 for (size_t i = 1; i < num_threads; ++i)
1675 {
1676 merge_with(final_junctions, vfinal_junctions[i]);
1677 vfinal_junctions[i].clear();
1678
1679 merge_with(final_insertions, vfinal_insertions[i]);
1680 vfinal_insertions[i].clear();
1681
1682 merge_with(final_deletions, vfinal_deletions[i]);
1683 vfinal_deletions[i].clear();
1684
1685 merge_with(final_fusions, vfinal_fusions[i]);
1686 vfinal_fusions[i].clear();
1687 }
1688
1689 fprintf (stderr, "Reporting final accepted alignments...");
1690 fprintf (stderr, "done.\n");
1691
1692 //small_overhangs = 0;
1693 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1694 {
1695 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
1696 {
1697 final_junctions.erase(i++);
1698 }
1699 else
1700 {
1701 ++i;
1702 }
1703 }
1704
1705 // if (small_overhangs > 0)
1706 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1707
1708 fprintf (stderr, "Printing junction BED track...");
1709 print_junctions(junctions_out, final_junctions, rt);
1710 fprintf (stderr, "done\n");
1711
1712 fprintf (stderr, "Printing insertions...");
1713 print_insertions(insertions_out, final_insertions,rt);
1714 fclose(insertions_out);
1715 fprintf (stderr, "done\n");
1716
1717 fprintf (stderr, "Printing deletions...");
1718 print_deletions(deletions_out, final_deletions, rt);
1719 fclose(deletions_out);
1720 fprintf (stderr, "done\n");
1721
1722 if (fusion_search)
1723 {
1724 fprintf (stderr, "Printing fusions...");
1725 print_fusions(fusions_out, final_fusions, rt);
1726 fclose(fusions_out);
1727 fprintf (stderr, "done\n");
1728 }
1729
1730 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1731 }
1732
1733 void print_usage()
1734 {
1735 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1736
1737 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1738 }
1739
1740 int main(int argc, char** argv)
1741 {
1742 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1743 fprintf(stderr, "---------------------------------------\n");
1744
1745 reads_format = FASTQ;
1746
1747 int parse_ret = parse_options(argc, argv, print_usage);
1748 if (parse_ret)
1749 return parse_ret;
1750
1751 if(optind >= argc)
1752 {
1753 print_usage();
1754 return 1;
1755 }
1756
1757 string ref_file_name = argv[optind++];
1758
1759 if(optind >= argc)
1760 {
1761 print_usage();
1762 return 1;
1763 }
1764
1765 string junctions_file_name = argv[optind++];
1766
1767 if(optind >= argc)
1768 {
1769 print_usage();
1770 return 1;
1771 }
1772
1773 string insertions_file_name = argv[optind++];
1774 if(optind >= argc)
1775 {
1776 print_usage();
1777 return 1;
1778 }
1779
1780 string deletions_file_name = argv[optind++];
1781 if(optind >= argc)
1782 {
1783 print_usage();
1784 return 1;
1785 }
1786
1787 string fusions_file_name = argv[optind++];
1788 if(optind >= argc)
1789 {
1790 print_usage();
1791 return 1;
1792 }
1793
1794 string accepted_hits_file_name = argv[optind++];
1795 if(optind >= argc)
1796 {
1797 print_usage();
1798 return 1;
1799 }
1800
1801 string left_map_filename = argv[optind++];
1802 if(optind >= argc)
1803 {
1804 print_usage();
1805 return 1;
1806 }
1807
1808 string left_reads_filename = argv[optind++];
1809 string unzcmd=getUnpackCmd(left_reads_filename, false);
1810 string right_map_filename;
1811 string right_reads_filename;
1812
1813 if (optind < argc)
1814 {
1815 right_map_filename = argv[optind++];
1816 if(optind >= argc) {
1817 print_usage();
1818 return 1;
1819 }
1820 right_reads_filename=argv[optind++];
1821 }
1822
1823 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1824 if (!ref_stream.good())
1825 {
1826 fprintf(stderr, "Error: cannot open %s for reading\n",
1827 ref_file_name.c_str());
1828 exit(1);
1829 }
1830
1831 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1832 if (junctions_file == NULL)
1833 {
1834 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1835 junctions_file_name.c_str());
1836 exit(1);
1837 }
1838
1839 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1840 if (insertions_file == NULL)
1841 {
1842 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1843 insertions_file_name.c_str());
1844 exit(1);
1845 }
1846
1847 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1848 if (deletions_file == NULL)
1849 {
1850 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1851 deletions_file_name.c_str());
1852 exit(1);
1853 }
1854
1855 FILE* fusions_file = NULL;
1856 if (fusion_search)
1857 {
1858 fusions_file = fopen(fusions_file_name.c_str(), "w");
1859 if (fusions_file == NULL)
1860 {
1861 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1862 fusions_file_name.c_str());
1863 exit(1);
1864 }
1865 }
1866
1867 driver(accepted_hits_file_name,
1868 ref_stream,
1869 left_map_filename,
1870 left_reads_filename,
1871 right_map_filename,
1872 right_reads_filename,
1873 junctions_file,
1874 insertions_file,
1875 deletions_file,
1876 fusions_file);
1877
1878 return 0;
1879 }