ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 206
Committed: Mon Mar 12 18:02:35 2012 UTC (12 years, 5 months ago) by gpertea
File size: 54693 byte(s)
Log Message:
updated

Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include <boost/thread.hpp>
34
35 #include "common.h"
36 #include "utils.h"
37 #include "bwt_map.h"
38 #include "junctions.h"
39 #include "insertions.h"
40 #include "deletions.h"
41 #include "fusions.h"
42 #include "align_status.h"
43 #include "fragments.h"
44 #include "wiggles.h"
45 #include "tokenize.h"
46 #include "reads.h"
47 #include "coverage.h"
48
49
50 #include "inserts.h"
51
52 using namespace std;
53 using namespace seqan;
54 using std::set;
55
56 static const JunctionSet empty_junctions;
57 static const InsertionSet empty_insertions;
58 static const DeletionSet empty_deletions;
59 static const FusionSet empty_fusions;
60 static const Coverage empty_coverage;
61
62 // daehwan - this is redundancy, which should be removed.
63 void get_seqs(istream& ref_stream,
64 RefSequenceTable& rt,
65 bool keep_seqs = true)
66 {
67 while(ref_stream.good() && !ref_stream.eof())
68 {
69 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
70 string name;
71 readMeta(ref_stream, name, Fasta());
72 string::size_type space_pos = name.find_first_of(" \t\r");
73 if (space_pos != string::npos)
74 {
75 name.resize(space_pos);
76 }
77 seqan::read(ref_stream, *ref_str, Fasta());
78
79 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
80 }
81 }
82
83 struct cmp_read_alignment
84 {
85 bool operator() (const BowtieHit& l, const BowtieHit& r) const
86 {
87 return l.alignment_score() > r.alignment_score();
88 }
89 };
90
91 void read_best_alignments(const HitsForRead& hits_for_read,
92 HitsForRead& best_hits,
93 const JunctionSet& gtf_junctions,
94 const JunctionSet& junctions = empty_junctions,
95 const InsertionSet& insertions = empty_insertions,
96 const DeletionSet& deletions = empty_deletions,
97 const FusionSet& fusions = empty_fusions,
98 const Coverage& coverage = empty_coverage,
99 bool final_report = false)
100 {
101 const vector<BowtieHit>& hits = hits_for_read.hits;
102 for (size_t i = 0; i < hits.size(); ++i)
103 {
104 if (hits[i].edit_dist() > max_read_mismatches)
105 continue;
106
107 BowtieHit hit = hits[i];
108 AlignStatus align_status(hit, gtf_junctions,
109 junctions, insertions, deletions, fusions, coverage);
110 hit.alignment_score(align_status._alignment_score);
111
112 if (report_secondary_alignments || !final_report)
113 {
114 best_hits.hits.push_back(hit);
115 }
116 else
117 {
118 // Is the new status better than the current best one?
119 if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0]))
120 {
121 best_hits.hits.clear();
122 best_hits.hits.push_back(hit);
123 }
124 else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good?
125 {
126 best_hits.hits.push_back(hit);
127 }
128 }
129 }
130
131 if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0)
132 {
133 sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment());
134 }
135
136 if (final_report)
137 {
138 if (best_hits.hits.size() > max_multihits)
139 best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
140 }
141 }
142
143 bool set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade)
144 {
145 // max mate inner distance (genomic)
146 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
147 if (max_mate_inner_dist == -1)
148 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
149
150 bool fusion = false;
151 bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
152 bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
153 if (left_fusion && right_fusion)
154 return false;
155
156 fusion = left_fusion || right_fusion;
157 if (!fusion && lh.ref_id() != rh.ref_id())
158 fusion = true;
159
160 if (!fusion && lh.ref_id() == rh.ref_id())
161 {
162 if (lh.antisense_align() == rh.antisense_align())
163 fusion = true;
164 else
165 {
166 int inner_dist = 0;
167 if (lh.antisense_align())
168 // used rh.left() instead of rh.right() for the cases,
169 // where reads overlap with each other or reads span introns
170 inner_dist = lh.left() - rh.left();
171 else
172 inner_dist = rh.left() - lh.left();
173
174 if (inner_dist < 0 || inner_dist > (int)fusion_min_dist)
175 fusion = true;
176 }
177 }
178
179 grade = InsertAlignmentGrade(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
180
181 return true;
182 }
183
184 struct cmp_pair_alignment
185 {
186 cmp_pair_alignment(const JunctionSet& gtf_juncs) :
187 _gtf_juncs(&gtf_juncs) {}
188
189 bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
190 {
191 InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, gl);
192 InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, gr);
193
194 bool better = gr < gl;
195 bool worse = gl < gr;
196
197 if (better && !worse)
198 return true;
199 else
200 return false;
201 }
202
203 const JunctionSet* _gtf_juncs;
204 };
205
206 void pair_best_alignments(const HitsForRead& left_hits,
207 const HitsForRead& right_hits,
208 InsertAlignmentGrade& best_grade,
209 vector<pair<BowtieHit, BowtieHit> >& best_hits,
210 const JunctionSet& gtf_junctions,
211 const JunctionSet& junctions = empty_junctions,
212 const InsertionSet& insertions = empty_insertions,
213 const DeletionSet& deletions = empty_deletions,
214 const FusionSet& fusions = empty_fusions,
215 const Coverage& coverage = empty_coverage,
216 bool final_report = false)
217 {
218 const vector<BowtieHit>& left = left_hits.hits;
219 const vector<BowtieHit>& right = right_hits.hits;
220
221 for (size_t i = 0; i < left.size(); ++i)
222 {
223 if (left[i].edit_dist() > max_read_mismatches) continue;
224
225 BowtieHit lh = left[i];
226 AlignStatus align_status(lh, gtf_junctions,
227 junctions, insertions, deletions, fusions, coverage);
228 lh.alignment_score(align_status._alignment_score);
229
230 for (size_t j = 0; j < right.size(); ++j)
231 {
232 if (right[j].edit_dist() > max_read_mismatches) continue;
233
234 BowtieHit rh = right[j];
235 AlignStatus align_status(rh, gtf_junctions,
236 junctions, insertions, deletions, fusions, coverage);
237 rh.alignment_score(align_status._alignment_score);
238 InsertAlignmentGrade g;
239 bool allowed = set_insert_alignment_grade(lh, rh, g);
240
241 if (!allowed) continue;
242 if (!fusion_search && !report_discordant_pair_alignments && g.is_fusion()) continue;
243
244 if (report_secondary_alignments || !final_report)
245 {
246 best_hits.push_back(make_pair(lh, rh));
247 }
248 else
249 {
250 // Is the new status better than the current best one?
251 if (best_grade < g)
252 {
253 best_hits.clear();
254 best_grade = g;
255 best_hits.push_back(make_pair(lh, rh));
256 }
257 else if (!(g < best_grade))
258 {
259 best_hits.push_back(make_pair(lh, rh));
260 }
261 }
262 }
263 }
264
265 if ((report_secondary_alignments || !final_report) && best_hits.size() > 0)
266 {
267 cmp_pair_alignment cmp(gtf_junctions);
268 sort(best_hits.begin(), best_hits.end(), cmp);
269 set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade);
270 }
271
272 if (final_report)
273 {
274 if (best_hits.size() > max_multihits)
275 best_hits.erase(best_hits.begin() + max_multihits, best_hits.end());
276 }
277
278 best_grade.num_alignments = best_hits.size();
279 }
280
281 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
282
283 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
284 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
285 int num_hits, const BowtieHit* next_hit, int hitIndex) {
286 bool XS_found = false;
287 if (sam_toks.size()>11) {
288
289 for (size_t i=11;i<sam_toks.size();++i) {
290 if (sam_toks[i].find("NH:i:")==string::npos &&
291 sam_toks[i].find("XS:i:")==string::npos)
292 auxdata.push_back(sam_toks[i]);
293
294 if (sam_toks[i].find("XS:A:")!=string::npos)
295 XS_found = true;
296 }
297 }
298 string aux("NH:i:");
299 str_appendInt(aux, num_hits);
300 auxdata.push_back(aux);
301 if (next_hit) {
302 const char* nh_ref_name = "=";
303 nh_ref_name = rt.get_name(next_hit->ref_id());
304 assert (nh_ref_name != NULL);
305 bool same_contig=(next_hit->ref_id()==bh.ref_id());
306 aux="CC:Z:";
307 aux+= (same_contig ? "=" : nh_ref_name);
308 auxdata.push_back(aux);
309 aux="CP:i:";
310 int nh_gpos=next_hit->left() + 1;
311 str_appendInt(aux, nh_gpos);
312 auxdata.push_back(aux);
313 } //has next_hit
314 // FIXME: this code is still a bit brittle, because it contains no
315 // consistency check that the mates are on opposite strands - a current protocol
316 // requirement, and that the strand indicated by the alignment is consistent
317 // with the orientation of the splices (though that should be handled upstream).
318 if (!XS_found) {
319 const string xs_f("XS:A:+");
320 const string xs_r("XS:A:-");
321 if (library_type == FR_FIRSTSTRAND) {
322 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
323 if (bh.antisense_align())
324 auxdata.push_back(xs_f);
325 else
326 auxdata.push_back(xs_r);
327 }
328 else {
329 if (bh.antisense_align())
330 auxdata.push_back(xs_r);
331 else
332 auxdata.push_back(xs_f);
333 }
334 }
335 else if (library_type == FR_SECONDSTRAND) {
336 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
337 if (bh.antisense_align())
338 auxdata.push_back(xs_r);
339 else
340 auxdata.push_back(xs_f);
341 }
342 else
343 {
344 if (bh.antisense_align())
345 auxdata.push_back(xs_f);
346 else
347 auxdata.push_back(xs_r);
348 }
349 }
350 }
351 if (hitIndex >= 0)
352 {
353 string aux("HI:i:");
354 str_appendInt(aux, hitIndex);
355 auxdata.push_back(aux);
356 }
357 }
358
359 bool rewrite_sam_record(GBamWriter& bam_writer,
360 const RefSequenceTable& rt,
361 const BowtieHit& bh,
362 const char* bwt_buf,
363 const char* read_alt_name,
364 FragmentType insert_side,
365 int num_hits,
366 const BowtieHit* next_hit,
367 bool primary,
368 int hitIndex)
369 {
370 // Rewrite this hit, filling in the alt name, mate mapping
371 // and setting the pair flag
372 vector<string> sam_toks;
373 tokenize(bwt_buf, "\t", sam_toks);
374
375 string ref_name = sam_toks[2], ref_name2 = "";
376 char cigar1[1024] = {0}, cigar2[1024] = {0};
377 string left_seq, right_seq, left_qual, right_qual;
378 int left = -1, left1 = -1, left2 = -1;
379 bool fusion_alignment = false;
380 size_t XF_index = 0;
381 for (size_t i = 11; i < sam_toks.size(); ++i)
382 {
383 string& tok = sam_toks[i];
384 if (strncmp(tok.c_str(), "XF", 2) == 0)
385 {
386 fusion_alignment = true;
387 XF_index = i;
388
389 vector<string> tuple_fields;
390 tokenize(tok.c_str(), " ", tuple_fields);
391 vector<string> contigs;
392 tokenize(tuple_fields[1].c_str(), "-", contigs);
393 if (contigs.size() >= 2)
394 {
395 ref_name = contigs[0];
396 ref_name2 = contigs[1];
397 }
398
399 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
400 cigar1, cigar2, left_seq, right_seq,
401 left_qual, right_qual, left1, left2);
402
403 break;
404 }
405
406 else if (strncmp(tok.c_str(), "AS", 2) == 0)
407 {
408 char AS_score[128] = {0};
409 sprintf(AS_score, "AS:i:%d", bh.alignment_score());
410 tok = AS_score;
411 }
412 }
413
414 string qname(read_alt_name);
415 size_t slash_pos=qname.rfind('/');
416 if (slash_pos!=string::npos)
417 qname.resize(slash_pos);
418 //read_alt_name as QNAME
419 int flag=atoi(sam_toks[1].c_str()); //FLAG
420 if (insert_side != FRAG_UNPAIRED) {
421 //flag = atoi(sam_toks[1].c_str());
422 // mark this as a singleton mate
423 flag |= 0x0001;
424 if (insert_side == FRAG_LEFT)
425 flag |= 0x0040;
426 else if (insert_side == FRAG_RIGHT)
427 flag |= 0x0080;
428 flag |= 0x0008;
429 //char flag_buf[64];
430 //sprintf(flag_buf, "%d", flag);
431 //sam_toks[t] = flag_buf;
432 }
433 if (!primary)
434 flag |= 0x100;
435
436 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
437 int mapQ=255;
438 if (num_hits > 1) {
439 double err_prob = 1 - (1.0 / num_hits);
440 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
441 }
442 int tlen =atoi(sam_toks[8].c_str()); //TLEN
443 int mate_pos=atoi(sam_toks[7].c_str());
444
445 GBamRecord* bamrec=NULL;
446 if (fusion_alignment) {
447 vector<string> auxdata;
448 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
449 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
450 cigar1, sam_toks[6].c_str(), mate_pos,
451 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
452 bam_writer.write(bamrec);
453 delete bamrec;
454
455 auxdata.clear();
456 sam_toks[XF_index][5] = '2';
457 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
458 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
459 cigar2, sam_toks[6].c_str(), mate_pos,
460 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
461 bam_writer.write(bamrec);
462 delete bamrec;
463 } else {
464 vector<string> auxdata;
465 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
466 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
467 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
468 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
469 bam_writer.write(bamrec);
470 delete bamrec;
471 }
472
473 return true;
474 }
475
476 bool rewrite_sam_record(GBamWriter& bam_writer,
477 const RefSequenceTable& rt,
478 const BowtieHit& bh,
479 const char* bwt_buf,
480 const char* read_alt_name,
481 const InsertAlignmentGrade& grade,
482 FragmentType insert_side,
483 const BowtieHit* partner,
484 int num_hits,
485 const BowtieHit* next_hit,
486 bool primary,
487 int hitIndex)
488 {
489 // Rewrite this hit, filling in the alt name, mate mapping
490 // and setting the pair flag
491 vector<string> sam_toks;
492 tokenize(bwt_buf, "\t", sam_toks);
493 string qname(read_alt_name);
494 size_t slash_pos=qname.rfind('/');
495 if (slash_pos!=string::npos)
496 qname.resize(slash_pos);
497 //read_alt_name as QNAME
498 int flag = atoi(sam_toks[1].c_str());
499 // 0x0010 (strand of query) is assumed to be set correctly
500 // to begin with
501 flag |= 0x0001; //it must be paired
502 if (insert_side == FRAG_LEFT)
503 flag |= 0x0040;
504 else if (insert_side == FRAG_RIGHT)
505 flag |= 0x0080;
506 if (!primary)
507 flag |= 0x100;
508
509 string ref_name = sam_toks[2], ref_name2 = "";
510 char cigar1[1024] = {0}, cigar2[1024] = {0};
511 string left_seq, right_seq, left_qual, right_qual;
512 int left = -1, left1 = -1, left2 = -1;
513 bool fusion_alignment = false;
514 size_t XF_tok_idx = 11;
515 for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
516 {
517 string& tok = sam_toks[XF_tok_idx];
518 if (strncmp(tok.c_str(), "XF", 2) == 0)
519 {
520 fusion_alignment = true;
521
522 vector<string> tuple_fields;
523 tokenize(tok.c_str(), " ", tuple_fields);
524 vector<string> contigs;
525 tokenize(tuple_fields[1].c_str(), "-", contigs);
526 if (contigs.size() >= 2)
527 {
528 ref_name = contigs[0];
529 ref_name2 = contigs[1];
530 }
531
532 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
533 cigar1, cigar2, left_seq, right_seq,
534 left_qual, right_qual, left1, left2);
535
536 break;
537 }
538
539 else if (strncmp(tok.c_str(), "AS", 2) == 0)
540 {
541 char AS_score[128] = {0};
542 sprintf(AS_score, "AS:i:%d", bh.alignment_score());
543 tok = AS_score;
544 }
545 }
546
547 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
548 int mapQ=255;
549 if (grade.num_alignments > 1) {
550 double err_prob = 1 - (1.0 / grade.num_alignments);
551 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
552 }
553 int tlen=0; //TLEN
554 int mate_pos=atoi(sam_toks[7].c_str());
555 if (partner) {
556 if (partner->ref_id()==bh.ref_id()) {
557 sam_toks[6] = "="; //same chromosome
558 //TLEN:
559 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
560 partner->left() - bh.right();
561 }
562
563 else { //partner on different chromosome/contig
564 sam_toks[6] = rt.get_name(partner->ref_id());
565 }
566 mate_pos = partner->left() + 1;
567 if (grade.happy())
568 flag |= 0x0002;
569 if (partner->antisense_align())
570 flag |= 0x0020;
571
572 if (partner->fusion_opcode() != FUSION_NOTHING)
573 {
574 char partner_pos[1024];
575 sprintf(partner_pos, "XP:Z:%s-%s %d", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1);
576 sam_toks.push_back(partner_pos);
577 }
578 }
579 else {
580 sam_toks[6] = "*";
581 mate_pos = 0;
582 flag |= 0x0008;
583 }
584
585 GBamRecord* bamrec=NULL;
586 if (fusion_alignment) {
587 vector<string> auxdata;
588 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
589 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
590 cigar1, sam_toks[6].c_str(), mate_pos,
591 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
592 bam_writer.write(bamrec);
593 delete bamrec;
594
595 auxdata.clear();
596 sam_toks[XF_tok_idx][5] = '2';
597 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
598 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
599 cigar2, sam_toks[6].c_str(), mate_pos,
600 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
601 bam_writer.write(bamrec);
602 delete bamrec;
603 } else {
604 vector<string> auxdata;
605 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
606 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
607 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
608 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
609 bam_writer.write(bamrec);
610 delete bamrec;
611 }
612
613 return true;
614 }
615
616 struct lex_hit_sort
617 {
618 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
619 : _rt(rt), _hits(hits)
620 {}
621
622 bool operator()(const uint32_t& l, const uint32_t& r) const
623 {
624 const BowtieHit& lhs = _hits.hits[l];
625 const BowtieHit& rhs = _hits.hits[r];
626
627 uint32_t l_id = lhs.ref_id();
628 uint32_t r_id = rhs.ref_id();
629 if (l_id != r_id)
630 return l_id < r_id;
631
632 return lhs.left() < rhs.left();
633 }
634
635 const RefSequenceTable& _rt;
636 const HitsForRead& _hits;
637 };
638
639 void print_sam_for_single(const RefSequenceTable& rt,
640 const HitsForRead& hits,
641 FragmentType frag_type,
642 Read& read,
643 GBamWriter& bam_writer,
644 FILE* um_out //write unmapped reads here
645 )
646 {
647 assert(!read.alt_name.empty());
648 if (hits.hits.empty())
649 return;
650
651 lex_hit_sort s(rt, hits);
652 vector<uint32_t> index_vector;
653
654 size_t i;
655 for (i = 0; i < hits.hits.size(); ++i)
656 index_vector.push_back(i);
657
658 sort(index_vector.begin(), index_vector.end(), s);
659
660 size_t primaryHit = 0;
661 if (!report_secondary_alignments)
662 primaryHit = random() % hits.hits.size();
663
664 bool multipleHits = (hits.hits.size() > 1);
665 for (i = 0; i < hits.hits.size(); ++i)
666 {
667 size_t index = index_vector[i];
668 bool primary = (index == primaryHit);
669 const BowtieHit& bh = hits.hits[index];
670 rewrite_sam_record(bam_writer, rt,
671 bh,
672 bh.hitfile_rec().c_str(),
673 read.alt_name.c_str(),
674 frag_type,
675 hits.hits.size(),
676 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
677 primary,
678 (multipleHits? i: -1));
679 }
680 }
681
682 void print_sam_for_pair(const RefSequenceTable& rt,
683 const vector<pair<BowtieHit, BowtieHit> >& best_hits,
684 const InsertAlignmentGrade& grade,
685 ReadStream& left_reads_file,
686 ReadStream& right_reads_file,
687 GBamWriter& bam_writer,
688 FILE* left_um_out,
689 FILE* right_um_out,
690 uint64_t begin_id = 0,
691 uint64_t end_id = std::numeric_limits<uint64_t>::max())
692 {
693 Read left_read;
694 Read right_read;
695 if (best_hits.empty())
696 return;
697
698 size_t i;
699 HitsForRead right_hits;
700 for (i = 0; i < best_hits.size(); ++i)
701 right_hits.hits.push_back(best_hits[i].second);
702
703 size_t primaryHit = 0;
704 vector<uint32_t> index_vector;
705 lex_hit_sort s(rt, right_hits);
706 for (i = 0; i < right_hits.hits.size(); ++i)
707 index_vector.push_back(i);
708
709 sort(index_vector.begin(), index_vector.end(), s);
710
711 if (!report_secondary_alignments)
712 primaryHit = random() % right_hits.hits.size();
713
714 bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), left_read,
715 reads_format, false, begin_id, end_id,
716 left_um_out, false);
717
718 bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), right_read,
719 reads_format, false, begin_id, end_id,
720 right_um_out, false);
721
722 assert (got_left_read && got_right_read);
723 bool multipleHits = (best_hits.size() > 1);
724 for (i = 0; i < best_hits.size(); ++i)
725 {
726 size_t index = index_vector[i];
727 bool primary = (index == primaryHit);
728 const BowtieHit& right_bh = best_hits[index].second;
729 const BowtieHit& left_bh = best_hits[index].first;
730
731 rewrite_sam_record(bam_writer, rt,
732 right_bh,
733 right_bh.hitfile_rec().c_str(),
734 right_read.alt_name.c_str(),
735 grade,
736 FRAG_RIGHT,
737 &left_bh,
738 best_hits.size(),
739 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL,
740 primary,
741 (multipleHits? i: -1));
742 rewrite_sam_record(bam_writer, rt,
743 left_bh,
744 left_bh.hitfile_rec().c_str(),
745 left_read.alt_name.c_str(),
746 grade,
747 FRAG_LEFT,
748 &right_bh,
749 best_hits.size(),
750 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL,
751 primary,
752 (multipleHits? i: -1));
753 }
754 }
755
756 /**
757 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
758 * @param hits hits The alignments for a particular read
759 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
760 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
761 */
762 void update_insertions_and_deletions(const HitsForRead& hits,
763 InsertionSet& insertions,
764 DeletionSet& deletions)
765 {
766 for (size_t i = 0; i < hits.hits.size(); ++i)
767 {
768 const BowtieHit& bh = hits.hits[i];
769 insertions_from_alignment(bh, insertions);
770 deletions_from_alignment(bh, deletions);
771 }
772 }
773
774 void update_coverage(const HitsForRead& hits,
775 Coverage& coverage)
776 {
777 for (size_t i = 0; i < hits.hits.size(); ++i)
778 {
779 const BowtieHit& hit = hits.hits[i];
780 const vector<CigarOp>& cigar = hit.cigar();
781 unsigned int positionInGenome = hit.left();
782 RefID ref_id = hit.ref_id();
783
784 for(size_t c = 0; c < cigar.size(); ++c)
785 {
786 int opcode = cigar[c].opcode;
787 int length = cigar[c].length;
788 switch(opcode)
789 {
790 case REF_SKIP:
791 case MATCH:
792 case DEL:
793 if (opcode == MATCH)
794 coverage.add_coverage(ref_id, positionInGenome, length);
795
796 positionInGenome += length;
797 break;
798 case rEF_SKIP:
799 case mATCH:
800 case dEL:
801 positionInGenome -= length;
802
803 if (opcode == mATCH)
804 coverage.add_coverage(ref_id, positionInGenome + 1, length);
805 break;
806 case FUSION_FF:
807 case FUSION_FR:
808 case FUSION_RF:
809 case FUSION_RR:
810 positionInGenome = length;
811 ref_id = hit.ref_id2();
812 break;
813 default:
814 break;
815 }
816 }
817 }
818 }
819
820
821 void update_fusions(const HitsForRead& hits,
822 RefSequenceTable& rt,
823 FusionSet& fusions,
824 const FusionSet& fusions_ref = empty_fusions)
825 {
826 if (hits.hits.size() > fusion_multireads)
827 return;
828
829 bool update_stat = fusions_ref.size() > 0;
830 for (size_t i = 0; i < hits.hits.size(); ++i)
831 {
832 const BowtieHit& bh = hits.hits[i];
833
834 if (bh.edit_dist() > fusion_read_mismatches)
835 continue;
836
837 fusions_from_alignment(bh, fusions, rt, update_stat);
838
839 if (update_stat)
840 unsupport_fusions(bh, fusions, fusions_ref);
841 }
842 }
843
844 void update_junctions(const HitsForRead& hits,
845 JunctionSet& junctions)
846 {
847 for (size_t i = 0; i < hits.hits.size(); ++i)
848 {
849 const BowtieHit& bh = hits.hits[i];
850 junctions_from_alignment(bh, junctions);
851 }
852 }
853
854 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
855 // resets the stream when finished.
856 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits)
857 {
858 HitsForRead remaining;
859 remaining.insert_id = hits.insert_id;
860
861 for (size_t i = 0; i < hits.hits.size(); ++i)
862 {
863 BowtieHit& bh = hits.hits[i];
864 if (bh.edit_dist() > max_read_mismatches)
865 continue;
866
867 bool filter_hit = false;
868 if (!bh.contiguous())
869 {
870 JunctionSet bh_juncs;
871 junctions_from_alignment(bh, bh_juncs);
872 for (JunctionSet::iterator itr = bh_juncs.begin();
873 itr != bh_juncs.end();
874 itr++)
875 {
876 const Junction& j = itr->first;
877 JunctionSet::const_iterator target = junctions.find(j);
878 if (target == junctions.end() || !target->second.accepted)
879 {
880 filter_hit = true;
881 break;
882 }
883 }
884 }
885 if (!filter_hit)
886 remaining.hits.push_back(bh);
887 }
888 hits = remaining;
889 }
890
891 // events include splice junction, indels, and fusion points.
892 struct ConsensusEventsWorker
893 {
894 void operator()()
895 {
896 ReadTable it;
897 vector<BAMHitFactory*> hit_factories;
898 hit_factories.push_back(new BAMHitFactory(it, *rt));
899 HitStream l_hs(left_map_fname, hit_factories.back(), false, true, true, true);
900 if (left_map_offset > 0)
901 l_hs.seek(left_map_offset);
902
903 hit_factories.push_back(new BAMHitFactory(it, *rt));
904 HitStream r_hs(right_map_fname, hit_factories.back(), false, true, true, true);
905 if (right_map_offset > 0)
906 r_hs.seek(right_map_offset);
907
908 HitsForRead curr_left_hit_group;
909 HitsForRead curr_right_hit_group;
910
911 l_hs.next_read_hits(curr_left_hit_group);
912 r_hs.next_read_hits(curr_right_hit_group);
913
914 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
915 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
916
917 // While we still have unreported hits...
918 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
919 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
920 {
921 // Chew up left singletons
922 while (curr_left_obs_order < curr_right_obs_order &&
923 curr_left_obs_order < end_id &&
924 curr_left_obs_order != VMAXINT32)
925 {
926 HitsForRead best_hits;
927 best_hits.insert_id = curr_left_obs_order;
928
929 // Process hits for left singleton, select best alignments
930 read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions);
931 update_coverage(best_hits, *coverage);
932 update_junctions(best_hits, *junctions);
933 update_insertions_and_deletions(best_hits, *insertions, *deletions);
934 update_fusions(best_hits, *rt, *fusions);
935
936 // Get next hit group
937 l_hs.next_read_hits(curr_left_hit_group);
938 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
939 }
940
941 // Chew up right singletons
942 while (curr_left_obs_order > curr_right_obs_order &&
943 curr_right_obs_order < end_id &&
944 curr_right_obs_order != VMAXINT32)
945 {
946 HitsForRead best_hits;
947 best_hits.insert_id = curr_right_obs_order;
948 if (curr_right_obs_order >= begin_id)
949 {
950 // Process hit for right singleton, select best alignments
951 read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions);
952 update_coverage(best_hits, *coverage);
953 update_junctions(best_hits, *junctions);
954 update_insertions_and_deletions(best_hits, *insertions, *deletions);
955 update_fusions(best_hits, *rt, *fusions);
956 }
957
958 // Get next hit group
959 r_hs.next_read_hits(curr_right_hit_group);
960 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
961 }
962
963 // Since we have both left hits and right hits for this insert,
964 // Find the best pairing and print both
965 while (curr_left_obs_order == curr_right_obs_order &&
966 curr_left_obs_order < end_id &&
967 curr_left_obs_order != VMAXINT32)
968 {
969 vector<pair<BowtieHit, BowtieHit> > best_hits;
970
971 InsertAlignmentGrade grade;
972 pair_best_alignments(curr_left_hit_group,
973 curr_right_hit_group,
974 grade,
975 best_hits,
976 *gtf_junctions);
977
978 HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
979 HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
980
981 for (size_t i = 0; i < best_hits.size(); ++i)
982 {
983 best_left_hit_group.hits.push_back(best_hits[i].first);
984 best_right_hit_group.hits.push_back(best_hits[i].second);
985 }
986
987 update_coverage(best_left_hit_group, *coverage);
988 update_junctions(best_left_hit_group, *junctions);
989 update_insertions_and_deletions(best_left_hit_group, *insertions, *deletions);
990 update_fusions(best_left_hit_group, *rt, *fusions);
991
992 update_coverage(best_right_hit_group, *coverage);
993 update_junctions(best_right_hit_group, *junctions);
994 update_insertions_and_deletions(best_right_hit_group, *insertions, *deletions);
995 update_fusions(best_right_hit_group, *rt, *fusions);
996
997 l_hs.next_read_hits(curr_left_hit_group);
998 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
999
1000 r_hs.next_read_hits(curr_right_hit_group);
1001 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1002 }
1003 }
1004
1005 for (size_t i = 0; i < hit_factories.size(); ++i)
1006 delete hit_factories[i];
1007
1008 hit_factories.clear();
1009 }
1010
1011 string left_map_fname;
1012 string right_map_fname;
1013
1014 RefSequenceTable* rt;
1015
1016 JunctionSet* gtf_junctions;
1017
1018 uint64_t begin_id;
1019 uint64_t end_id;
1020 int64_t left_map_offset;
1021 int64_t right_map_offset;
1022
1023 JunctionSet* junctions;
1024 InsertionSet* insertions;
1025 DeletionSet* deletions;
1026 FusionSet* fusions;
1027
1028 Coverage* coverage;
1029 };
1030
1031 struct ReportWorker
1032 {
1033 void operator()()
1034 {
1035 ReadTable it;
1036 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
1037
1038 ReadStream left_reads_file(left_reads_fname);
1039 if (left_reads_file.file() == NULL)
1040 err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
1041
1042 if (left_reads_offset > 0)
1043 left_reads_file.seek(left_reads_offset);
1044
1045 if (!zpacker.empty()) left_um_fname+=".z";
1046 FZPipe left_um_out;
1047 if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
1048 err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
1049
1050 ReadStream right_reads_file(right_reads_fname);
1051 if (right_reads_offset > 0)
1052 right_reads_file.seek(right_reads_offset);
1053
1054 FZPipe right_um_out;
1055 if (right_reads_fname != "")
1056 {
1057 if (!zpacker.empty()) right_um_fname+=".z";
1058 if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
1059 err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
1060 }
1061
1062 vector<BAMHitFactory*> hit_factories;
1063 hit_factories.push_back(new BAMHitFactory(it, *rt));
1064 HitStream left_hs(left_map_fname, hit_factories.back(), false, true, true, true);
1065 if (left_map_offset > 0)
1066 left_hs.seek(left_map_offset);
1067
1068 hit_factories.push_back(new BAMHitFactory(it, *rt));
1069 HitStream right_hs(right_map_fname, hit_factories.back(), false, true, true, true);
1070 if (right_map_offset > 0)
1071 right_hs.seek(right_map_offset);
1072
1073 HitsForRead curr_left_hit_group;
1074 HitsForRead curr_right_hit_group;
1075
1076 left_hs.next_read_hits(curr_left_hit_group);
1077 right_hs.next_read_hits(curr_right_hit_group);
1078
1079 uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1080 uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1081
1082 const bool final_report = true;
1083
1084 // While we still have unreported hits...
1085 Read l_read;
1086 Read r_read;
1087 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
1088 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
1089 {
1090 // Chew up left singletons (pairs with right reads unmapped)
1091 while (curr_left_obs_order < curr_right_obs_order &&
1092 curr_left_obs_order < end_id &&
1093 curr_left_obs_order != VMAXINT32)
1094 {
1095 HitsForRead best_hits;
1096 best_hits.insert_id = curr_left_obs_order;
1097 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1098 reads_format, false, begin_id, end_id,
1099 left_um_out.file, false);
1100 //assert(got_read);
1101
1102 if (right_reads_file.file()) {
1103 got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1104 reads_format, false, begin_id, end_id,
1105 right_um_out.file, true);
1106 //assert(got_read);
1107 }
1108
1109 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1110
1111 // Process hits for left singleton, select best alignments
1112 read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions,
1113 *junctions, *insertions, *deletions, *fusions, *coverage,
1114 final_report);
1115
1116 if (best_hits.hits.size() > 0)
1117 {
1118 update_junctions(best_hits, *final_junctions);
1119 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1120 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1121
1122 print_sam_for_single(*rt,
1123 best_hits,
1124 (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
1125 l_read,
1126 bam_writer,
1127 left_um_out.file);
1128 }
1129 // Get next hit group
1130 left_hs.next_read_hits(curr_left_hit_group);
1131 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1132 } //left singletons
1133
1134 // Chew up right singletons
1135 while (curr_left_obs_order > curr_right_obs_order &&
1136 curr_right_obs_order < end_id &&
1137 curr_right_obs_order != VMAXINT32)
1138 {
1139 HitsForRead best_hits;
1140 best_hits.insert_id = curr_right_obs_order;
1141
1142 if (curr_right_obs_order >= begin_id)
1143 {
1144 bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read,
1145 reads_format, false, begin_id, end_id,
1146 right_um_out.file, false);
1147
1148 //assert(got_read);
1149 /*
1150 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1151 r_read.seq.c_str(), r_read.qual.c_str());
1152 */
1153 got_read=left_reads_file.getRead(curr_right_obs_order, l_read,
1154 reads_format, false, begin_id, end_id,
1155 left_um_out.file, true);
1156 //assert(got_read);
1157
1158 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1159
1160 // Process hit for right singleton, select best alignments
1161 read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions,
1162 *junctions, *insertions, *deletions, *fusions, *coverage,
1163 final_report);
1164
1165 if (best_hits.hits.size() > 0)
1166 {
1167 update_junctions(best_hits, *final_junctions);
1168 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1169 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1170
1171 print_sam_for_single(*rt,
1172 best_hits,
1173 FRAG_RIGHT,
1174 r_read,
1175 bam_writer,
1176 right_um_out.file);
1177 }
1178 }
1179
1180 // Get next hit group
1181 right_hs.next_read_hits(curr_right_hit_group);
1182 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1183 }
1184
1185 // Since we have both left hits and right hits for this insert,
1186 // Find the best pairing and print both
1187 while (curr_left_obs_order == curr_right_obs_order &&
1188 curr_left_obs_order < end_id &&
1189 curr_left_obs_order != VMAXINT32)
1190 {
1191 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1192 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1193 if (curr_left_hit_group.hits.empty())
1194 { //only right read mapped
1195 //write it in the mapped file with the #MAPPED# flag
1196
1197 bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1198 reads_format, false, begin_id, end_id,
1199 right_um_out.file, false);
1200 //assert(got_read);
1201 /*
1202 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1203 r_read.seq.c_str(), r_read.qual.c_str());
1204 */
1205 HitsForRead right_best_hits;
1206 right_best_hits.insert_id = curr_right_obs_order;
1207
1208 read_best_alignments(curr_right_hit_group, right_best_hits, *gtf_junctions,
1209 *junctions, *insertions, *deletions, *fusions, *coverage,
1210 final_report);
1211
1212 if (right_best_hits.hits.size() > 0)
1213 {
1214 update_junctions(right_best_hits, *final_junctions);
1215 update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1216 update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1217
1218 print_sam_for_single(*rt,
1219 right_best_hits,
1220 FRAG_RIGHT,
1221 r_read,
1222 bam_writer,
1223 right_um_out.file);
1224 }
1225 }
1226 else if (curr_right_hit_group.hits.empty())
1227 {
1228 HitsForRead left_best_hits;
1229 left_best_hits.insert_id = curr_left_obs_order;
1230 //only left read mapped
1231 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1232 reads_format, false, begin_id, end_id,
1233 left_um_out.file, false);
1234
1235 // Process hits for left singleton, select best alignments
1236 read_best_alignments(curr_left_hit_group, left_best_hits, *gtf_junctions,
1237 *junctions, *insertions, *deletions, *fusions, *coverage,
1238 final_report);
1239
1240 if (left_best_hits.hits.size() > 0)
1241 {
1242 update_junctions(left_best_hits, *final_junctions);
1243 update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1244 update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1245
1246 print_sam_for_single(*rt,
1247 left_best_hits,
1248 FRAG_LEFT,
1249 l_read,
1250 bam_writer,
1251 left_um_out.file);
1252 }
1253 }
1254 else
1255 {
1256 vector<pair<BowtieHit, BowtieHit> > best_hits;
1257
1258 InsertAlignmentGrade grade;
1259 pair_best_alignments(curr_left_hit_group,
1260 curr_right_hit_group,
1261 grade, best_hits, *gtf_junctions,
1262 *junctions, *insertions, *deletions, *fusions, *coverage,
1263 final_report);
1264
1265 HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
1266 HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
1267
1268 for (size_t i = 0; i < best_hits.size(); ++i)
1269 {
1270 best_left_hit_group.hits.push_back(best_hits[i].first);
1271 best_right_hit_group.hits.push_back(best_hits[i].second);
1272 }
1273
1274 if (best_hits.size() > 0)
1275 {
1276 update_junctions(best_left_hit_group, *final_junctions);
1277 update_insertions_and_deletions(best_left_hit_group, *final_insertions, *final_deletions);
1278 update_fusions(best_left_hit_group, *rt, *final_fusions, *fusions);
1279
1280 update_junctions(best_right_hit_group, *final_junctions);
1281 update_insertions_and_deletions(best_right_hit_group, *final_insertions, *final_deletions);
1282 update_fusions(best_right_hit_group, *rt, *final_fusions, *fusions);
1283
1284 pair_support(best_hits, *final_fusions, *fusions);
1285
1286 print_sam_for_pair(*rt,
1287 best_hits,
1288 grade,
1289 left_reads_file,
1290 right_reads_file,
1291 bam_writer,
1292 left_um_out.file,
1293 right_um_out.file,
1294 begin_id,
1295 end_id);
1296 }
1297 }
1298
1299 left_hs.next_read_hits(curr_left_hit_group);
1300 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1301
1302 right_hs.next_read_hits(curr_right_hit_group);
1303 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1304 }
1305 } //while we still have unreported hits..
1306 //print the remaining unmapped reads at the end of each reads' stream
1307
1308 left_reads_file.getRead(VMAXINT32, l_read,
1309 reads_format, false, begin_id, end_id,
1310 left_um_out.file, false);
1311 if (right_reads_file.file())
1312 right_reads_file.getRead(VMAXINT32, r_read,
1313 reads_format, false, begin_id, end_id,
1314 right_um_out.file, false);
1315
1316
1317 // pclose (pipe close), which waits for a process to end, seems to conflict with boost::thread::join somehow,
1318 // resulting in deadlock like behavior.
1319 #if 0
1320 left_um_out.close();
1321 right_um_out.close();
1322 #endif
1323
1324 for (size_t i = 0; i < hit_factories.size(); ++i)
1325 delete hit_factories[i];
1326
1327 hit_factories.clear();
1328 }
1329
1330 string bam_output_fname;
1331 string sam_header_fname;
1332
1333 string left_reads_fname;
1334 string left_map_fname;
1335 string right_reads_fname;
1336 string right_map_fname;
1337
1338 string left_um_fname;
1339 string right_um_fname;
1340
1341 JunctionSet* gtf_junctions;
1342
1343 JunctionSet* junctions;
1344 InsertionSet* insertions;
1345 DeletionSet* deletions;
1346 FusionSet* fusions;
1347 Coverage* coverage;
1348
1349 JunctionSet* final_junctions;
1350 InsertionSet* final_insertions;
1351 DeletionSet* final_deletions;
1352 FusionSet* final_fusions;
1353
1354 RefSequenceTable* rt;
1355
1356 uint64_t begin_id;
1357 uint64_t end_id;
1358 int64_t left_reads_offset;
1359 int64_t left_map_offset;
1360 int64_t right_reads_offset;
1361 int64_t right_map_offset;
1362 };
1363
1364 void driver(const string& bam_output_fname,
1365 istream& ref_stream,
1366 const string& left_map_fname,
1367 const string& left_reads_fname,
1368 const string& right_map_fname,
1369 const string& right_reads_fname,
1370 FILE* junctions_out,
1371 FILE* insertions_out,
1372 FILE* deletions_out,
1373 FILE* fusions_out)
1374 {
1375 if (!parallel)
1376 num_threads = 1;
1377
1378 RefSequenceTable rt(sam_header, true);
1379
1380 if (fusion_search)
1381 get_seqs(ref_stream, rt, true);
1382
1383 srandom(1);
1384
1385 JunctionSet gtf_junctions;
1386 if (!gtf_juncs.empty())
1387 {
1388 char splice_buf[4096];
1389 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
1390 if (splice_coords)
1391 {
1392 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
1393 {
1394 char* nl = strrchr(splice_buf, '\n');
1395 char* buf = splice_buf;
1396 if (nl) *nl = 0;
1397
1398 char* ref_name = get_token((char**)&buf, "\t");
1399 char* scan_left_coord = get_token((char**)&buf, "\t");
1400 char* scan_right_coord = get_token((char**)&buf, "\t");
1401 char* orientation = get_token((char**)&buf, "\t");
1402
1403 if (!scan_left_coord || !scan_right_coord || !orientation)
1404 {
1405 fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
1406 gtf_juncs.c_str(), buf);
1407 exit(1);
1408 }
1409
1410 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
1411 uint32_t left_coord = atoi(scan_left_coord);
1412 uint32_t right_coord = atoi(scan_right_coord);
1413 bool antisense = *orientation == '-';
1414
1415 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
1416 }
1417 }
1418 fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
1419 }
1420
1421 vector<uint64_t> read_ids;
1422 vector<vector<int64_t> > offsets;
1423 if (num_threads > 1)
1424 {
1425 vector<string> fnames;
1426 if (right_map_fname != "")
1427 {
1428 fnames.push_back(right_reads_fname);
1429 fnames.push_back(right_map_fname);
1430 }
1431 fnames.push_back(left_reads_fname);
1432 fnames.push_back(left_map_fname);
1433 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
1434 if (!enough_data)
1435 num_threads = 1;
1436 }
1437
1438 JunctionSet vjunctions[num_threads];
1439 InsertionSet vinsertions[num_threads];
1440 DeletionSet vdeletions[num_threads];
1441 FusionSet vfusions[num_threads];
1442 Coverage vcoverages[num_threads];
1443
1444 vector<boost::thread*> threads;
1445 for (int i = 0; i < num_threads; ++i)
1446 {
1447 ConsensusEventsWorker worker;
1448
1449 worker.left_map_fname = left_map_fname;
1450 worker.right_map_fname = right_map_fname;
1451 worker.rt = &rt;
1452 worker.gtf_junctions = &gtf_junctions;
1453
1454 worker.junctions = &vjunctions[i];
1455 worker.insertions = &vinsertions[i];
1456 worker.deletions = &vdeletions[i];
1457 worker.fusions = &vfusions[i];
1458 worker.coverage = &vcoverages[i];
1459
1460 worker.right_map_offset = 0;
1461
1462 if (i == 0)
1463 {
1464 worker.begin_id = 0;
1465 worker.left_map_offset = 0;
1466 }
1467 else
1468 {
1469 size_t offsets_size = offsets[i-1].size();
1470
1471 worker.begin_id = read_ids[i-1];
1472 worker.left_map_offset = offsets[i-1].back();
1473 if (offsets_size == 4)
1474 worker.right_map_offset = offsets[i-1][1];
1475 }
1476
1477 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1478
1479 if (num_threads > 1)
1480 threads.push_back(new boost::thread(worker));
1481 else
1482 worker();
1483 }
1484
1485 for (size_t i = 0; i < threads.size(); ++i)
1486 {
1487 threads[i]->join();
1488 delete threads[i];
1489 threads[i] = NULL;
1490 }
1491 threads.clear();
1492
1493 JunctionSet& junctions = vjunctions[0];
1494 InsertionSet& insertions = vinsertions[0];
1495 DeletionSet& deletions = vdeletions[0];
1496 FusionSet& fusions = vfusions[0];
1497 Coverage& coverage = vcoverages[0];
1498 for (size_t i = 1; i < num_threads; ++i)
1499 {
1500 merge_with(junctions, vjunctions[i]);
1501 vjunctions[i].clear();
1502
1503 merge_with(insertions, vinsertions[i]);
1504 vinsertions[i].clear();
1505
1506 merge_with(deletions, vdeletions[i]);
1507 vdeletions[i].clear();
1508
1509 merge_with(fusions, vfusions[i]);
1510 vfusions[i].clear();
1511
1512 coverage.merge_with(vcoverages[i]);
1513 vcoverages[i].clear();
1514 }
1515
1516 coverage.calculate_coverage();
1517
1518 size_t num_unfiltered_juncs = junctions.size();
1519 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
1520
1521 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1522 filter_junctions(junctions, gtf_junctions);
1523
1524 //size_t num_juncs_after_filter = junctions.size();
1525 //fprintf(stderr, "Filtered %lu junctions\n",
1526 // num_unfiltered_juncs - num_juncs_after_filter);
1527
1528 /*
1529 size_t small_overhangs = 0;
1530 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1531 {
1532 if (i->second.accepted &&
1533 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1534 {
1535 small_overhangs++;
1536 }
1537 }
1538
1539 if (small_overhangs >0)
1540 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1541 */
1542
1543 JunctionSet vfinal_junctions[num_threads];
1544 InsertionSet vfinal_insertions[num_threads];
1545 DeletionSet vfinal_deletions[num_threads];
1546 FusionSet vfinal_fusions[num_threads];
1547
1548 for (int i = 0; i < num_threads; ++i)
1549 {
1550 ReportWorker worker;
1551
1552 char filename[1024] = {0};
1553 sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
1554 worker.bam_output_fname = filename;
1555 worker.sam_header_fname = sam_header;
1556 sprintf(filename, "%s/unmapped_left%d.fq", output_dir.c_str(), i);
1557 worker.left_um_fname = filename;
1558
1559 if (right_reads_fname != "")
1560 {
1561 sprintf(filename, "%s/unmapped_right%d.fq", output_dir.c_str(), i);
1562 worker.right_um_fname = filename;
1563 }
1564
1565 worker.left_reads_fname = left_reads_fname;
1566 worker.left_map_fname = left_map_fname;
1567 worker.right_reads_fname = right_reads_fname;
1568 worker.right_map_fname = right_map_fname;
1569
1570 worker.gtf_junctions = &gtf_junctions;
1571 worker.junctions = &junctions;
1572 worker.insertions = &insertions;
1573 worker.deletions = &deletions;
1574 worker.fusions = &fusions;
1575 worker.coverage = &coverage;
1576
1577 worker.final_junctions = &vfinal_junctions[i];
1578 worker.final_insertions = &vfinal_insertions[i];
1579 worker.final_deletions = &vfinal_deletions[i];
1580 worker.final_fusions = &vfinal_fusions[i];
1581 worker.rt = &rt;
1582
1583 worker.right_reads_offset = 0;
1584 worker.right_map_offset = 0;
1585
1586 if (i == 0)
1587 {
1588 worker.begin_id = 0;
1589 worker.left_reads_offset = 0;
1590 worker.left_map_offset = 0;
1591 }
1592 else
1593 {
1594 size_t offsets_size = offsets[i-1].size();
1595
1596 worker.begin_id = read_ids[i-1];
1597 worker.left_reads_offset = offsets[i-1][offsets_size - 2];
1598 worker.left_map_offset = offsets[i-1].back();
1599 if (offsets_size == 4)
1600 {
1601 worker.right_reads_offset = offsets[i-1][0];
1602 worker.right_map_offset = offsets[i-1][1];
1603 }
1604 }
1605
1606 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1607
1608 if (num_threads > 1)
1609 threads.push_back(new boost::thread(worker));
1610 else
1611 worker();
1612 }
1613
1614 for (size_t i = 0; i < threads.size(); ++i)
1615 {
1616 threads[i]->join();
1617 delete threads[i];
1618 threads[i] = NULL;
1619 }
1620 threads.clear();
1621
1622 JunctionSet final_junctions = vfinal_junctions[0];
1623 InsertionSet final_insertions = vfinal_insertions[0];
1624 DeletionSet final_deletions = vfinal_deletions[0];
1625 FusionSet final_fusions = vfinal_fusions[0];
1626 for (size_t i = 1; i < num_threads; ++i)
1627 {
1628 merge_with(final_junctions, vfinal_junctions[i]);
1629 vfinal_junctions[i].clear();
1630
1631 merge_with(final_insertions, vfinal_insertions[i]);
1632 vfinal_insertions[i].clear();
1633
1634 merge_with(final_deletions, vfinal_deletions[i]);
1635 vfinal_deletions[i].clear();
1636
1637 merge_with(final_fusions, vfinal_fusions[i]);
1638 vfinal_fusions[i].clear();
1639 }
1640
1641 fprintf (stderr, "Reporting final accepted alignments...");
1642 fprintf (stderr, "done.\n");
1643
1644 //small_overhangs = 0;
1645 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1646 {
1647 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
1648 {
1649 final_junctions.erase(i++);
1650 }
1651 else
1652 {
1653 ++i;
1654 }
1655 }
1656
1657 // if (small_overhangs > 0)
1658 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1659
1660 fprintf (stderr, "Printing junction BED track...");
1661 print_junctions(junctions_out, final_junctions, rt);
1662 fprintf (stderr, "done\n");
1663
1664 fprintf (stderr, "Printing insertions...");
1665 print_insertions(insertions_out, final_insertions,rt);
1666 fclose(insertions_out);
1667 fprintf (stderr, "done\n");
1668
1669 fprintf (stderr, "Printing deletions...");
1670 print_deletions(deletions_out, final_deletions, rt);
1671 fclose(deletions_out);
1672 fprintf (stderr, "done\n");
1673
1674 if (fusion_search)
1675 {
1676 fprintf (stderr, "Printing fusions...");
1677 print_fusions(fusions_out, final_fusions, rt);
1678 fclose(fusions_out);
1679 fprintf (stderr, "done\n");
1680 }
1681
1682 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1683 }
1684
1685 void print_usage()
1686 {
1687 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1688
1689 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1690 }
1691
1692 int main(int argc, char** argv)
1693 {
1694 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1695 fprintf(stderr, "---------------------------------------\n");
1696
1697 reads_format = FASTQ;
1698
1699 int parse_ret = parse_options(argc, argv, print_usage);
1700 if (parse_ret)
1701 return parse_ret;
1702
1703 if(optind >= argc)
1704 {
1705 print_usage();
1706 return 1;
1707 }
1708
1709 string ref_file_name = argv[optind++];
1710
1711 if(optind >= argc)
1712 {
1713 print_usage();
1714 return 1;
1715 }
1716
1717 string junctions_file_name = argv[optind++];
1718
1719 if(optind >= argc)
1720 {
1721 print_usage();
1722 return 1;
1723 }
1724
1725 string insertions_file_name = argv[optind++];
1726 if(optind >= argc)
1727 {
1728 print_usage();
1729 return 1;
1730 }
1731
1732 string deletions_file_name = argv[optind++];
1733 if(optind >= argc)
1734 {
1735 print_usage();
1736 return 1;
1737 }
1738
1739 string fusions_file_name = argv[optind++];
1740 if(optind >= argc)
1741 {
1742 print_usage();
1743 return 1;
1744 }
1745
1746 string accepted_hits_file_name = argv[optind++];
1747 if(optind >= argc)
1748 {
1749 print_usage();
1750 return 1;
1751 }
1752
1753 string left_map_filename = argv[optind++];
1754 if(optind >= argc)
1755 {
1756 print_usage();
1757 return 1;
1758 }
1759
1760 string left_reads_filename = argv[optind++];
1761 string unzcmd=getUnpackCmd(left_reads_filename, false);
1762
1763 string right_map_filename;
1764 string right_reads_filename;
1765
1766 if (optind < argc)
1767 {
1768 right_map_filename = argv[optind++];
1769 if(optind >= argc) {
1770 print_usage();
1771 return 1;
1772 }
1773 right_reads_filename=argv[optind++];
1774 }
1775
1776 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1777 if (!ref_stream.good())
1778 {
1779 fprintf(stderr, "Error: cannot open %s for reading\n",
1780 ref_file_name.c_str());
1781 exit(1);
1782 }
1783
1784 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1785 if (junctions_file == NULL)
1786 {
1787 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1788 junctions_file_name.c_str());
1789 exit(1);
1790 }
1791
1792 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1793 if (insertions_file == NULL)
1794 {
1795 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1796 insertions_file_name.c_str());
1797 exit(1);
1798 }
1799
1800 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1801 if (deletions_file == NULL)
1802 {
1803 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1804 deletions_file_name.c_str());
1805 exit(1);
1806 }
1807
1808 FILE* fusions_file = NULL;
1809 if (fusion_search)
1810 {
1811 fusions_file = fopen(fusions_file_name.c_str(), "w");
1812 if (fusions_file == NULL)
1813 {
1814 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1815 fusions_file_name.c_str());
1816 exit(1);
1817 }
1818 }
1819
1820 driver(accepted_hits_file_name,
1821 ref_stream,
1822 left_map_filename,
1823 left_reads_filename,
1824 right_map_filename,
1825 right_reads_filename,
1826 junctions_file,
1827 insertions_file,
1828 deletions_file,
1829 fusions_file);
1830
1831 return 0;
1832 }