ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (12 years, 8 months ago) by gpertea
File size: 43917 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include "common.h"
34 #include "bwt_map.h"
35 #include "junctions.h"
36 #include "insertions.h"
37 #include "deletions.h"
38 #include "align_status.h"
39 #include "fragments.h"
40 #include "wiggles.h"
41 #include "tokenize.h"
42 #include "reads.h"
43
44 #include "inserts.h"
45
46 using namespace std;
47 using namespace seqan;
48 using std::set;
49
50 void read_best_alignments(const HitsForRead& hits_for_read,
51 FragmentAlignmentGrade& best_grade,
52 HitsForRead& best_hits,
53 const JunctionSet& gtf_junctions)
54 {
55 const vector<BowtieHit>& hits = hits_for_read.hits;
56 for (size_t i = 0; i < hits.size(); ++i)
57 {
58 if (hits[i].edit_dist()>max_read_mismatches) continue;
59 FragmentAlignmentGrade g(hits[i], gtf_junctions);
60 // Is the new status better than the current best one?
61 if (best_grade < g)
62 {
63 best_hits.hits.clear();
64 best_grade = g;
65 best_hits.hits.push_back(hits[i]);
66 }
67 else if (!(g < best_grade)) // is it just as good?
68 {
69 best_grade.num_alignments++;
70 best_hits.hits.push_back(hits[i]);
71 }
72 }
73 }
74
75 void pair_best_alignments(const HitsForRead& left_hits,
76 const HitsForRead& right_hits,
77 InsertAlignmentGrade& best_grade,
78 HitsForRead& left_best_hits,
79 HitsForRead& right_best_hits)
80 {
81 // max mate inner distance (genomic)
82 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
83 if (max_mate_inner_dist == -1)
84 {
85 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
86 }
87
88 const vector<BowtieHit>& left = left_hits.hits;
89 const vector<BowtieHit>& right = right_hits.hits;
90
91 for (size_t i = 0; i < left.size(); ++i)
92 {
93 if (left[i].edit_dist()>max_read_mismatches) continue;
94
95 const BowtieHit& lh = left[i];
96 for (size_t j = 0; j < right.size(); ++j)
97 {
98 const BowtieHit& rh = right[j];
99
100 if (lh.ref_id() != rh.ref_id())
101 continue;
102 if (right[j].edit_dist()>max_read_mismatches) continue;
103 InsertAlignmentGrade g(lh, rh, min_mate_inner_dist, max_mate_inner_dist);
104
105 // Is the new status better than the current best one?
106 if (best_grade < g)
107 {
108 left_best_hits.hits.clear();
109 right_best_hits.hits.clear();
110
111 best_grade = g;
112 left_best_hits.hits.push_back(lh);
113 right_best_hits.hits.push_back(rh);
114 }
115 else if (!(g < best_grade))
116 {
117 left_best_hits.hits.push_back(lh);
118 right_best_hits.hits.push_back(rh);
119 best_grade.num_alignments++;
120 }
121 }
122 }
123 }
124
125 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
126
127 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
128 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
129 int num_hits, const BowtieHit* next_hit, int hitIndex) {
130
131 if (sam_toks.size()>11) {
132 for (size_t i=11;i<sam_toks.size();++i) {
133 if (sam_toks[i].find("NH:i:")==string::npos)
134 auxdata.push_back(sam_toks[i]);
135 }
136 }
137 string aux("NH:i:");
138 str_appendInt(aux, num_hits);
139 auxdata.push_back(aux);
140 if (next_hit) {
141 const char* nh_ref_name = "=";
142 nh_ref_name = rt.get_name(next_hit->ref_id());
143 assert (nh_ref_name != NULL);
144 bool same_contig=(next_hit->ref_id()==bh.ref_id());
145 aux="CC:Z:";
146 aux+= (same_contig ? "=" : nh_ref_name);
147 auxdata.push_back(aux);
148 aux="CP:i:";
149 int nh_gpos=next_hit->left() + 1;
150 str_appendInt(aux, nh_gpos);
151 auxdata.push_back(aux);
152 } //has next_hit
153 // FIXME: this code is still a bit brittle, because it contains no
154 // consistency check that the mates are on opposite strands - a current protocol
155 // requirement, and that the strand indicated by the alignment is consistent
156 // with the orientation of the splices (though that should be handled upstream).
157 const string xs_f("XS:A:+");
158 const string xs_r("XS:A:-");
159 if (bh.contiguous()) {
160 if (library_type == FR_FIRSTSTRAND) {
161 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
162 if (bh.antisense_align())
163 auxdata.push_back(xs_f);
164 else
165 auxdata.push_back(xs_r);
166 }
167 else {
168 if (bh.antisense_align())
169 auxdata.push_back(xs_r);
170 else
171 auxdata.push_back(xs_f);
172 }
173 }
174 else if (library_type == FR_SECONDSTRAND) {
175 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
176 if (bh.antisense_align())
177 auxdata.push_back(xs_r);
178 else
179 auxdata.push_back(xs_f);
180 }
181 else
182 {
183 if (bh.antisense_align())
184 auxdata.push_back(xs_f);
185 else
186 auxdata.push_back(xs_r);
187 }
188 }
189 } //bh.contiguous()
190 if (hitIndex >= 0)
191 {
192 string aux("HI:i:");
193 str_appendInt(aux, hitIndex);
194 auxdata.push_back(aux);
195 }
196 }
197
198 bool rewrite_sam_record(GBamWriter& bam_writer, const RefSequenceTable& rt,
199 const BowtieHit& bh,
200 const char* bwt_buf,
201 const char* read_alt_name,
202 const FragmentAlignmentGrade& grade,
203 FragmentType insert_side,
204 int num_hits,
205 const BowtieHit* next_hit,
206 bool primary,
207 int hitIndex)
208 {
209 // Rewrite this hit, filling in the alt name, mate mapping
210 // and setting the pair flag
211 vector<string> sam_toks;
212 tokenize(bwt_buf, "\t", sam_toks);
213 string qname(read_alt_name);
214 size_t slash_pos=qname.rfind('/');
215 if (slash_pos!=string::npos)
216 qname.resize(slash_pos);
217 //read_alt_name as QNAME
218 int flag=atoi(sam_toks[1].c_str()); //FLAG
219 if (insert_side != FRAG_UNPAIRED) {
220 //flag = atoi(sam_toks[1].c_str());
221 // mark this as a singleton mate
222 flag |= 0x0001;
223 if (insert_side == FRAG_LEFT)
224 flag |= 0x0040;
225 else if (insert_side == FRAG_RIGHT)
226 flag |= 0x0080;
227 flag |= 0x0008;
228 //char flag_buf[64];
229 //sprintf(flag_buf, "%d", flag);
230 //sam_toks[t] = flag_buf;
231 }
232 if (!primary)
233 flag |= 0x100;
234
235 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
236 int mapQ=255;
237 if (grade.num_alignments > 1) {
238 double err_prob = 1 - (1.0 / grade.num_alignments);
239 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
240 }
241 vector<string> auxdata;
242 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
243 int tlen =atoi(sam_toks[8].c_str()); //TLEN
244 int mate_pos=atoi(sam_toks[7].c_str());
245 GBamRecord* bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
246 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
247 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
248 bam_writer.write(bamrec);
249 delete bamrec;
250 return true;
251 }
252
253 bool rewrite_sam_record(GBamWriter& bam_writer, const RefSequenceTable& rt,
254 const BowtieHit& bh,
255 const char* bwt_buf,
256 const char* read_alt_name,
257 const InsertAlignmentGrade& grade,
258 FragmentType insert_side,
259 const BowtieHit* partner,
260 int num_hits,
261 const BowtieHit* next_hit,
262 bool primary,
263 int hitIndex)
264 {
265 // Rewrite this hit, filling in the alt name, mate mapping
266 // and setting the pair flag
267 vector<string> sam_toks;
268 tokenize(bwt_buf, "\t", sam_toks);
269 string qname(read_alt_name);
270 size_t slash_pos=qname.rfind('/');
271 if (slash_pos!=string::npos)
272 qname.resize(slash_pos);
273 //read_alt_name as QNAME
274 int flag = atoi(sam_toks[1].c_str());
275 // 0x0010 (strand of query) is assumed to be set correctly
276 // to begin with
277 flag |= 0x0001; //it must be paired
278 if (insert_side == FRAG_LEFT)
279 flag |= 0x0040;
280 else if (insert_side == FRAG_RIGHT)
281 flag |= 0x0080;
282 if (!primary)
283 flag |= 0x100;
284
285 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
286 int mapQ=255;
287 if (grade.num_alignments > 1) {
288 double err_prob = 1 - (1.0 / grade.num_alignments);
289 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
290 }
291 int tlen=0; //TLEN
292 int mate_pos=atoi(sam_toks[7].c_str());
293 if (partner) {
294 if (partner->ref_id()==bh.ref_id()) {
295 sam_toks[6] = "="; //same chromosome
296 //TLEN:
297 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
298 partner->left() - bh.right();
299 }
300
301 else { //partner on different chromosome/contig
302 sam_toks[6] = rt.get_name(partner->ref_id());
303 if (sam_toks[6].empty()) {
304 //FIXME -- this should never happen
305 sam_toks[6] = "=";
306 fprintf(stderr, "Warning: partner ref_id %d has no entry in ref table?\n", partner->ref_id());
307 }
308 }
309 mate_pos = partner->left() + 1;
310 if (grade.happy())
311 flag |= 0x0002;
312 if (partner->antisense_align())
313 flag |= 0x0020;
314 }
315 else {
316 sam_toks[6] = "*";
317 mate_pos = 0;
318 flag |= 0x0008;
319 }
320 vector<string> auxdata;
321 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
322 GBamRecord* bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
323 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
324 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
325 bam_writer.write(bamrec);
326 delete bamrec;
327 return true;
328 }
329
330 struct lex_hit_sort
331 {
332 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
333 : _rt(rt), _hits(hits)
334 {}
335
336 bool operator()(const uint32_t& l, const uint32_t& r) const
337 {
338 const BowtieHit& lhs = _hits.hits[l];
339 const BowtieHit& rhs = _hits.hits[r];
340
341 uint32_t l_id = lhs.ref_id();
342 uint32_t r_id = rhs.ref_id();
343 if (l_id != r_id)
344 {
345 return (strcmp(_rt.get_name(lhs.ref_id()), _rt.get_name(rhs.ref_id())) < 0);
346 }
347 return lhs.left() < rhs.left();
348 }
349
350 const RefSequenceTable& _rt;
351 const HitsForRead& _hits;
352 };
353
354 void print_sam_for_single(const RefSequenceTable& rt,
355 const HitsForRead& hits,
356 const FragmentAlignmentGrade& grade,
357 FragmentType frag_type,
358 //FILE* reads_file,
359 Read& read,
360 GBamWriter& bam_writer,
361 FILE* um_out//write unmapped reads here
362 )
363 {
364 assert(!read.alt_name.empty());
365 lex_hit_sort s(rt, hits);
366 vector<uint32_t> index_vector;
367 for (size_t i = 0; i < hits.hits.size(); ++i)
368 index_vector.push_back(i);
369
370 sort(index_vector.begin(), index_vector.end(), s);
371 size_t primaryHit = (hits.hits.empty()? 0: random() % hits.hits.size());
372 bool multipleHits = (hits.hits.size() > 1);
373 for (size_t i = 0; i < hits.hits.size(); ++i)
374 {
375 bool primary = (i == primaryHit);
376 size_t index = index_vector[i];
377 const BowtieHit& bh = hits.hits[index];
378 rewrite_sam_record(bam_writer, rt,
379 bh,
380 bh.hitfile_rec().c_str(),
381 read.alt_name.c_str(),
382 grade,
383 frag_type,
384 hits.hits.size(),
385 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
386 primary,
387 (multipleHits? i: -1));
388 }
389 }
390
391 void print_sam_for_pair(const RefSequenceTable& rt,
392 const HitsForRead& left_hits,
393 const HitsForRead& right_hits,
394 const InsertAlignmentGrade& grade,
395 FILE* left_reads_file,
396 FILE* right_reads_file,
397 GBamWriter& bam_writer,
398 FILE* left_um_out,
399 FILE* right_um_out
400 )
401 {
402 assert (left_hits.insert_id == right_hits.insert_id);
403
404 Read left_read;
405 Read right_read;
406 assert (left_hits.hits.size() == right_hits.hits.size() ||
407 (left_hits.hits.empty() || right_hits.hits.empty()));
408
409 size_t primaryHit = 0;
410 vector<uint32_t> index_vector;
411 if(right_hits.hits.size() > 0)
412 {
413 lex_hit_sort s(rt, right_hits);
414 for (size_t i = 0; i < right_hits.hits.size(); ++i)
415 index_vector.push_back(i);
416
417 sort(index_vector.begin(), index_vector.end(), s);
418 primaryHit = random() % right_hits.hits.size();
419 }
420 else if (left_hits.hits.size() > 0)
421 {
422 lex_hit_sort s(rt, left_hits);
423 for (size_t i = 0; i < left_hits.hits.size(); ++i)
424 index_vector.push_back(i);
425
426 sort(index_vector.begin(), index_vector.end(), s);
427 primaryHit = random() % left_hits.hits.size();
428 }
429
430 bool got_left_read = get_read_from_stream(left_hits.insert_id,
431 left_reads_file,
432 reads_format,
433 false,
434 left_read, left_um_out);
435
436 bool got_right_read = get_read_from_stream(right_hits.insert_id,
437 right_reads_file,
438 reads_format,
439 false,
440 right_read, right_um_out);
441
442 if (left_hits.hits.size() == right_hits.hits.size())
443 {
444 assert (got_left_read && got_right_read);
445 bool multipleHits = (left_hits.hits.size() > 1);
446 for (size_t i = 0; i < right_hits.hits.size(); ++i)
447 {
448 bool primary = (i == primaryHit);
449 size_t index = index_vector[i];
450 const BowtieHit& right_bh = right_hits.hits[index];
451 const BowtieHit& left_bh = left_hits.hits[index];
452
453 rewrite_sam_record(bam_writer, rt,
454 right_bh,
455 right_bh.hitfile_rec().c_str(),
456 right_read.alt_name.c_str(),
457 grade,
458 FRAG_RIGHT,
459 &left_bh,
460 right_hits.hits.size(),
461 (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
462 primary,
463 (multipleHits? i: -1));
464 rewrite_sam_record(bam_writer, rt,
465 left_bh,
466 left_bh.hitfile_rec().c_str(),
467 left_read.alt_name.c_str(),
468 grade,
469 FRAG_LEFT,
470 &right_bh,
471 left_hits.hits.size(),
472 (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
473 primary,
474 (multipleHits? i: -1));
475 }
476 }
477 else if (left_hits.hits.empty())
478 { //only right read was mapped properly
479 if (right_um_out) {
480 //write it in the mapped file with the #MAPPED# flag
481 fprintf(right_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", right_read.alt_name.c_str(),
482 right_read.seq.c_str(), right_read.qual.c_str());
483 }
484 for (size_t i = 0; i < right_hits.hits.size(); ++i)
485 {
486 bool primary = (i == primaryHit);
487 size_t index = index_vector[i];
488 const BowtieHit& bh = right_hits.hits[index];
489
490 rewrite_sam_record(bam_writer, rt,
491 bh,
492 bh.hitfile_rec().c_str(),
493 right_read.alt_name.c_str(),
494 grade,
495 FRAG_RIGHT,
496 NULL,
497 right_hits.hits.size(),
498 (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
499 primary,
500 -1);
501
502 }
503 }
504 else if (right_hits.hits.empty())
505 { //only left read was mapped properly
506 if (left_um_out) {
507 //write it in the mapped file with the #MAPPED# flag
508 fprintf(left_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", left_read.alt_name.c_str(), left_read.seq.c_str(),
509 left_read.qual.c_str());
510 }
511
512 for (size_t i = 0; i < left_hits.hits.size(); ++i)
513 {
514 bool primary = (i == primaryHit);
515 size_t index = index_vector[i];
516 const BowtieHit& bh = left_hits.hits[index];
517 rewrite_sam_record(bam_writer, rt,
518 bh,
519 bh.hitfile_rec().c_str(),
520 left_read.alt_name.c_str(),
521 grade,
522 FRAG_LEFT,
523 NULL,
524 left_hits.hits.size(),
525 (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
526 primary,
527 -1);
528
529 }
530 }
531 else
532 {
533 assert (false);
534 }
535 }
536
537 /**
538 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
539 * @param hits hits The alignments for a particular read
540 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
541 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
542 */
543 void update_insertions_and_deletions(const HitsForRead& hits,
544 InsertionSet& insertions,
545 DeletionSet& deletions)
546 {
547 for (size_t i = 0; i < hits.hits.size(); ++i)
548 {
549 const BowtieHit& bh = hits.hits[i];
550 insertions_from_alignment(bh, insertions);
551 deletions_from_alignment(bh, deletions);
552 }
553 }
554
555
556
557 void update_junctions(const HitsForRead& hits,
558 JunctionSet& junctions)
559 {
560 for (size_t i = 0; i < hits.hits.size(); ++i)
561 {
562 const BowtieHit& bh = hits.hits[i];
563 junctions_from_alignment(bh, junctions);
564 }
565 }
566
567 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
568 // resets the stream when finished.
569 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions,
570 HitsForRead& hits)
571 {
572 HitsForRead remaining;
573 remaining.insert_id = hits.insert_id;
574
575 for (size_t i = 0; i < hits.hits.size(); ++i)
576 {
577 BowtieHit& bh = hits.hits[i];
578 bool filter_hit = false;
579 if (!bh.contiguous())
580 {
581 JunctionSet bh_juncs;
582 junctions_from_alignment(bh, bh_juncs);
583 for (JunctionSet::iterator itr = bh_juncs.begin();
584 itr != bh_juncs.end();
585 itr++)
586 {
587 const Junction& j = itr->first;
588 JunctionSet::const_iterator target = junctions.find(j);
589 if (target == junctions.end() || !target->second.accepted)
590 {
591 filter_hit = true;
592 break;
593 }
594 }
595 }
596 if (!filter_hit)
597 {
598 remaining.hits.push_back(bh);
599 }
600 else
601 {
602 //fprintf(stderr, "Filtering hit\n");
603 }
604 }
605 hits = remaining;
606 }
607
608 void get_junctions_from_best_hits(HitStream& left_hs,
609 HitStream& right_hs,
610 ReadTable& it,
611 JunctionSet& junctions,
612 const JunctionSet& gtf_junctions)
613 {
614 HitsForRead curr_left_hit_group;
615 HitsForRead curr_right_hit_group;
616
617 left_hs.next_read_hits(curr_left_hit_group);
618 right_hs.next_read_hits(curr_right_hit_group);
619
620 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
621 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
622
623 // While we still have unreported hits...
624 while(curr_left_obs_order != VMAXINT32 ||
625 curr_right_obs_order != VMAXINT32)
626 {
627 // Chew up left singletons
628 while (curr_left_obs_order < curr_right_obs_order &&
629 curr_left_obs_order != VMAXINT32)
630 {
631 HitsForRead best_hits;
632 best_hits.insert_id = curr_left_obs_order;
633 FragmentAlignmentGrade grade;
634
635 // Process hits for left singleton, select best alignments
636 read_best_alignments(curr_left_hit_group, grade, best_hits, gtf_junctions);
637 update_junctions(best_hits, junctions);
638
639 // Get next hit group
640 left_hs.next_read_hits(curr_left_hit_group);
641 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
642 }
643
644 // Chew up right singletons
645 while (curr_left_obs_order > curr_right_obs_order &&
646 curr_right_obs_order != VMAXINT32)
647 {
648 HitsForRead best_hits;
649 best_hits.insert_id = curr_right_obs_order;
650 FragmentAlignmentGrade grade;
651
652 // Process hit for right singleton, select best alignments
653 read_best_alignments(curr_right_hit_group,grade, best_hits, gtf_junctions);
654 update_junctions(best_hits, junctions);
655
656 // Get next hit group
657 right_hs.next_read_hits(curr_right_hit_group);
658 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
659 }
660
661 // Since we have both left hits and right hits for this insert,
662 // Find the best pairing and print both
663 while (curr_left_obs_order == curr_right_obs_order &&
664 curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
665 {
666 if (curr_left_hit_group.hits.empty())
667 {
668 HitsForRead right_best_hits;
669 right_best_hits.insert_id = curr_right_obs_order;
670
671 FragmentAlignmentGrade grade;
672 read_best_alignments(curr_right_hit_group, grade, right_best_hits, gtf_junctions);
673 update_junctions(right_best_hits, junctions);
674 }
675 else if (curr_right_hit_group.hits.empty())
676 {
677 HitsForRead left_best_hits;
678 left_best_hits.insert_id = curr_left_obs_order;
679
680 FragmentAlignmentGrade grade;
681 // Process hits for left singleton, select best alignments
682 read_best_alignments(curr_left_hit_group, grade, left_best_hits, gtf_junctions);
683 update_junctions(left_best_hits, junctions);
684 }
685 else
686 {
687 HitsForRead left_best_hits;
688 HitsForRead right_best_hits;
689 left_best_hits.insert_id = curr_left_obs_order;
690 right_best_hits.insert_id = curr_right_obs_order;
691
692 // daehwan - apply gtf_junctions here, too!
693
694 InsertAlignmentGrade grade;
695 pair_best_alignments(curr_left_hit_group,
696 curr_right_hit_group,
697 grade,
698 left_best_hits,
699 right_best_hits);
700
701 update_junctions(left_best_hits, junctions);
702 update_junctions(right_best_hits, junctions);
703 }
704
705 left_hs.next_read_hits(curr_left_hit_group);
706 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
707
708 right_hs.next_read_hits(curr_right_hit_group);
709 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
710 }
711 }
712 left_hs.reset();
713 right_hs.reset();
714 }
715
716
717 void driver(GBamWriter& bam_writer,
718 string& left_map_fname,
719 FILE* left_reads,
720 string& right_map_fname,
721 FILE* right_reads,
722 FILE* junctions_out,
723 FILE* insertions_out,
724 FILE* deletions_out,
725 FILE* left_um_out,
726 FILE* right_um_out
727 )
728 {
729 ReadTable it;
730 RefSequenceTable rt(sam_header, true);
731 srandom(1);
732 JunctionSet gtf_junctions;
733 if (!gtf_juncs.empty())
734 {
735 char splice_buf[2048];
736 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
737 if (splice_coords)
738 {
739 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
740 {
741 char* nl = strrchr(splice_buf, '\n');
742 char* buf = splice_buf;
743 if (nl) *nl = 0;
744
745 char* ref_name = get_token((char**)&buf, "\t");
746 char* scan_left_coord = get_token((char**)&buf, "\t");
747 char* scan_right_coord = get_token((char**)&buf, "\t");
748 char* orientation = get_token((char**)&buf, "\t");
749
750 if (!scan_left_coord || !scan_right_coord || !orientation)
751 {
752 fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
753 gtf_juncs.c_str(), buf);
754 exit(1);
755 }
756
757 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
758 uint32_t left_coord = atoi(scan_left_coord);
759 uint32_t right_coord = atoi(scan_right_coord);
760 bool antisense = *orientation == '-';
761
762 // add 1 to left_coord to meet BED format
763 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats()));
764 }
765 }
766 fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
767 }
768
769 BAMHitFactory hit_factory(it,rt);
770 JunctionSet junctions;
771 {
772 HitStream l_hs(left_map_fname, &hit_factory, false, true, true, true);
773 HitStream r_hs(right_map_fname, &hit_factory, false, true, true, true);
774 get_junctions_from_best_hits(l_hs, r_hs, it, junctions, gtf_junctions);
775 //this resets the streams
776 }
777
778 HitStream left_hs(left_map_fname, &hit_factory, false, true, true, true);
779 HitStream right_hs(right_map_fname, &hit_factory, false, true, true, true);
780
781 size_t num_unfiltered_juncs = junctions.size();
782 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
783
784 HitsForRead curr_left_hit_group;
785 HitsForRead curr_right_hit_group;
786
787 left_hs.next_read_hits(curr_left_hit_group);
788 right_hs.next_read_hits(curr_right_hit_group);
789
790 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
791 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
792
793 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
794 filter_junctions(junctions, gtf_junctions);
795 //size_t num_juncs_after_filter = junctions.size();
796 //fprintf(stderr, "Filtered %lu junctions\n",
797 // num_unfiltered_juncs - num_juncs_after_filter);
798
799 /*
800 size_t small_overhangs = 0;
801 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
802 {
803 if (i->second.accepted &&
804 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
805 {
806 small_overhangs++;
807 }
808 }
809
810 if (small_overhangs >0)
811 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
812 */
813 JunctionSet final_junctions; // the junctions formed from best hits
814 InsertionSet final_insertions;
815 DeletionSet final_deletions;
816
817 fprintf (stderr, "Reporting final accepted alignments...");
818 // While we still have unreported hits...
819 Read l_read;
820 Read r_read;
821 while(curr_left_obs_order != VMAXINT32 ||
822 curr_right_obs_order != VMAXINT32)
823 {
824 // Chew up left singletons (pairs with right reads unmapped)
825 while (curr_left_obs_order < curr_right_obs_order &&
826 curr_left_obs_order != VMAXINT32)
827 {
828 HitsForRead best_hits;
829 best_hits.insert_id = curr_left_obs_order;
830 FragmentAlignmentGrade grade;
831 bool got_read=get_read_from_stream(curr_left_obs_order,
832 left_reads,reads_format, false, l_read, left_um_out);
833 assert(got_read);
834 if (right_reads) {
835 fprintf(left_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
836 l_read.seq.c_str(), l_read.qual.c_str());
837 got_read=get_read_from_stream(curr_left_obs_order,
838 right_reads,reads_format, false,
839 r_read, right_um_out, true);
840 assert(got_read);
841 }
842 exclude_hits_on_filtered_junctions(junctions, curr_left_hit_group);
843
844 // Process hits for left singleton, select best alignments
845 read_best_alignments(curr_left_hit_group, grade, best_hits, gtf_junctions);
846 if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
847 {
848 update_junctions(best_hits, final_junctions);
849 update_insertions_and_deletions(best_hits, final_insertions, final_deletions);
850
851 print_sam_for_single(rt,
852 best_hits,
853 grade,
854 (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
855 l_read,
856 bam_writer,
857 left_um_out);
858 }
859
860 // Get next hit group
861 left_hs.next_read_hits(curr_left_hit_group);
862 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
863 } //left singletons
864
865 // Chew up right singletons
866 while (curr_left_obs_order > curr_right_obs_order &&
867 curr_right_obs_order != VMAXINT32)
868 {
869 HitsForRead best_hits;
870 best_hits.insert_id = curr_right_obs_order;
871 FragmentAlignmentGrade grade;
872
873 bool got_read=get_read_from_stream(curr_right_obs_order,
874 right_reads,reads_format, false, r_read, right_um_out);
875 assert(got_read);
876 fprintf(right_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
877 r_read.seq.c_str(), r_read.qual.c_str());
878 got_read=get_read_from_stream(curr_right_obs_order,
879 left_reads,reads_format, false,
880 l_read, left_um_out, true);
881 assert(got_read);
882
883 exclude_hits_on_filtered_junctions(junctions, curr_right_hit_group);
884
885 // Process hit for right singleton, select best alignments
886 read_best_alignments(curr_right_hit_group, grade, best_hits, gtf_junctions);
887
888 if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
889 {
890 update_junctions(best_hits, final_junctions);
891 update_insertions_and_deletions(best_hits, final_insertions, final_deletions);
892
893 print_sam_for_single(rt,
894 best_hits,
895 grade,
896 FRAG_RIGHT,
897 r_read,
898 bam_writer,
899 right_um_out);
900 }
901
902 // Get next hit group
903 right_hs.next_read_hits(curr_right_hit_group);
904 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
905 }
906
907 // Since we have both left hits and right hits for this insert,
908 // Find the best pairing and print both
909 while (curr_left_obs_order == curr_right_obs_order &&
910 curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
911 {
912 exclude_hits_on_filtered_junctions(junctions, curr_left_hit_group);
913 exclude_hits_on_filtered_junctions(junctions, curr_right_hit_group);
914
915 if (curr_left_hit_group.hits.empty())
916 { //only right read mapped
917 //write it in the mapped file with the #MAPPED# flag
918
919 bool got_read=get_read_from_stream(curr_left_obs_order,
920 right_reads,reads_format, false, r_read, right_um_out);
921 assert(got_read);
922 fprintf(right_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
923 r_read.seq.c_str(), r_read.qual.c_str());
924 HitsForRead right_best_hits;
925 right_best_hits.insert_id = curr_right_obs_order;
926
927 FragmentAlignmentGrade grade;
928 read_best_alignments(curr_right_hit_group, grade, right_best_hits, gtf_junctions);
929
930 if (right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
931 {
932 update_junctions(right_best_hits, final_junctions);
933 update_insertions_and_deletions(right_best_hits, final_insertions, final_deletions);
934
935 print_sam_for_single(rt,
936 right_best_hits,
937 grade,
938 FRAG_RIGHT,
939 r_read,
940 bam_writer,
941 right_um_out);
942 }
943 }
944 else if (curr_right_hit_group.hits.empty())
945 {
946 HitsForRead left_best_hits;
947 left_best_hits.insert_id = curr_left_obs_order;
948 //only left read mapped
949 bool got_read=get_read_from_stream(curr_left_obs_order,
950 left_reads,reads_format, false, l_read, left_um_out);
951 assert(got_read);
952 fprintf(left_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
953 l_read.seq.c_str(), l_read.qual.c_str());
954 FragmentAlignmentGrade grade;
955 // Process hits for left singleton, select best alignments
956 read_best_alignments(curr_left_hit_group, grade, left_best_hits, gtf_junctions);
957
958 if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits)
959 {
960 update_junctions(left_best_hits, final_junctions);
961 update_insertions_and_deletions(left_best_hits, final_insertions, final_deletions);
962
963 print_sam_for_single(rt,
964 left_best_hits,
965 grade,
966 FRAG_LEFT,
967 l_read,
968 bam_writer,
969 left_um_out);
970 }
971 }
972 else
973 { //hits for both left and right reads
974 HitsForRead left_best_hits;
975 HitsForRead right_best_hits;
976 left_best_hits.insert_id = curr_left_obs_order;
977 right_best_hits.insert_id = curr_right_obs_order;
978
979 InsertAlignmentGrade grade;
980 pair_best_alignments(curr_left_hit_group,
981 curr_right_hit_group,
982 grade,
983 left_best_hits,
984 right_best_hits);
985
986 if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits &&
987 right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
988 {
989 update_junctions(left_best_hits, final_junctions);
990 update_junctions(right_best_hits, final_junctions);
991 update_insertions_and_deletions(left_best_hits, final_insertions, final_deletions);
992 update_insertions_and_deletions(right_best_hits, final_insertions, final_deletions);
993
994 print_sam_for_pair(rt,
995 left_best_hits,
996 right_best_hits,
997 grade,
998 left_reads,
999 right_reads,
1000 bam_writer,
1001 left_um_out,
1002 right_um_out);
1003 }
1004 }
1005
1006 left_hs.next_read_hits(curr_left_hit_group);
1007 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1008
1009 right_hs.next_read_hits(curr_right_hit_group);
1010 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1011 }
1012
1013 } //while we still have unreported hits..
1014 //print the remaining unmapped reads at the end of each reads' stream
1015 get_read_from_stream(VMAXINT32,
1016 left_reads,
1017 reads_format,
1018 false,
1019 l_read,
1020 left_um_out);
1021 if (right_reads)
1022 get_read_from_stream(VMAXINT32,
1023 right_reads,
1024 reads_format,
1025 false,
1026 r_read,
1027 right_um_out);
1028 fprintf (stderr, "done.\n");
1029
1030 //small_overhangs = 0;
1031 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1032 {
1033 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
1034 {
1035 final_junctions.erase(i++);
1036 }
1037 else
1038 {
1039 ++i;
1040 }
1041 }
1042
1043 // if (small_overhangs > 0)
1044 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1045
1046 fprintf (stderr, "Printing junction BED track...");
1047 print_junctions(junctions_out, final_junctions, rt);
1048 fprintf (stderr, "done\n");
1049
1050 fprintf (stderr, "Printing insertions...");
1051 print_insertions(insertions_out, final_insertions,rt);
1052 fclose(insertions_out);
1053 fprintf (stderr, "done\n");
1054
1055 fprintf (stderr, "Printing deletions...");
1056 print_deletions(deletions_out, final_deletions, rt);
1057 fclose(deletions_out);
1058 fprintf (stderr, "done\n");
1059
1060 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1061 }
1062
1063 void print_usage()
1064 {
1065 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1066
1067 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1068 }
1069
1070 int main(int argc, char** argv)
1071 {
1072 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1073 fprintf(stderr, "---------------------------------------\n");
1074
1075 reads_format = FASTQ;
1076
1077 int parse_ret = parse_options(argc, argv, print_usage);
1078 if (parse_ret)
1079 return parse_ret;
1080
1081 if(optind >= argc)
1082 {
1083 print_usage();
1084 return 1;
1085 }
1086
1087 string junctions_file_name = argv[optind++];
1088
1089 if(optind >= argc)
1090 {
1091 print_usage();
1092 return 1;
1093 }
1094
1095 string insertions_file_name = argv[optind++];
1096 if(optind >= argc)
1097 {
1098 print_usage();
1099 return 1;
1100 }
1101
1102 string deletions_file_name = argv[optind++];
1103 if(optind >= argc)
1104 {
1105 print_usage();
1106 return 1;
1107 }
1108
1109 string accepted_hits_file_name = argv[optind++];
1110
1111 if(optind >= argc)
1112 {
1113 print_usage();
1114 return 1;
1115 }
1116
1117 string left_map_filename = argv[optind++];
1118
1119 if(optind >= argc)
1120 {
1121 print_usage();
1122 return 1;
1123 }
1124 //FZPipe left_map_file;
1125 //string unbamcmd=getBam2SamCmd(left_map_filename);
1126 //left_map_file.openRead(left_map_filename, unbamcmd);
1127 string left_reads_filename = argv[optind++];
1128 string unzcmd=getUnpackCmd(left_reads_filename, false);
1129
1130 string right_map_filename;
1131 string right_reads_filename;
1132 FZPipe right_reads_file;
1133 string right_um_filename;
1134 FZPipe right_um_file;
1135
1136 if (optind < argc)
1137 {
1138 right_map_filename = argv[optind++];
1139 if(optind >= argc) {
1140 print_usage();
1141 return 1;
1142 }
1143 //right_map_file.openRead(right_map_filename, unbamcmd);
1144 //if (optind<argc) {
1145 right_reads_filename=argv[optind++];
1146 right_reads_file.openRead(right_reads_filename,unzcmd);
1147 right_um_filename=output_dir+"/unmapped_right.fq";
1148 if (!zpacker.empty()) right_um_filename+=".z";
1149 if (right_um_file.openWrite(right_um_filename.c_str(), zpacker)==NULL)
1150 err_die("Error: cannot open file %s for writing!\n",right_um_filename.c_str());
1151
1152 // }
1153 }
1154 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1155 if (junctions_file == NULL)
1156 {
1157 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1158 junctions_file_name.c_str());
1159 exit(1);
1160 }
1161
1162 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1163 if (insertions_file == NULL)
1164 {
1165 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1166 insertions_file_name.c_str());
1167 exit(1);
1168 }
1169
1170
1171 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1172 if (deletions_file == NULL)
1173 {
1174 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1175 deletions_file_name.c_str());
1176 exit(1);
1177 }
1178 bool uncompressed_bam=(accepted_hits_file_name=="-");
1179 GBamWriter bam_writer(accepted_hits_file_name.c_str(), sam_header.c_str(), uncompressed_bam);
1180
1181 FZPipe left_reads_file(left_reads_filename, unzcmd);
1182 if (left_reads_file.file==NULL)
1183 {
1184 fprintf(stderr, "Error: cannot open reads file %s for reading\n",
1185 left_reads_filename.c_str());
1186 exit(1);
1187 }
1188 string left_um_filename(output_dir+"/unmapped_left.fq");
1189 if (!zpacker.empty()) left_um_filename+=".z";
1190 FZPipe left_um_file;
1191 if (left_um_file.openWrite(left_um_filename.c_str(), zpacker)==NULL)
1192 err_die("Error: cannot open file %s for writing!\n",left_um_filename.c_str());
1193
1194 driver(bam_writer, left_map_filename,
1195 left_reads_file.file,
1196 right_map_filename,
1197 right_reads_file.file,
1198 junctions_file,
1199 insertions_file,
1200 deletions_file,
1201 left_um_file.file,
1202 right_um_file.file);
1203 left_um_file.close();
1204 right_um_file.close();
1205 return 0;
1206 }