ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (13 years, 1 month ago) by gpertea
File size: 39886 byte(s)
Log Message:
adding tophat source work

Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include "common.h"
34 #include "bwt_map.h"
35 #include "junctions.h"
36 #include "insertions.h"
37 #include "deletions.h"
38 #include "align_status.h"
39 #include "fragments.h"
40 #include "wiggles.h"
41 #include "tokenize.h"
42 #include "reads.h"
43
44 #include "inserts.h"
45
46 using namespace std;
47 using namespace seqan;
48 using std::set;
49
50 void fragment_best_alignments(const HitsForRead& hits_for_read,
51 FragmentAlignmentGrade& best_grade,
52 HitsForRead& best_hits,
53 const JunctionSet& gtf_junctions)
54 {
55 const vector<BowtieHit>& hits = hits_for_read.hits;
56 for (size_t i = 0; i < hits.size(); ++i)
57 {
58 FragmentAlignmentGrade g(hits[i], gtf_junctions);
59 // Is the new status better than the current best one?
60 if (best_grade < g)
61 {
62 best_hits.hits.clear();
63 best_grade = g;
64 best_hits.hits.push_back(hits[i]);
65 }
66 else if (!(g < best_grade)) // is it just as good?
67 {
68 best_grade.num_alignments++;
69 best_hits.hits.push_back(hits[i]);
70 }
71 }
72 }
73
74 void insert_best_alignments(const HitsForRead& left_hits,
75 const HitsForRead& right_hits,
76 InsertAlignmentGrade& best_grade,
77 HitsForRead& left_best_hits,
78 HitsForRead& right_best_hits)
79 {
80 // max mate inner distance (genomic)
81 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
82 if (max_mate_inner_dist == -1)
83 {
84 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
85 }
86
87 const vector<BowtieHit>& left = left_hits.hits;
88 const vector<BowtieHit>& right = right_hits.hits;
89
90 for (size_t i = 0; i < left.size(); ++i)
91 {
92 const BowtieHit& lh = left[i];
93 for (size_t j = 0; j < right.size(); ++j)
94 {
95 const BowtieHit& rh = right[j];
96
97 if (lh.ref_id() != rh.ref_id())
98 continue;
99
100 InsertAlignmentGrade g(lh, rh, min_mate_inner_dist, max_mate_inner_dist);
101
102 // Is the new status better than the current best one?
103 if (best_grade < g)
104 {
105 left_best_hits.hits.clear();
106 right_best_hits.hits.clear();
107
108 best_grade = g;
109 left_best_hits.hits.push_back(lh);
110 right_best_hits.hits.push_back(rh);
111 }
112 else if (!(g < best_grade))
113 {
114 left_best_hits.hits.push_back(lh);
115 right_best_hits.hits.push_back(rh);
116 best_grade.num_alignments++;
117 }
118 }
119 }
120 }
121
122 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
123
124 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
125 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
126 int num_hits, const BowtieHit* next_hit, int hitIndex) {
127
128 if (sam_toks.size()>11) {
129 for (size_t i=11;i<sam_toks.size();++i) {
130 if (sam_toks[i].find("NH:i:")==string::npos)
131 auxdata.push_back(sam_toks[i]);
132 }
133 }
134 string aux("NH:i:");
135 str_appendInt(aux, num_hits);
136 auxdata.push_back(aux);
137 if (next_hit) {
138 const char* nh_ref_name = "=";
139 nh_ref_name = rt.get_name(next_hit->ref_id());
140 assert (nh_ref_name != NULL);
141 bool same_contig=(next_hit->ref_id()==bh.ref_id());
142 aux="CC:Z:";
143 aux+= (same_contig ? "=" : nh_ref_name);
144 auxdata.push_back(aux);
145 aux="CP:i:";
146 int nh_gpos=next_hit->left() + 1;
147 str_appendInt(aux, nh_gpos);
148 auxdata.push_back(aux);
149 } //has next_hit
150 // FIXME: this code is still a bit brittle, because it contains no
151 // consistency check that the mates are on opposite strands (a current protocol
152 // requirement, and that the strand indicated by the alignment is consistent
153 // with the orientation of the splices (though that should be handled upstream).
154 const string xs_f("XS:A:+");
155 const string xs_r("XS:A:-");
156 if (bh.contiguous()) {
157 if (library_type == FR_FIRSTSTRAND) {
158 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
159 if (bh.antisense_align())
160 auxdata.push_back(xs_f);
161 else
162 auxdata.push_back(xs_r);
163 }
164 else {
165 if (bh.antisense_align())
166 auxdata.push_back(xs_r);
167 else
168 auxdata.push_back(xs_f);
169 }
170 }
171 else if (library_type == FR_SECONDSTRAND) {
172 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
173 if (bh.antisense_align())
174 auxdata.push_back(xs_r);
175 else
176 auxdata.push_back(xs_f);
177 }
178 else
179 {
180 if (bh.antisense_align())
181 auxdata.push_back(xs_f);
182 else
183 auxdata.push_back(xs_r);
184 }
185 }
186 } //bh.contiguous()
187 if (hitIndex >= 0)
188 {
189 string aux("HI:i:");
190 str_appendInt(aux, hitIndex);
191 auxdata.push_back(aux);
192 }
193 }
194
195 bool rewrite_sam_record(GBamWriter& bam_writer, const RefSequenceTable& rt,
196 const BowtieHit& bh,
197 const char* bwt_buf,
198 //char* rebuf,
199 char* read_alt_name,
200 const FragmentAlignmentGrade& grade,
201 FragmentType insert_side,
202 int num_hits,
203 const BowtieHit* next_hit,
204 bool primary,
205 int hitIndex)
206 {
207 // Rewrite this hit, filling in the alt name, mate mapping
208 // and setting the pair flag
209 vector<string> sam_toks;
210 tokenize(bwt_buf, "\t", sam_toks);
211 char* slash_pos = strrchr(read_alt_name,'/');
212
213 if (slash_pos) *slash_pos = 0;
214 //*rebuf = 0;
215 //read_alt_name as QNAME
216 int flag=atoi(sam_toks[1].c_str()); //FLAG
217 if (insert_side != FRAG_UNPAIRED) {
218 //flag = atoi(sam_toks[1].c_str());
219 // mark this as a singleton mate
220 flag |= 0x0001;
221 if (insert_side == FRAG_LEFT)
222 flag |= 0x0040;
223 else if (insert_side == FRAG_RIGHT)
224 flag |= 0x0080;
225 flag |= 0x0008;
226 //char flag_buf[64];
227 //sprintf(flag_buf, "%d", flag);
228 //sam_toks[t] = flag_buf;
229 }
230 if (!primary)
231 flag |= 0x100;
232
233 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
234 int mapQ=255;
235 if (grade.num_alignments > 1) {
236 double err_prob = 1 - (1.0 / grade.num_alignments);
237 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
238 }
239 vector<string> auxdata;
240 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
241 int tlen =atoi(sam_toks[8].c_str()); //TLEN
242 int mate_pos=atoi(sam_toks[7].c_str());
243 GBamRecord* bamrec=bam_writer.new_record(read_alt_name, flag, sam_toks[2].c_str(), gpos, mapQ,
244 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
245 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
246 bam_writer.write(bamrec);
247 delete bamrec;
248 return true;
249 }
250
251 bool rewrite_sam_record(GBamWriter& bam_writer, const RefSequenceTable& rt,
252 const BowtieHit& bh,
253 const char* bwt_buf,
254 //char* rebuf,
255 char* read_alt_name,
256 const InsertAlignmentGrade& grade,
257 FragmentType insert_side,
258 const BowtieHit* partner,
259 int num_hits,
260 const BowtieHit* next_hit,
261 bool primary,
262 int hitIndex)
263 {
264 // Rewrite this hit, filling in the alt name, mate mapping
265 // and setting the pair flag
266 vector<string> sam_toks;
267 tokenize(bwt_buf, "\t", sam_toks);
268 char* slash_pos = strrchr(read_alt_name,'/');
269 if (slash_pos) *slash_pos = 0;
270 //read_alt_name as QNAME
271 int flag = atoi(sam_toks[1].c_str());
272 // 0x0010 (strand of query) is assumed to be set correctly
273 // to begin with
274 flag |= 0x0001; //it must be paired
275 if (insert_side == FRAG_LEFT)
276 flag |= 0x0040;
277 else if (insert_side == FRAG_RIGHT)
278 flag |= 0x0080;
279 if (!primary)
280 flag |= 0x100;
281
282 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
283 int mapQ=255;
284 if (grade.num_alignments > 1) {
285 double err_prob = 1 - (1.0 / grade.num_alignments);
286 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
287 }
288 int tlen=0; //TLEN
289 int mate_pos=atoi(sam_toks[7].c_str());
290 if (partner) {
291 if (partner->ref_id()==bh.ref_id()) {
292 sam_toks[6] = "="; //same chromosome
293 //TLEN:
294 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
295 partner->left() - bh.right();
296 }
297
298 else { //partner on different chromosome/contig
299 sam_toks[6] = rt.get_name(partner->ref_id());
300 if (sam_toks[6].empty()) {
301 //FIXME -- this should never happen
302 sam_toks[6] = "=";
303 fprintf(stderr, "Warning: partner ref_id %d has no entry in ref table?\n", partner->ref_id());
304 }
305 }
306 mate_pos = partner->left() + 1;
307 if (grade.happy())
308 flag |= 0x0002;
309 if (partner->antisense_align())
310 flag |= 0x0020;
311 }
312 else {
313 sam_toks[6] = "*";
314 mate_pos = 0;
315 flag |= 0x0008;
316 }
317 vector<string> auxdata;
318 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
319 GBamRecord* bamrec=bam_writer.new_record(read_alt_name, flag, sam_toks[2].c_str(), gpos, mapQ,
320 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
321 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
322 bam_writer.write(bamrec);
323 delete bamrec;
324 return true;
325 }
326
327 struct lex_hit_sort
328 {
329 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
330 : _rt(rt), _hits(hits)
331 {}
332
333 bool operator()(const uint32_t& l, const uint32_t& r) const
334 {
335 const BowtieHit& lhs = _hits.hits[l];
336 const BowtieHit& rhs = _hits.hits[r];
337
338 uint32_t l_id = lhs.ref_id();
339 uint32_t r_id = rhs.ref_id();
340 if (l_id != r_id)
341 {
342 return (strcmp(_rt.get_name(lhs.ref_id()), _rt.get_name(rhs.ref_id())) < 0);
343 }
344 return lhs.left() < rhs.left();
345 }
346
347 const RefSequenceTable& _rt;
348 const HitsForRead& _hits;
349 };
350
351 void print_sam_for_hits(const RefSequenceTable& rt,
352 const HitsForRead& hits,
353 const FragmentAlignmentGrade& grade,
354 FragmentType frag_type,
355 FILE* reads_file,
356 GBamWriter& bam_writer)
357 {
358 static const int buf_size = 2048;
359 char read_name[buf_size];
360 char read_seq[buf_size];
361 char read_alt_name[buf_size];
362 char read_quals[buf_size];
363
364 //char rebuf[buf_size];
365
366 lex_hit_sort s(rt, hits);
367 vector<uint32_t> index_vector;
368 for (size_t i = 0; i < hits.hits.size(); ++i)
369 index_vector.push_back(i);
370
371 sort(index_vector.begin(), index_vector.end(), s);
372
373 bool got_read = get_read_from_stream(hits.insert_id,
374 reads_file,
375 reads_format,
376 false,
377 read_name,
378 read_seq,
379 read_alt_name,
380 read_quals);
381
382 assert (got_read);
383
384 size_t primaryHit = (hits.hits.empty()? 0: random() % hits.hits.size());
385 bool multipleHits = (hits.hits.size() > 1);
386 for (size_t i = 0; i < hits.hits.size(); ++i)
387 {
388 bool primary = (i == primaryHit);
389 size_t index = index_vector[i];
390 const BowtieHit& bh = hits.hits[index];
391 rewrite_sam_record(bam_writer, rt,
392 bh,
393 bh.hitfile_rec().c_str(),
394 read_alt_name,
395 grade,
396 frag_type,
397 hits.hits.size(),
398 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
399 primary,
400 (multipleHits? i: -1));
401 }
402 }
403
404 void print_sam_for_hits(const RefSequenceTable& rt,
405 const HitsForRead& left_hits,
406 const HitsForRead& right_hits,
407 const InsertAlignmentGrade& grade,
408 FILE* left_reads_file,
409 FILE* right_reads_file,
410 GBamWriter& bam_writer)
411 {
412 assert (left_hits.insert_id == right_hits.insert_id);
413
414 static const int buf_size = 2048;
415 char left_read_name[buf_size];
416 char left_read_seq[buf_size];
417 char left_read_alt_name[buf_size];
418 char left_read_quals[buf_size];
419 //char left_rebuf[buf_size];
420
421 char right_read_name[buf_size];
422 char right_read_seq[buf_size];
423 char right_read_alt_name[buf_size];
424 char right_read_quals[buf_size];
425 //char right_rebuf[buf_size];
426
427 assert (left_hits.hits.size() == right_hits.hits.size() ||
428 (left_hits.hits.empty() || right_hits.hits.empty()));
429
430 size_t primaryHit = 0;
431 vector<uint32_t> index_vector;
432 if(right_hits.hits.size() > 0)
433 {
434 lex_hit_sort s(rt, right_hits);
435 for (size_t i = 0; i < right_hits.hits.size(); ++i)
436 index_vector.push_back(i);
437
438 sort(index_vector.begin(), index_vector.end(), s);
439 primaryHit = random() % right_hits.hits.size();
440 }
441 else if (left_hits.hits.size() > 0)
442 {
443 lex_hit_sort s(rt, left_hits);
444 for (size_t i = 0; i < left_hits.hits.size(); ++i)
445 index_vector.push_back(i);
446
447 sort(index_vector.begin(), index_vector.end(), s);
448 primaryHit = random() % left_hits.hits.size();
449 }
450
451 bool got_left_read = get_read_from_stream(left_hits.insert_id,
452 left_reads_file,
453 reads_format,
454 false,
455 left_read_name,
456 left_read_seq,
457 left_read_alt_name,
458 left_read_quals);
459
460 bool got_right_read = get_read_from_stream(right_hits.insert_id,
461 right_reads_file,
462 reads_format,
463 false,
464 right_read_name,
465 right_read_seq,
466 right_read_alt_name,
467 right_read_quals);
468
469
470
471 if (left_hits.hits.size() == right_hits.hits.size())
472 {
473 assert (got_left_read && got_right_read);
474 bool multipleHits = (left_hits.hits.size() > 1);
475 for (size_t i = 0; i < right_hits.hits.size(); ++i)
476 {
477 bool primary = (i == primaryHit);
478 size_t index = index_vector[i];
479 const BowtieHit& right_bh = right_hits.hits[index];
480 const BowtieHit& left_bh = left_hits.hits[index];
481
482 rewrite_sam_record(bam_writer, rt,
483 right_bh,
484 right_bh.hitfile_rec().c_str(),
485 right_read_alt_name,
486 grade,
487 FRAG_RIGHT,
488 &left_bh,
489 right_hits.hits.size(),
490 (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
491 primary,
492 (multipleHits? i: -1));
493 rewrite_sam_record(bam_writer, rt,
494 left_bh,
495 left_bh.hitfile_rec().c_str(),
496 left_read_alt_name,
497 grade,
498 FRAG_LEFT,
499 &right_bh,
500 left_hits.hits.size(),
501 (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
502 primary,
503 (multipleHits? i: -1));
504 }
505 }
506 else if (left_hits.hits.empty())
507 {
508 for (size_t i = 0; i < right_hits.hits.size(); ++i)
509 {
510 bool primary = (i == primaryHit);
511 size_t index = index_vector[i];
512 const BowtieHit& bh = right_hits.hits[index];
513
514 rewrite_sam_record(bam_writer, rt,
515 bh,
516 bh.hitfile_rec().c_str(),
517 right_read_alt_name,
518 grade,
519 FRAG_RIGHT,
520 NULL,
521 right_hits.hits.size(),
522 (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
523 primary,
524 -1);
525
526 }
527 }
528 else if (right_hits.hits.empty())
529 {
530 for (size_t i = 0; i < left_hits.hits.size(); ++i)
531 {
532 bool primary = (i == primaryHit);
533 size_t index = index_vector[i];
534 const BowtieHit& bh = left_hits.hits[index];
535 rewrite_sam_record(bam_writer, rt,
536 bh,
537 bh.hitfile_rec().c_str(),
538 left_read_alt_name,
539 grade,
540 FRAG_LEFT,
541 NULL,
542 left_hits.hits.size(),
543 (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
544 primary,
545 -1);
546
547 }
548 }
549 else
550 {
551 assert (false);
552 }
553 }
554
555 /**
556 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
557 * @param hits hits The alignments for a particular read
558 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
559 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
560 */
561 void update_insertions_and_deletions(const HitsForRead& hits,
562 InsertionSet& insertions,
563 DeletionSet& deletions)
564 {
565 for (size_t i = 0; i < hits.hits.size(); ++i)
566 {
567 const BowtieHit& bh = hits.hits[i];
568 insertions_from_alignment(bh, insertions);
569 deletions_from_alignment(bh, deletions);
570 }
571 }
572
573
574
575 void update_junctions(const HitsForRead& hits,
576 JunctionSet& junctions)
577 {
578 for (size_t i = 0; i < hits.hits.size(); ++i)
579 {
580 const BowtieHit& bh = hits.hits[i];
581 junctions_from_alignment(bh, junctions);
582 }
583 }
584
585 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
586 // resets the stream when finished.
587 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions,
588 HitsForRead& hits)
589 {
590 HitsForRead remaining;
591 remaining.insert_id = hits.insert_id;
592
593 for (size_t i = 0; i < hits.hits.size(); ++i)
594 {
595 BowtieHit& bh = hits.hits[i];
596 bool filter_hit = false;
597 if (!bh.contiguous())
598 {
599 JunctionSet bh_juncs;
600 junctions_from_alignment(bh, bh_juncs);
601 for (JunctionSet::iterator itr = bh_juncs.begin();
602 itr != bh_juncs.end();
603 itr++)
604 {
605 const Junction& j = itr->first;
606 JunctionSet::const_iterator target = junctions.find(j);
607 if (target == junctions.end() || !target->second.accepted)
608 {
609 filter_hit = true;
610 break;
611 }
612 }
613 }
614 if (!filter_hit)
615 {
616 remaining.hits.push_back(bh);
617 }
618 else
619 {
620 //fprintf(stderr, "Filtering hit\n");
621 }
622 }
623 hits = remaining;
624 }
625
626 void get_junctions_from_best_hits(HitStream& left_hs,
627 HitStream& right_hs,
628 ReadTable& it,
629 JunctionSet& junctions,
630 const JunctionSet& gtf_junctions)
631 {
632 HitsForRead curr_left_hit_group;
633 HitsForRead curr_right_hit_group;
634
635 left_hs.next_read_hits(curr_left_hit_group);
636 right_hs.next_read_hits(curr_right_hit_group);
637
638 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
639 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
640
641 // While we still have unreported hits...
642 while(curr_left_obs_order != 0xFFFFFFFF ||
643 curr_right_obs_order != 0xFFFFFFFF)
644 {
645 // Chew up left singletons
646 while (curr_left_obs_order < curr_right_obs_order &&
647 curr_left_obs_order != 0xFFFFFFFF)
648 {
649 HitsForRead best_hits;
650 best_hits.insert_id = curr_left_obs_order;
651 FragmentAlignmentGrade grade;
652
653 // Process hits for left singleton, select best alignments
654 fragment_best_alignments(curr_left_hit_group, grade, best_hits, gtf_junctions);
655 update_junctions(best_hits, junctions);
656
657 // Get next hit group
658 left_hs.next_read_hits(curr_left_hit_group);
659 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
660 }
661
662 // Chew up right singletons
663 while (curr_left_obs_order > curr_right_obs_order &&
664 curr_right_obs_order != 0xFFFFFFFF)
665 {
666 HitsForRead best_hits;
667 best_hits.insert_id = curr_right_obs_order;
668 FragmentAlignmentGrade grade;
669
670 // Process hit for right singleton, select best alignments
671 fragment_best_alignments(curr_right_hit_group,grade, best_hits, gtf_junctions);
672 update_junctions(best_hits, junctions);
673
674 // Get next hit group
675 right_hs.next_read_hits(curr_right_hit_group);
676 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
677 }
678
679 // Since we have both left hits and right hits for this insert,
680 // Find the best pairing and print both
681 while (curr_left_obs_order == curr_right_obs_order &&
682 curr_left_obs_order != 0xFFFFFFFF && curr_right_obs_order != 0xFFFFFFFF)
683 {
684 if (curr_left_hit_group.hits.empty())
685 {
686 HitsForRead right_best_hits;
687 right_best_hits.insert_id = curr_right_obs_order;
688
689 FragmentAlignmentGrade grade;
690 fragment_best_alignments(curr_right_hit_group, grade, right_best_hits, gtf_junctions);
691 update_junctions(right_best_hits, junctions);
692 }
693 else if (curr_right_hit_group.hits.empty())
694 {
695 HitsForRead left_best_hits;
696 left_best_hits.insert_id = curr_left_obs_order;
697
698 FragmentAlignmentGrade grade;
699 // Process hits for left singleton, select best alignments
700 fragment_best_alignments(curr_left_hit_group, grade, left_best_hits, gtf_junctions);
701 update_junctions(left_best_hits, junctions);
702 }
703 else
704 {
705 HitsForRead left_best_hits;
706 HitsForRead right_best_hits;
707 left_best_hits.insert_id = curr_left_obs_order;
708 right_best_hits.insert_id = curr_right_obs_order;
709
710 // daehwan - apply gtf_junctions here, too!
711
712 InsertAlignmentGrade grade;
713 insert_best_alignments(curr_left_hit_group,
714 curr_right_hit_group,
715 grade,
716 left_best_hits,
717 right_best_hits);
718
719 update_junctions(left_best_hits, junctions);
720 update_junctions(right_best_hits, junctions);
721 }
722
723 left_hs.next_read_hits(curr_left_hit_group);
724 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
725
726 right_hs.next_read_hits(curr_right_hit_group);
727 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
728 }
729 }
730 left_hs.reset();
731 right_hs.reset();
732 }
733
734
735 void driver(GBamWriter& bam_writer,
736 string& left_map_fname,
737 FILE* left_reads,
738 string& right_map_fname,
739 FILE* right_reads,
740 FILE* junctions_out,
741 FILE* insertions_out,
742 FILE* deletions_out)
743 {
744 ReadTable it;
745 RefSequenceTable rt(sam_header, true);
746 srandom(1);
747 JunctionSet gtf_junctions;
748 if (gtf_juncs != "")
749 {
750 char splice_buf[2048];
751 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
752 if (splice_coords)
753 {
754 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
755 {
756 char* nl = strrchr(splice_buf, '\n');
757 char* buf = splice_buf;
758 if (nl) *nl = 0;
759
760 char* ref_name = get_token((char**)&buf, "\t");
761 char* scan_left_coord = get_token((char**)&buf, "\t");
762 char* scan_right_coord = get_token((char**)&buf, "\t");
763 char* orientation = get_token((char**)&buf, "\t");
764
765 if (!scan_left_coord || !scan_right_coord || !orientation)
766 {
767 fprintf(stderr,"Error: malformed splice coordinate record\n");
768 exit(1);
769 }
770
771 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
772 uint32_t left_coord = atoi(scan_left_coord);
773 uint32_t right_coord = atoi(scan_right_coord);
774 bool antisense = *orientation == '-';
775
776 // add 1 to left_coord to meet BED format
777 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats()));
778 }
779 }
780 }
781
782 BAMHitFactory hit_factory(it,rt);
783 JunctionSet junctions;
784 {
785 HitStream l_hs(left_map_fname, &hit_factory, false, true, true, true);
786 HitStream r_hs(right_map_fname, &hit_factory, false, true, true, true);
787 get_junctions_from_best_hits(l_hs, r_hs, it, junctions, gtf_junctions);
788 //this resets the streams
789 }
790
791 HitStream left_hs(left_map_fname, &hit_factory, false, true, true, true);
792 HitStream right_hs(right_map_fname, &hit_factory, false, true, true, true);
793
794 size_t num_unfiltered_juncs = junctions.size();
795 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
796
797 HitsForRead curr_left_hit_group;
798 HitsForRead curr_right_hit_group;
799
800 left_hs.next_read_hits(curr_left_hit_group);
801 right_hs.next_read_hits(curr_right_hit_group);
802
803 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
804 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
805
806 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
807 filter_junctions(junctions, gtf_junctions);
808 //size_t num_juncs_after_filter = junctions.size();
809 //fprintf(stderr, "Filtered %lu junctions\n", num_unfiltered_juncs - num_juncs_after_filter);
810
811 size_t small_overhangs = 0;
812 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
813 {
814 if (i->second.accepted &&
815 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
816 {
817 small_overhangs++;
818 }
819 }
820
821 if (small_overhangs >0)
822 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
823
824 JunctionSet final_junctions; // the junctions formed from best hits
825 InsertionSet final_insertions;
826 DeletionSet final_deletions;
827
828 fprintf (stderr, "Reporting final accepted alignments...");
829 // While we still have unreported hits...
830 while(curr_left_obs_order != 0xFFFFFFFF ||
831 curr_right_obs_order != 0xFFFFFFFF)
832 {
833 // Chew up left singletons
834 while (curr_left_obs_order < curr_right_obs_order &&
835 curr_left_obs_order != 0xFFFFFFFF)
836 {
837 HitsForRead best_hits;
838 best_hits.insert_id = curr_left_obs_order;
839 FragmentAlignmentGrade grade;
840
841 exclude_hits_on_filtered_junctions(junctions, curr_left_hit_group);
842
843 // Process hits for left singleton, select best alignments
844 fragment_best_alignments(curr_left_hit_group, grade, best_hits, gtf_junctions);
845
846 if (best_hits.hits.size() <= max_multihits)
847 {
848 update_junctions(best_hits, final_junctions);
849 update_insertions_and_deletions(best_hits, final_insertions, final_deletions);
850
851 print_sam_for_hits(rt,
852 best_hits,
853 grade,
854 (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
855 left_reads,
856 bam_writer);
857 }
858
859 // Get next hit group
860 left_hs.next_read_hits(curr_left_hit_group);
861 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
862 } //left singletons
863
864 // Chew up right singletons
865 while (curr_left_obs_order > curr_right_obs_order &&
866 curr_right_obs_order != 0xFFFFFFFF)
867 {
868 HitsForRead best_hits;
869 best_hits.insert_id = curr_right_obs_order;
870 FragmentAlignmentGrade grade;
871
872 exclude_hits_on_filtered_junctions(junctions, curr_right_hit_group);
873
874 // Process hit for right singleton, select best alignments
875 fragment_best_alignments(curr_right_hit_group, grade, best_hits, gtf_junctions);
876
877 if (best_hits.hits.size() <= max_multihits)
878 {
879 update_junctions(best_hits, final_junctions);
880 update_insertions_and_deletions(best_hits, final_insertions, final_deletions);
881
882 print_sam_for_hits(rt,
883 best_hits,
884 grade,
885 FRAG_RIGHT,
886 right_reads,
887 bam_writer);
888 }
889
890 // Get next hit group
891 right_hs.next_read_hits(curr_right_hit_group);
892 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
893 }
894
895 // Since we have both left hits and right hits for this insert,
896 // Find the best pairing and print both
897 while (curr_left_obs_order == curr_right_obs_order &&
898 curr_left_obs_order != 0xFFFFFFFF && curr_right_obs_order != 0xFFFFFFFF)
899 {
900 exclude_hits_on_filtered_junctions(junctions, curr_left_hit_group);
901 exclude_hits_on_filtered_junctions(junctions, curr_right_hit_group);
902
903 if (curr_left_hit_group.hits.empty())
904 {
905 HitsForRead right_best_hits;
906 right_best_hits.insert_id = curr_right_obs_order;
907
908 FragmentAlignmentGrade grade;
909 fragment_best_alignments(curr_right_hit_group, grade, right_best_hits, gtf_junctions);
910
911 if (right_best_hits.hits.size() <= max_multihits)
912 {
913 update_junctions(right_best_hits, final_junctions);
914 update_insertions_and_deletions(right_best_hits, final_insertions, final_deletions);
915
916 print_sam_for_hits(rt,
917 right_best_hits,
918 grade,
919 FRAG_RIGHT,
920 right_reads,
921 bam_writer);
922 }
923 }
924 else if (curr_right_hit_group.hits.empty())
925 {
926 HitsForRead left_best_hits;
927 left_best_hits.insert_id = curr_left_obs_order;
928
929 FragmentAlignmentGrade grade;
930 // Process hits for left singleton, select best alignments
931 fragment_best_alignments(curr_left_hit_group, grade, left_best_hits, gtf_junctions);
932
933 if (left_best_hits.hits.size() <= max_multihits)
934 {
935 update_junctions(left_best_hits, final_junctions);
936 update_insertions_and_deletions(left_best_hits, final_insertions, final_deletions);
937
938 print_sam_for_hits(rt,
939 left_best_hits,
940 grade,
941 FRAG_LEFT,
942 left_reads,
943 bam_writer);
944 }
945 }
946 else
947 {
948 HitsForRead left_best_hits;
949 HitsForRead right_best_hits;
950 left_best_hits.insert_id = curr_left_obs_order;
951 right_best_hits.insert_id = curr_right_obs_order;
952
953 InsertAlignmentGrade grade;
954 insert_best_alignments(curr_left_hit_group,
955 curr_right_hit_group,
956 grade,
957 left_best_hits,
958 right_best_hits);
959
960 if (left_best_hits.hits.size() <= max_multihits && right_best_hits.hits.size() <= max_multihits)
961 {
962 update_junctions(left_best_hits, final_junctions);
963 update_junctions(right_best_hits, final_junctions);
964 update_insertions_and_deletions(left_best_hits, final_insertions, final_deletions);
965 update_insertions_and_deletions(right_best_hits, final_insertions, final_deletions);
966
967 print_sam_for_hits(rt,
968 left_best_hits,
969 right_best_hits,
970 grade,
971 left_reads,
972 right_reads,
973 bam_writer);
974 }
975 }
976
977 left_hs.next_read_hits(curr_left_hit_group);
978 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
979
980 right_hs.next_read_hits(curr_right_hit_group);
981 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
982 }
983
984 } //while we still have unreported hits..
985
986 fprintf (stderr, "done.\n");
987
988 small_overhangs = 0;
989 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
990 {
991 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
992 {
993 final_junctions.erase(i++);
994 }
995 else
996 {
997 ++i;
998 }
999 }
1000
1001 // if (small_overhangs > 0)
1002 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1003
1004 fprintf (stderr, "Printing junction BED track...");
1005 print_junctions(junctions_out, final_junctions, rt);
1006 fprintf (stderr, "done\n");
1007
1008 fprintf (stderr, "Printing insertions...");
1009 print_insertions(insertions_out, final_insertions,rt);
1010 fclose(insertions_out);
1011 fprintf (stderr, "done\n");
1012
1013 fprintf (stderr, "Printing deletions...");
1014 print_deletions(deletions_out, final_deletions, rt);
1015 fclose(deletions_out);
1016 fprintf (stderr, "done\n");
1017
1018 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1019 }
1020
1021 void print_usage()
1022 {
1023 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1024
1025 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1026 }
1027
1028 int main(int argc, char** argv)
1029 {
1030 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1031 fprintf(stderr, "---------------------------------------\n");
1032
1033 reads_format = FASTQ;
1034
1035 int parse_ret = parse_options(argc, argv, print_usage);
1036 if (parse_ret)
1037 return parse_ret;
1038
1039 if(optind >= argc)
1040 {
1041 print_usage();
1042 return 1;
1043 }
1044
1045 string junctions_file_name = argv[optind++];
1046
1047 if(optind >= argc)
1048 {
1049 print_usage();
1050 return 1;
1051 }
1052
1053 string insertions_file_name = argv[optind++];
1054 if(optind >= argc)
1055 {
1056 print_usage();
1057 return 1;
1058 }
1059
1060 string deletions_file_name = argv[optind++];
1061 if(optind >= argc)
1062 {
1063 print_usage();
1064 return 1;
1065 }
1066
1067 string accepted_hits_file_name = argv[optind++];
1068
1069 if(optind >= argc)
1070 {
1071 print_usage();
1072 return 1;
1073 }
1074
1075 string left_map_filename = argv[optind++];
1076
1077 if(optind >= argc)
1078 {
1079 print_usage();
1080 return 1;
1081 }
1082 //FZPipe left_map_file;
1083 //string unbamcmd=getBam2SamCmd(left_map_filename);
1084 //left_map_file.openRead(left_map_filename, unbamcmd);
1085 string left_reads_filename = argv[optind++];
1086 string unzcmd=getUnpackCmd(left_reads_filename, false);
1087
1088 string right_map_filename;
1089 string right_reads_filename;
1090 FZPipe right_reads_file;
1091
1092 if (optind < argc)
1093 {
1094 right_map_filename = argv[optind++];
1095 if(optind >= argc) {
1096 print_usage();
1097 return 1;
1098 }
1099 //right_map_file.openRead(right_map_filename, unbamcmd);
1100 //if (optind<argc) {
1101 right_reads_filename=argv[optind++];
1102 right_reads_file.openRead(right_reads_filename,unzcmd);
1103 // }
1104 }
1105 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1106 if (junctions_file == NULL)
1107 {
1108 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1109 junctions_file_name.c_str());
1110 exit(1);
1111 }
1112
1113 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1114 if (insertions_file == NULL)
1115 {
1116 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1117 insertions_file_name.c_str());
1118 exit(1);
1119 }
1120
1121
1122 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1123 if (deletions_file == NULL)
1124 {
1125 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1126 deletions_file_name.c_str());
1127 exit(1);
1128 }
1129 bool uncompressed_bam=(accepted_hits_file_name=="-");
1130 GBamWriter bam_writer(accepted_hits_file_name.c_str(), sam_header.c_str(), uncompressed_bam);
1131 FZPipe left_reads_file(left_reads_filename, unzcmd);
1132 if (left_reads_file.file==NULL)
1133 {
1134 fprintf(stderr, "Error: cannot open reads file %s for reading\n",
1135 left_reads_filename.c_str());
1136 exit(1);
1137 }
1138
1139 driver(bam_writer, left_map_filename,
1140 left_reads_file.file,
1141 right_map_filename,
1142 right_reads_file.file,
1143 junctions_file,
1144 insertions_file,
1145 deletions_file);
1146 return 0;
1147 }