44 |
|
#include "wiggles.h" |
45 |
|
#include "tokenize.h" |
46 |
|
#include "reads.h" |
47 |
+ |
#include "coverage.h" |
48 |
|
|
49 |
|
|
50 |
|
#include "inserts.h" |
53 |
|
using namespace seqan; |
54 |
|
using std::set; |
55 |
|
|
56 |
+ |
static const JunctionSet empty_junctions; |
57 |
+ |
static const InsertionSet empty_insertions; |
58 |
+ |
static const DeletionSet empty_deletions; |
59 |
+ |
static const FusionSet empty_fusions; |
60 |
+ |
static const Coverage empty_coverage; |
61 |
+ |
|
62 |
|
// daehwan - this is redundancy, which should be removed. |
63 |
|
void get_seqs(istream& ref_stream, |
64 |
|
RefSequenceTable& rt, |
82 |
|
|
83 |
|
struct cmp_read_alignment |
84 |
|
{ |
78 |
– |
cmp_read_alignment(const JunctionSet& gtf_juncs) : |
79 |
– |
_gtf_juncs(>f_juncs) {} |
80 |
– |
|
85 |
|
bool operator() (const BowtieHit& l, const BowtieHit& r) const |
86 |
|
{ |
87 |
< |
FragmentAlignmentGrade gl(l, *_gtf_juncs); |
84 |
< |
FragmentAlignmentGrade gr(r, *_gtf_juncs); |
85 |
< |
bool better = gr < gl; |
86 |
< |
bool worse = gl < gr; |
87 |
< |
|
88 |
< |
if (better && !worse) |
89 |
< |
return true; |
90 |
< |
else |
91 |
< |
return false; |
87 |
> |
return l.alignment_score() > r.alignment_score(); |
88 |
|
} |
93 |
– |
|
94 |
– |
const JunctionSet* _gtf_juncs; |
89 |
|
}; |
90 |
|
|
91 |
|
void read_best_alignments(const HitsForRead& hits_for_read, |
98 |
– |
FragmentAlignmentGrade& best_grade, |
92 |
|
HitsForRead& best_hits, |
93 |
< |
const JunctionSet& gtf_junctions) |
93 |
> |
const JunctionSet& gtf_junctions, |
94 |
> |
const JunctionSet& junctions = empty_junctions, |
95 |
> |
const InsertionSet& insertions = empty_insertions, |
96 |
> |
const DeletionSet& deletions = empty_deletions, |
97 |
> |
const FusionSet& fusions = empty_fusions, |
98 |
> |
const Coverage& coverage = empty_coverage, |
99 |
> |
bool final_report = false) |
100 |
|
{ |
101 |
|
const vector<BowtieHit>& hits = hits_for_read.hits; |
102 |
|
for (size_t i = 0; i < hits.size(); ++i) |
104 |
|
if (hits[i].edit_dist() > max_read_mismatches) |
105 |
|
continue; |
106 |
|
|
107 |
< |
if (report_secondary_alignments) |
107 |
> |
BowtieHit hit = hits[i]; |
108 |
> |
AlignStatus align_status(hit, gtf_junctions, |
109 |
> |
junctions, insertions, deletions, fusions, coverage); |
110 |
> |
hit.alignment_score(align_status._alignment_score); |
111 |
> |
|
112 |
> |
if (report_secondary_alignments || !final_report) |
113 |
|
{ |
114 |
< |
best_hits.hits.push_back(hits[i]); |
114 |
> |
best_hits.hits.push_back(hit); |
115 |
|
} |
116 |
|
else |
117 |
|
{ |
114 |
– |
FragmentAlignmentGrade g(hits[i], gtf_junctions); |
115 |
– |
|
118 |
|
// Is the new status better than the current best one? |
119 |
< |
if (best_grade < g) |
119 |
> |
if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0])) |
120 |
|
{ |
121 |
|
best_hits.hits.clear(); |
122 |
< |
best_grade = g; |
121 |
< |
best_hits.hits.push_back(hits[i]); |
122 |
> |
best_hits.hits.push_back(hit); |
123 |
|
} |
124 |
< |
else if (!(g < best_grade)) // is it just as good? |
124 |
> |
else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good? |
125 |
|
{ |
126 |
< |
best_grade.num_alignments++; |
126 |
< |
best_hits.hits.push_back(hits[i]); |
126 |
> |
best_hits.hits.push_back(hit); |
127 |
|
} |
128 |
|
} |
129 |
|
} |
130 |
|
|
131 |
< |
if (report_secondary_alignments && best_hits.hits.size() > 0) |
131 |
> |
if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0) |
132 |
> |
{ |
133 |
> |
sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment()); |
134 |
> |
} |
135 |
> |
|
136 |
> |
if (final_report) |
137 |
|
{ |
138 |
< |
cmp_read_alignment cmp(gtf_junctions); |
139 |
< |
sort(best_hits.hits.begin(), best_hits.hits.end(), cmp); |
135 |
< |
best_grade = FragmentAlignmentGrade(best_hits.hits[0], gtf_junctions); |
136 |
< |
best_grade.num_alignments = best_hits.hits.size(); |
138 |
> |
if (best_hits.hits.size() > max_multihits) |
139 |
> |
best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end()); |
140 |
|
} |
141 |
|
} |
142 |
|
|
143 |
< |
void set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade) |
143 |
> |
bool set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade) |
144 |
|
{ |
145 |
|
// max mate inner distance (genomic) |
146 |
|
int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev; |
148 |
|
max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev; |
149 |
|
|
150 |
|
bool fusion = false; |
151 |
< |
if (fusion_search) |
151 |
> |
bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING; |
152 |
> |
bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING; |
153 |
> |
if (left_fusion && right_fusion) |
154 |
> |
return false; |
155 |
> |
|
156 |
> |
fusion = left_fusion || right_fusion; |
157 |
> |
if (!fusion && lh.ref_id() != rh.ref_id()) |
158 |
> |
fusion = true; |
159 |
> |
|
160 |
> |
if (!fusion && lh.ref_id() == rh.ref_id()) |
161 |
|
{ |
162 |
< |
bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING; |
151 |
< |
bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING; |
152 |
< |
if (left_fusion && right_fusion) |
153 |
< |
return; |
154 |
< |
|
155 |
< |
fusion = left_fusion || right_fusion; |
156 |
< |
if (!fusion && lh.ref_id() != rh.ref_id()) |
162 |
> |
if (lh.antisense_align() == rh.antisense_align()) |
163 |
|
fusion = true; |
164 |
< |
|
159 |
< |
if (!fusion && lh.ref_id() == rh.ref_id()) |
164 |
> |
else |
165 |
|
{ |
166 |
< |
if (lh.antisense_align() == rh.antisense_align()) |
167 |
< |
fusion = true; |
166 |
> |
int inter_dist = 0; |
167 |
> |
if (lh.antisense_align()) |
168 |
> |
inter_dist = lh.left() - rh.right(); |
169 |
|
else |
170 |
< |
{ |
171 |
< |
int inter_dist = 0; |
172 |
< |
if (lh.antisense_align()) |
173 |
< |
inter_dist = lh.left() - rh.right(); |
168 |
< |
else |
169 |
< |
inter_dist = rh.left() - lh.right(); |
170 |
< |
|
171 |
< |
if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist) |
172 |
< |
fusion = true; |
173 |
< |
} |
170 |
> |
inter_dist = rh.left() - lh.right(); |
171 |
> |
|
172 |
> |
if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist) |
173 |
> |
fusion = true; |
174 |
|
} |
175 |
|
} |
176 |
|
|
177 |
|
grade = InsertAlignmentGrade(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion); |
178 |
+ |
|
179 |
+ |
return true; |
180 |
|
} |
181 |
|
|
182 |
|
struct cmp_pair_alignment |
205 |
|
const HitsForRead& right_hits, |
206 |
|
InsertAlignmentGrade& best_grade, |
207 |
|
vector<pair<BowtieHit, BowtieHit> >& best_hits, |
208 |
< |
const JunctionSet& gtf_junctions) |
208 |
> |
const JunctionSet& gtf_junctions, |
209 |
> |
const JunctionSet& junctions = empty_junctions, |
210 |
> |
const InsertionSet& insertions = empty_insertions, |
211 |
> |
const DeletionSet& deletions = empty_deletions, |
212 |
> |
const FusionSet& fusions = empty_fusions, |
213 |
> |
const Coverage& coverage = empty_coverage, |
214 |
> |
bool final_report = false) |
215 |
|
{ |
216 |
|
const vector<BowtieHit>& left = left_hits.hits; |
217 |
|
const vector<BowtieHit>& right = right_hits.hits; |
218 |
|
|
219 |
|
for (size_t i = 0; i < left.size(); ++i) |
220 |
|
{ |
221 |
< |
const BowtieHit& lh = left[i]; |
222 |
< |
if (lh.edit_dist() > max_read_mismatches) continue; |
221 |
> |
if (left[i].edit_dist() > max_read_mismatches) continue; |
222 |
> |
|
223 |
> |
BowtieHit lh = left[i]; |
224 |
> |
AlignStatus align_status(lh, gtf_junctions, |
225 |
> |
junctions, insertions, deletions, fusions, coverage); |
226 |
> |
lh.alignment_score(align_status._alignment_score); |
227 |
> |
|
228 |
|
for (size_t j = 0; j < right.size(); ++j) |
229 |
|
{ |
230 |
< |
const BowtieHit& rh = right[j]; |
231 |
< |
if (rh.edit_dist() > max_read_mismatches) continue; |
230 |
> |
if (right[j].edit_dist() > max_read_mismatches) continue; |
231 |
> |
|
232 |
> |
BowtieHit rh = right[j]; |
233 |
> |
AlignStatus align_status(rh, gtf_junctions, |
234 |
> |
junctions, insertions, deletions, fusions, coverage); |
235 |
> |
rh.alignment_score(align_status._alignment_score); |
236 |
> |
InsertAlignmentGrade g; |
237 |
> |
bool allowed = set_insert_alignment_grade(lh, rh, g); |
238 |
|
|
239 |
< |
InsertAlignmentGrade g; set_insert_alignment_grade(lh, rh, g); |
239 |
> |
if (!allowed) continue; |
240 |
> |
if (!fusion_search && !report_discordant_pair_alignments && g.is_fusion()) continue; |
241 |
|
|
242 |
< |
if (report_secondary_alignments) |
242 |
> |
if (report_secondary_alignments || !final_report) |
243 |
|
{ |
244 |
|
best_hits.push_back(make_pair(lh, rh)); |
245 |
|
} |
255 |
|
else if (!(g < best_grade)) |
256 |
|
{ |
257 |
|
best_hits.push_back(make_pair(lh, rh)); |
238 |
– |
best_grade.num_alignments++; |
258 |
|
} |
259 |
|
} |
260 |
|
} |
261 |
|
} |
262 |
|
|
263 |
< |
if (report_secondary_alignments && best_hits.size() > 0) |
263 |
> |
if ((report_secondary_alignments || !final_report) && best_hits.size() > 0) |
264 |
|
{ |
265 |
|
cmp_pair_alignment cmp(gtf_junctions); |
266 |
|
sort(best_hits.begin(), best_hits.end(), cmp); |
267 |
|
set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade); |
249 |
– |
best_grade.num_alignments = best_hits.size(); |
268 |
|
} |
269 |
+ |
|
270 |
+ |
if (final_report) |
271 |
+ |
{ |
272 |
+ |
if (best_hits.size() > max_multihits) |
273 |
+ |
best_hits.erase(best_hits.begin() + max_multihits, best_hits.end()); |
274 |
+ |
} |
275 |
+ |
|
276 |
+ |
best_grade.num_alignments = best_hits.size(); |
277 |
|
} |
278 |
|
|
279 |
|
enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT}; |
359 |
|
const BowtieHit& bh, |
360 |
|
const char* bwt_buf, |
361 |
|
const char* read_alt_name, |
336 |
– |
const FragmentAlignmentGrade& grade, |
362 |
|
FragmentType insert_side, |
363 |
|
int num_hits, |
364 |
|
const BowtieHit* next_hit, |
371 |
|
tokenize(bwt_buf, "\t", sam_toks); |
372 |
|
|
373 |
|
string ref_name = sam_toks[2], ref_name2 = ""; |
374 |
< |
char rebuf2[2048] = {0}; |
350 |
< |
char cigar1[256] = {0}, cigar2[256] = {0}; |
374 |
> |
char cigar1[1024] = {0}, cigar2[1024] = {0}; |
375 |
|
string left_seq, right_seq, left_qual, right_qual; |
376 |
|
int left = -1, left1 = -1, left2 = -1; |
377 |
|
bool fusion_alignment = false; |
378 |
|
size_t XF_index = 0; |
379 |
|
for (size_t i = 11; i < sam_toks.size(); ++i) |
380 |
|
{ |
381 |
< |
const string& tok = sam_toks[i]; |
381 |
> |
string& tok = sam_toks[i]; |
382 |
|
if (strncmp(tok.c_str(), "XF", 2) == 0) |
383 |
|
{ |
384 |
|
fusion_alignment = true; |
400 |
|
|
401 |
|
break; |
402 |
|
} |
403 |
+ |
|
404 |
+ |
else if (strncmp(tok.c_str(), "AS", 2) == 0) |
405 |
+ |
{ |
406 |
+ |
char AS_score[128] = {0}; |
407 |
+ |
sprintf(AS_score, "AS:i:%d", bh.alignment_score()); |
408 |
+ |
tok = AS_score; |
409 |
+ |
} |
410 |
|
} |
411 |
|
|
412 |
|
string qname(read_alt_name); |
433 |
|
|
434 |
|
int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0; |
435 |
|
int mapQ=255; |
436 |
< |
if (grade.num_alignments > 1) { |
437 |
< |
double err_prob = 1 - (1.0 / grade.num_alignments); |
436 |
> |
if (num_hits > 1) { |
437 |
> |
double err_prob = 1 - (1.0 / num_hits); |
438 |
|
mapQ = (int)(-10.0 * log(err_prob) / log(10.0)); |
439 |
|
} |
440 |
|
int tlen =atoi(sam_toks[8].c_str()); //TLEN |
505 |
|
flag |= 0x100; |
506 |
|
|
507 |
|
string ref_name = sam_toks[2], ref_name2 = ""; |
508 |
< |
char rebuf2[2048] = {0}; |
478 |
< |
char cigar1[256] = {0}, cigar2[256] = {0}; |
508 |
> |
char cigar1[1024] = {0}, cigar2[1024] = {0}; |
509 |
|
string left_seq, right_seq, left_qual, right_qual; |
510 |
|
int left = -1, left1 = -1, left2 = -1; |
511 |
|
bool fusion_alignment = false; |
512 |
|
size_t XF_tok_idx = 11; |
513 |
|
for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx) |
514 |
|
{ |
515 |
< |
const string& tok = sam_toks[XF_tok_idx]; |
515 |
> |
string& tok = sam_toks[XF_tok_idx]; |
516 |
|
if (strncmp(tok.c_str(), "XF", 2) == 0) |
517 |
|
{ |
518 |
|
fusion_alignment = true; |
533 |
|
|
534 |
|
break; |
535 |
|
} |
536 |
+ |
|
537 |
+ |
else if (strncmp(tok.c_str(), "AS", 2) == 0) |
538 |
+ |
{ |
539 |
+ |
char AS_score[128] = {0}; |
540 |
+ |
sprintf(AS_score, "AS:i:%d", bh.alignment_score()); |
541 |
+ |
tok = AS_score; |
542 |
+ |
} |
543 |
|
} |
544 |
|
|
545 |
|
int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0; |
636 |
|
|
637 |
|
void print_sam_for_single(const RefSequenceTable& rt, |
638 |
|
const HitsForRead& hits, |
602 |
– |
const FragmentAlignmentGrade& grade, |
639 |
|
FragmentType frag_type, |
640 |
|
Read& read, |
641 |
|
GBamWriter& bam_writer, |
669 |
|
bh, |
670 |
|
bh.hitfile_rec().c_str(), |
671 |
|
read.alt_name.c_str(), |
636 |
– |
grade, |
672 |
|
frag_type, |
673 |
|
hits.hits.size(), |
674 |
|
(i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL, |
761 |
|
InsertionSet& insertions, |
762 |
|
DeletionSet& deletions) |
763 |
|
{ |
764 |
< |
for (size_t i = 0; i < hits.hits.size(); ++i) |
764 |
> |
for (size_t i = 0; i < hits.hits.size(); ++i) |
765 |
> |
{ |
766 |
> |
const BowtieHit& bh = hits.hits[i]; |
767 |
> |
insertions_from_alignment(bh, insertions); |
768 |
> |
deletions_from_alignment(bh, deletions); |
769 |
> |
} |
770 |
> |
} |
771 |
> |
|
772 |
> |
void update_coverage(const HitsForRead& hits, |
773 |
> |
Coverage& coverage) |
774 |
> |
{ |
775 |
> |
for (size_t i = 0; i < hits.hits.size(); ++i) |
776 |
> |
{ |
777 |
> |
const BowtieHit& hit = hits.hits[i]; |
778 |
> |
const vector<CigarOp>& cigar = hit.cigar(); |
779 |
> |
unsigned int positionInGenome = hit.left(); |
780 |
> |
RefID ref_id = hit.ref_id(); |
781 |
> |
|
782 |
> |
for(size_t c = 0; c < cigar.size(); ++c) |
783 |
|
{ |
784 |
< |
const BowtieHit& bh = hits.hits[i]; |
785 |
< |
insertions_from_alignment(bh, insertions); |
786 |
< |
deletions_from_alignment(bh, deletions); |
784 |
> |
int opcode = cigar[c].opcode; |
785 |
> |
int length = cigar[c].length; |
786 |
> |
switch(opcode) |
787 |
> |
{ |
788 |
> |
case REF_SKIP: |
789 |
> |
case MATCH: |
790 |
> |
case DEL: |
791 |
> |
if (opcode == MATCH) |
792 |
> |
coverage.add_coverage(ref_id, positionInGenome, length); |
793 |
> |
|
794 |
> |
positionInGenome += length; |
795 |
> |
break; |
796 |
> |
case rEF_SKIP: |
797 |
> |
case mATCH: |
798 |
> |
case dEL: |
799 |
> |
positionInGenome -= length; |
800 |
> |
|
801 |
> |
if (opcode == mATCH) |
802 |
> |
coverage.add_coverage(ref_id, positionInGenome + 1, length); |
803 |
> |
break; |
804 |
> |
case FUSION_FF: |
805 |
> |
case FUSION_FR: |
806 |
> |
case FUSION_RF: |
807 |
> |
case FUSION_RR: |
808 |
> |
positionInGenome = length; |
809 |
> |
ref_id = hit.ref_id2(); |
810 |
> |
break; |
811 |
> |
default: |
812 |
> |
break; |
813 |
> |
} |
814 |
|
} |
815 |
+ |
} |
816 |
|
} |
817 |
|
|
818 |
|
|
738 |
– |
FusionSet empty_fusions; |
819 |
|
void update_fusions(const HitsForRead& hits, |
820 |
|
RefSequenceTable& rt, |
821 |
|
FusionSet& fusions, |
822 |
< |
FusionSet& fusions_ref = empty_fusions) |
822 |
> |
const FusionSet& fusions_ref = empty_fusions) |
823 |
|
{ |
824 |
|
if (hits.hits.size() > fusion_multireads) |
825 |
|
return; |
832 |
|
if (bh.edit_dist() > fusion_read_mismatches) |
833 |
|
continue; |
834 |
|
|
755 |
– |
// daehwan |
756 |
– |
#if 0 |
757 |
– |
vector<Fusion> new_fusions; |
758 |
– |
fusions_from_spliced_hit(bh, new_fusions); |
759 |
– |
|
760 |
– |
for(size_t i = 0; i < new_fusions.size(); ++i) |
761 |
– |
{ |
762 |
– |
Fusion fusion = new_fusions[i]; |
763 |
– |
if (fusion.left == 110343414 && fusion.right == 80211173 && hits.hits.size() > 1) |
764 |
– |
{ |
765 |
– |
for (size_t t = 0; t < hits.hits.size(); ++t) |
766 |
– |
{ |
767 |
– |
const BowtieHit& lh = hits.hits[t]; |
768 |
– |
cout << "insert id: " << lh.insert_id() << endl; |
769 |
– |
cout << (lh.antisense_align() ? "-" : "+") << endl; |
770 |
– |
cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl; |
771 |
– |
cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl; |
772 |
– |
} |
773 |
– |
} |
774 |
– |
} |
775 |
– |
#endif |
776 |
– |
|
835 |
|
fusions_from_alignment(bh, fusions, rt, update_stat); |
836 |
|
|
837 |
|
if (update_stat) |
842 |
|
void update_junctions(const HitsForRead& hits, |
843 |
|
JunctionSet& junctions) |
844 |
|
{ |
845 |
< |
for (size_t i = 0; i < hits.hits.size(); ++i) |
846 |
< |
{ |
847 |
< |
const BowtieHit& bh = hits.hits[i]; |
848 |
< |
junctions_from_alignment(bh, junctions); |
849 |
< |
} |
845 |
> |
for (size_t i = 0; i < hits.hits.size(); ++i) |
846 |
> |
{ |
847 |
> |
const BowtieHit& bh = hits.hits[i]; |
848 |
> |
junctions_from_alignment(bh, junctions); |
849 |
> |
} |
850 |
|
} |
851 |
|
|
852 |
|
// Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file |
923 |
|
{ |
924 |
|
HitsForRead best_hits; |
925 |
|
best_hits.insert_id = curr_left_obs_order; |
868 |
– |
FragmentAlignmentGrade best_grade; |
926 |
|
|
927 |
|
// Process hits for left singleton, select best alignments |
928 |
< |
read_best_alignments(curr_left_hit_group, best_grade, best_hits, *gtf_junctions); |
928 |
> |
read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions); |
929 |
> |
update_coverage(best_hits, *coverage); |
930 |
|
update_junctions(best_hits, *junctions); |
931 |
+ |
update_insertions_and_deletions(best_hits, *insertions, *deletions); |
932 |
|
update_fusions(best_hits, *rt, *fusions); |
933 |
|
|
934 |
|
// Get next hit group |
943 |
|
{ |
944 |
|
HitsForRead best_hits; |
945 |
|
best_hits.insert_id = curr_right_obs_order; |
887 |
– |
FragmentAlignmentGrade best_grade; |
946 |
|
if (curr_right_obs_order >= begin_id) |
947 |
|
{ |
948 |
|
// Process hit for right singleton, select best alignments |
949 |
< |
read_best_alignments(curr_right_hit_group, best_grade, best_hits, *gtf_junctions); |
949 |
> |
read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions); |
950 |
> |
update_coverage(best_hits, *coverage); |
951 |
|
update_junctions(best_hits, *junctions); |
952 |
+ |
update_insertions_and_deletions(best_hits, *insertions, *deletions); |
953 |
|
update_fusions(best_hits, *rt, *fusions); |
954 |
|
} |
955 |
|
|
973 |
|
best_hits, |
974 |
|
*gtf_junctions); |
975 |
|
|
976 |
< |
HitsForRead single_best_hits; |
977 |
< |
single_best_hits.insert_id = curr_left_obs_order; |
976 |
> |
HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order; |
977 |
> |
HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order; |
978 |
|
|
919 |
– |
// this is too much memory copy - we will make it efficient soon |
979 |
|
for (size_t i = 0; i < best_hits.size(); ++i) |
980 |
|
{ |
981 |
< |
single_best_hits.hits.push_back(best_hits[i].first); |
982 |
< |
single_best_hits.hits.push_back(best_hits[i].second); |
981 |
> |
best_left_hit_group.hits.push_back(best_hits[i].first); |
982 |
> |
best_right_hit_group.hits.push_back(best_hits[i].second); |
983 |
|
} |
984 |
|
|
985 |
< |
update_junctions(single_best_hits, *junctions); |
986 |
< |
update_fusions(single_best_hits, *rt, *fusions); |
985 |
> |
update_coverage(best_left_hit_group, *coverage); |
986 |
> |
update_junctions(best_left_hit_group, *junctions); |
987 |
> |
update_insertions_and_deletions(best_left_hit_group, *insertions, *deletions); |
988 |
> |
update_fusions(best_left_hit_group, *rt, *fusions); |
989 |
> |
|
990 |
> |
update_coverage(best_right_hit_group, *coverage); |
991 |
> |
update_junctions(best_right_hit_group, *junctions); |
992 |
> |
update_insertions_and_deletions(best_right_hit_group, *insertions, *deletions); |
993 |
> |
update_fusions(best_right_hit_group, *rt, *fusions); |
994 |
|
|
995 |
|
l_hs.next_read_hits(curr_left_hit_group); |
996 |
|
curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); |
1009 |
|
string left_map_fname; |
1010 |
|
string right_map_fname; |
1011 |
|
|
946 |
– |
JunctionSet* junctions; |
947 |
– |
JunctionSet* gtf_junctions; |
948 |
– |
FusionSet* fusions; |
1012 |
|
RefSequenceTable* rt; |
1013 |
|
|
1014 |
+ |
JunctionSet* gtf_junctions; |
1015 |
+ |
|
1016 |
|
uint64_t begin_id; |
1017 |
|
uint64_t end_id; |
1018 |
|
int64_t left_map_offset; |
1019 |
|
int64_t right_map_offset; |
1020 |
+ |
|
1021 |
+ |
JunctionSet* junctions; |
1022 |
+ |
InsertionSet* insertions; |
1023 |
+ |
DeletionSet* deletions; |
1024 |
+ |
FusionSet* fusions; |
1025 |
+ |
|
1026 |
+ |
Coverage* coverage; |
1027 |
|
}; |
1028 |
|
|
1029 |
|
struct ReportWorker |
1077 |
|
uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); |
1078 |
|
uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); |
1079 |
|
|
1080 |
+ |
const bool final_report = true; |
1081 |
+ |
|
1082 |
|
// While we still have unreported hits... |
1083 |
|
Read l_read; |
1084 |
|
Read r_read; |
1092 |
|
{ |
1093 |
|
HitsForRead best_hits; |
1094 |
|
best_hits.insert_id = curr_left_obs_order; |
1021 |
– |
FragmentAlignmentGrade grade; |
1095 |
|
bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read, |
1096 |
|
reads_format, false, begin_id, end_id, |
1097 |
|
left_um_out.file, false); |
1098 |
|
//assert(got_read); |
1099 |
|
|
1100 |
|
if (right_reads_file.file()) { |
1028 |
– |
/* |
1029 |
– |
fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(), |
1030 |
– |
l_read.seq.c_str(), l_read.qual.c_str()); |
1031 |
– |
*/ |
1101 |
|
got_read=right_reads_file.getRead(curr_left_obs_order, r_read, |
1102 |
|
reads_format, false, begin_id, end_id, |
1103 |
|
right_um_out.file, true); |
1107 |
|
exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group); |
1108 |
|
|
1109 |
|
// Process hits for left singleton, select best alignments |
1110 |
< |
read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions); |
1111 |
< |
|
1112 |
< |
if (best_hits.hits.size() > max_multihits) |
1044 |
< |
{ |
1045 |
< |
best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end()); |
1046 |
< |
grade.num_alignments = best_hits.hits.size(); |
1047 |
< |
} |
1110 |
> |
read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions, |
1111 |
> |
*junctions, *insertions, *deletions, *fusions, *coverage, |
1112 |
> |
final_report); |
1113 |
|
|
1114 |
|
if (best_hits.hits.size() > 0) |
1115 |
|
{ |
1119 |
|
|
1120 |
|
print_sam_for_single(*rt, |
1121 |
|
best_hits, |
1057 |
– |
grade, |
1122 |
|
(right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT), |
1123 |
|
l_read, |
1124 |
|
bam_writer, |
1136 |
|
{ |
1137 |
|
HitsForRead best_hits; |
1138 |
|
best_hits.insert_id = curr_right_obs_order; |
1075 |
– |
FragmentAlignmentGrade grade; |
1139 |
|
|
1140 |
|
if (curr_right_obs_order >= begin_id) |
1141 |
|
{ |
1156 |
|
exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group); |
1157 |
|
|
1158 |
|
// Process hit for right singleton, select best alignments |
1159 |
< |
read_best_alignments(curr_right_hit_group, grade, best_hits, *gtf_junctions); |
1159 |
> |
read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions, |
1160 |
> |
*junctions, *insertions, *deletions, *fusions, *coverage, |
1161 |
> |
final_report); |
1162 |
|
|
1098 |
– |
if (best_hits.hits.size() > max_multihits) |
1099 |
– |
{ |
1100 |
– |
best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end()); |
1101 |
– |
grade.num_alignments = best_hits.hits.size(); |
1102 |
– |
} |
1103 |
– |
|
1163 |
|
if (best_hits.hits.size() > 0) |
1164 |
|
{ |
1165 |
|
update_junctions(best_hits, *final_junctions); |
1168 |
|
|
1169 |
|
print_sam_for_single(*rt, |
1170 |
|
best_hits, |
1112 |
– |
grade, |
1171 |
|
FRAG_RIGHT, |
1172 |
|
r_read, |
1173 |
|
bam_writer, |
1203 |
|
HitsForRead right_best_hits; |
1204 |
|
right_best_hits.insert_id = curr_right_obs_order; |
1205 |
|
|
1206 |
< |
FragmentAlignmentGrade grade; |
1207 |
< |
read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions); |
1206 |
> |
read_best_alignments(curr_right_hit_group, right_best_hits, *gtf_junctions, |
1207 |
> |
*junctions, *insertions, *deletions, *fusions, *coverage, |
1208 |
> |
final_report); |
1209 |
|
|
1151 |
– |
if (right_best_hits.hits.size() > max_multihits) |
1152 |
– |
{ |
1153 |
– |
right_best_hits.hits.erase(right_best_hits.hits.begin() + max_multihits, right_best_hits.hits.end()); |
1154 |
– |
grade.num_alignments = right_best_hits.hits.size(); |
1155 |
– |
} |
1156 |
– |
|
1210 |
|
if (right_best_hits.hits.size() > 0) |
1211 |
|
{ |
1212 |
|
update_junctions(right_best_hits, *final_junctions); |
1215 |
|
|
1216 |
|
print_sam_for_single(*rt, |
1217 |
|
right_best_hits, |
1165 |
– |
grade, |
1218 |
|
FRAG_RIGHT, |
1219 |
|
r_read, |
1220 |
|
bam_writer, |
1229 |
|
bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read, |
1230 |
|
reads_format, false, begin_id, end_id, |
1231 |
|
left_um_out.file, false); |
1180 |
– |
//assert(got_read); |
1181 |
– |
/* |
1182 |
– |
fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(), |
1183 |
– |
l_read.seq.c_str(), l_read.qual.c_str()); |
1184 |
– |
*/ |
1185 |
– |
FragmentAlignmentGrade grade; |
1186 |
– |
// Process hits for left singleton, select best alignments |
1187 |
– |
read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions); |
1232 |
|
|
1233 |
< |
if (left_best_hits.hits.size() > max_multihits) |
1234 |
< |
{ |
1235 |
< |
left_best_hits.hits.erase(left_best_hits.hits.begin() + max_multihits, left_best_hits.hits.end()); |
1236 |
< |
grade.num_alignments = left_best_hits.hits.size(); |
1193 |
< |
} |
1233 |
> |
// Process hits for left singleton, select best alignments |
1234 |
> |
read_best_alignments(curr_left_hit_group, left_best_hits, *gtf_junctions, |
1235 |
> |
*junctions, *insertions, *deletions, *fusions, *coverage, |
1236 |
> |
final_report); |
1237 |
|
|
1238 |
|
if (left_best_hits.hits.size() > 0) |
1239 |
|
{ |
1243 |
|
|
1244 |
|
print_sam_for_single(*rt, |
1245 |
|
left_best_hits, |
1203 |
– |
grade, |
1246 |
|
FRAG_LEFT, |
1247 |
|
l_read, |
1248 |
|
bam_writer, |
1256 |
|
InsertAlignmentGrade grade; |
1257 |
|
pair_best_alignments(curr_left_hit_group, |
1258 |
|
curr_right_hit_group, |
1259 |
< |
grade, |
1260 |
< |
best_hits, |
1261 |
< |
*gtf_junctions); |
1259 |
> |
grade, best_hits, *gtf_junctions, |
1260 |
> |
*junctions, *insertions, *deletions, *fusions, *coverage, |
1261 |
> |
final_report); |
1262 |
|
|
1263 |
< |
if (best_hits.size() > max_multihits) |
1264 |
< |
{ |
1265 |
< |
best_hits.erase(best_hits.begin() + max_multihits, best_hits.end()); |
1224 |
< |
grade.num_alignments = best_hits.size(); |
1225 |
< |
} |
1226 |
< |
|
1227 |
< |
//hits for both left and right reads |
1228 |
< |
HitsForRead single_best_hits; |
1229 |
< |
single_best_hits.insert_id = curr_left_obs_order; |
1263 |
> |
HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order; |
1264 |
> |
HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order; |
1265 |
> |
|
1266 |
|
for (size_t i = 0; i < best_hits.size(); ++i) |
1267 |
|
{ |
1268 |
< |
single_best_hits.hits.push_back(best_hits[i].first); |
1269 |
< |
single_best_hits.hits.push_back(best_hits[i].second); |
1268 |
> |
best_left_hit_group.hits.push_back(best_hits[i].first); |
1269 |
> |
best_right_hit_group.hits.push_back(best_hits[i].second); |
1270 |
|
} |
1271 |
< |
|
1272 |
< |
if (single_best_hits.hits.size() > 0) |
1271 |
> |
|
1272 |
> |
if (best_hits.size() > 0) |
1273 |
|
{ |
1274 |
< |
update_junctions(single_best_hits, *final_junctions); |
1275 |
< |
update_insertions_and_deletions(single_best_hits, *final_insertions, *final_deletions); |
1274 |
> |
update_junctions(best_left_hit_group, *final_junctions); |
1275 |
> |
update_insertions_and_deletions(best_left_hit_group, *final_insertions, *final_deletions); |
1276 |
> |
update_fusions(best_left_hit_group, *rt, *final_fusions, *fusions); |
1277 |
> |
|
1278 |
> |
update_junctions(best_right_hit_group, *final_junctions); |
1279 |
> |
update_insertions_and_deletions(best_right_hit_group, *final_insertions, *final_deletions); |
1280 |
> |
update_fusions(best_right_hit_group, *rt, *final_fusions, *fusions); |
1281 |
|
|
1282 |
|
pair_support(best_hits, *final_fusions, *fusions); |
1242 |
– |
update_fusions(single_best_hits, *rt, *final_fusions, *fusions); |
1283 |
|
|
1284 |
|
print_sam_for_pair(*rt, |
1285 |
|
best_hits, |
1302 |
|
} |
1303 |
|
} //while we still have unreported hits.. |
1304 |
|
//print the remaining unmapped reads at the end of each reads' stream |
1305 |
+ |
|
1306 |
|
left_reads_file.getRead(VMAXINT32, l_read, |
1307 |
|
reads_format, false, begin_id, end_id, |
1308 |
|
left_um_out.file, false); |
1311 |
|
reads_format, false, begin_id, end_id, |
1312 |
|
right_um_out.file, false); |
1313 |
|
|
1314 |
+ |
|
1315 |
+ |
// pclose (pipe close), which waits for a process to end, seems to conflict with boost::thread::join somehow, |
1316 |
+ |
// resulting in deadlock like behavior. |
1317 |
+ |
#if 0 |
1318 |
|
left_um_out.close(); |
1319 |
|
right_um_out.close(); |
1320 |
+ |
#endif |
1321 |
|
|
1322 |
|
for (size_t i = 0; i < hit_factories.size(); ++i) |
1323 |
|
delete hit_factories[i]; |
1337 |
|
string right_um_fname; |
1338 |
|
|
1339 |
|
JunctionSet* gtf_junctions; |
1340 |
+ |
|
1341 |
|
JunctionSet* junctions; |
1342 |
+ |
InsertionSet* insertions; |
1343 |
+ |
DeletionSet* deletions; |
1344 |
|
FusionSet* fusions; |
1345 |
+ |
Coverage* coverage; |
1346 |
+ |
|
1347 |
|
JunctionSet* final_junctions; |
1348 |
|
InsertionSet* final_insertions; |
1349 |
|
DeletionSet* final_deletions; |
1383 |
|
JunctionSet gtf_junctions; |
1384 |
|
if (!gtf_juncs.empty()) |
1385 |
|
{ |
1386 |
< |
char splice_buf[2048]; |
1386 |
> |
char splice_buf[4096]; |
1387 |
|
FILE* splice_coords = fopen(gtf_juncs.c_str(), "r"); |
1388 |
|
if (splice_coords) |
1389 |
|
{ |
1410 |
|
uint32_t right_coord = atoi(scan_right_coord); |
1411 |
|
bool antisense = *orientation == '-'; |
1412 |
|
|
1413 |
< |
// add 1 to left_coord to meet BED format |
1363 |
< |
gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats())); |
1413 |
> |
gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats())); |
1414 |
|
} |
1415 |
|
} |
1416 |
|
fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str()); |
1434 |
|
} |
1435 |
|
|
1436 |
|
JunctionSet vjunctions[num_threads]; |
1437 |
+ |
InsertionSet vinsertions[num_threads]; |
1438 |
+ |
DeletionSet vdeletions[num_threads]; |
1439 |
|
FusionSet vfusions[num_threads]; |
1440 |
+ |
Coverage vcoverages[num_threads]; |
1441 |
+ |
|
1442 |
|
vector<boost::thread*> threads; |
1443 |
|
for (int i = 0; i < num_threads; ++i) |
1444 |
|
{ |
1446 |
|
|
1447 |
|
worker.left_map_fname = left_map_fname; |
1448 |
|
worker.right_map_fname = right_map_fname; |
1449 |
+ |
worker.rt = &rt; |
1450 |
+ |
worker.gtf_junctions = >f_junctions; |
1451 |
+ |
|
1452 |
|
worker.junctions = &vjunctions[i]; |
1453 |
+ |
worker.insertions = &vinsertions[i]; |
1454 |
+ |
worker.deletions = &vdeletions[i]; |
1455 |
|
worker.fusions = &vfusions[i]; |
1456 |
< |
worker.gtf_junctions = >f_junctions; |
1398 |
< |
worker.rt = &rt; |
1456 |
> |
worker.coverage = &vcoverages[i]; |
1457 |
|
|
1458 |
|
worker.right_map_offset = 0; |
1459 |
|
|
1488 |
|
} |
1489 |
|
threads.clear(); |
1490 |
|
|
1491 |
< |
JunctionSet junctions = vjunctions[0]; |
1492 |
< |
FusionSet fusions = vfusions[0]; |
1491 |
> |
JunctionSet& junctions = vjunctions[0]; |
1492 |
> |
InsertionSet& insertions = vinsertions[0]; |
1493 |
> |
DeletionSet& deletions = vdeletions[0]; |
1494 |
> |
FusionSet& fusions = vfusions[0]; |
1495 |
> |
Coverage& coverage = vcoverages[0]; |
1496 |
|
for (size_t i = 1; i < num_threads; ++i) |
1497 |
|
{ |
1498 |
|
merge_with(junctions, vjunctions[i]); |
1499 |
|
vjunctions[i].clear(); |
1500 |
|
|
1501 |
+ |
merge_with(insertions, vinsertions[i]); |
1502 |
+ |
vinsertions[i].clear(); |
1503 |
+ |
|
1504 |
+ |
merge_with(deletions, vdeletions[i]); |
1505 |
+ |
vdeletions[i].clear(); |
1506 |
+ |
|
1507 |
|
merge_with(fusions, vfusions[i]); |
1508 |
|
vfusions[i].clear(); |
1509 |
+ |
|
1510 |
+ |
coverage.merge_with(vcoverages[i]); |
1511 |
+ |
vcoverages[i].clear(); |
1512 |
|
} |
1513 |
|
|
1514 |
+ |
coverage.calculate_coverage(); |
1515 |
+ |
|
1516 |
|
size_t num_unfiltered_juncs = junctions.size(); |
1517 |
|
fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs); |
1518 |
|
|
1519 |
|
// Read hits, extract junctions, and toss the ones that arent strongly enough supported. |
1448 |
– |
|
1520 |
|
filter_junctions(junctions, gtf_junctions); |
1521 |
|
|
1522 |
|
//size_t num_juncs_after_filter = junctions.size(); |
1524 |
|
// num_unfiltered_juncs - num_juncs_after_filter); |
1525 |
|
|
1526 |
|
/* |
1527 |
< |
size_t small_overhangs = 0; |
1528 |
< |
for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i) |
1527 |
> |
size_t small_overhangs = 0; |
1528 |
> |
for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i) |
1529 |
|
{ |
1530 |
< |
if (i->second.accepted && |
1531 |
< |
(i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len)) |
1532 |
< |
{ |
1533 |
< |
small_overhangs++; |
1534 |
< |
} |
1530 |
> |
if (i->second.accepted && |
1531 |
> |
(i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len)) |
1532 |
> |
{ |
1533 |
> |
small_overhangs++; |
1534 |
> |
} |
1535 |
|
} |
1536 |
|
|
1537 |
< |
if (small_overhangs >0) |
1538 |
< |
fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs); |
1537 |
> |
if (small_overhangs >0) |
1538 |
> |
fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs); |
1539 |
|
*/ |
1540 |
|
|
1541 |
|
JunctionSet vfinal_junctions[num_threads]; |
1567 |
|
|
1568 |
|
worker.gtf_junctions = >f_junctions; |
1569 |
|
worker.junctions = &junctions; |
1570 |
+ |
worker.insertions = &insertions; |
1571 |
+ |
worker.deletions = &deletions; |
1572 |
|
worker.fusions = &fusions; |
1573 |
+ |
worker.coverage = &coverage; |
1574 |
+ |
|
1575 |
|
worker.final_junctions = &vfinal_junctions[i]; |
1576 |
|
worker.final_insertions = &vfinal_insertions[i]; |
1577 |
|
worker.final_deletions = &vfinal_deletions[i]; |