31 |
|
#include <getopt.h> |
32 |
|
|
33 |
|
#include <boost/thread.hpp> |
34 |
+ |
#include <boost/random/mersenne_twister.hpp> |
35 |
|
|
36 |
|
#include "common.h" |
37 |
|
#include "utils.h" |
45 |
|
#include "wiggles.h" |
46 |
|
#include "tokenize.h" |
47 |
|
#include "reads.h" |
48 |
+ |
#include "coverage.h" |
49 |
|
|
50 |
|
|
51 |
|
#include "inserts.h" |
54 |
|
using namespace seqan; |
55 |
|
using std::set; |
56 |
|
|
57 |
+ |
static const JunctionSet empty_junctions; |
58 |
+ |
static const InsertionSet empty_insertions; |
59 |
+ |
static const DeletionSet empty_deletions; |
60 |
+ |
static const FusionSet empty_fusions; |
61 |
+ |
static const Coverage empty_coverage; |
62 |
+ |
|
63 |
|
// daehwan - this is redundancy, which should be removed. |
64 |
|
void get_seqs(istream& ref_stream, |
65 |
|
RefSequenceTable& rt, |
75 |
|
{ |
76 |
|
name.resize(space_pos); |
77 |
|
} |
78 |
+ |
fprintf(stderr, "\tLoading %s...", name.c_str()); |
79 |
|
seqan::read(ref_stream, *ref_str, Fasta()); |
80 |
+ |
fprintf(stderr, "done\n"); |
81 |
|
|
82 |
|
rt.get_id(name, keep_seqs ? ref_str : NULL, 0); |
83 |
|
} |
85 |
|
|
86 |
|
struct cmp_read_alignment |
87 |
|
{ |
78 |
– |
cmp_read_alignment(const JunctionSet& gtf_juncs) : |
79 |
– |
_gtf_juncs(>f_juncs) {} |
80 |
– |
|
88 |
|
bool operator() (const BowtieHit& l, const BowtieHit& r) const |
89 |
|
{ |
90 |
< |
FragmentAlignmentGrade gl(l, *_gtf_juncs); |
84 |
< |
FragmentAlignmentGrade gr(r, *_gtf_juncs); |
85 |
< |
bool better = gr < gl; |
86 |
< |
bool worse = gl < gr; |
87 |
< |
|
88 |
< |
if (better && !worse) |
89 |
< |
return true; |
90 |
< |
else |
91 |
< |
return false; |
90 |
> |
return l.alignment_score() > r.alignment_score(); |
91 |
|
} |
93 |
– |
|
94 |
– |
const JunctionSet* _gtf_juncs; |
92 |
|
}; |
93 |
|
|
94 |
|
void read_best_alignments(const HitsForRead& hits_for_read, |
98 |
– |
FragmentAlignmentGrade& best_grade, |
95 |
|
HitsForRead& best_hits, |
96 |
< |
const JunctionSet& gtf_junctions) |
96 |
> |
const JunctionSet& gtf_junctions, |
97 |
> |
const JunctionSet& junctions = empty_junctions, |
98 |
> |
const InsertionSet& insertions = empty_insertions, |
99 |
> |
const DeletionSet& deletions = empty_deletions, |
100 |
> |
const FusionSet& fusions = empty_fusions, |
101 |
> |
const Coverage& coverage = empty_coverage, |
102 |
> |
bool final_report = false) |
103 |
|
{ |
104 |
|
const vector<BowtieHit>& hits = hits_for_read.hits; |
105 |
+ |
|
106 |
+ |
if (hits.size() >= max_multihits * 5) |
107 |
+ |
return; |
108 |
+ |
|
109 |
|
for (size_t i = 0; i < hits.size(); ++i) |
110 |
|
{ |
111 |
|
if (hits[i].edit_dist() > max_read_mismatches) |
112 |
|
continue; |
113 |
|
|
114 |
< |
if (report_secondary_alignments) |
114 |
> |
BowtieHit hit = hits[i]; |
115 |
> |
AlignStatus align_status(hit, gtf_junctions, |
116 |
> |
junctions, insertions, deletions, fusions, coverage); |
117 |
> |
hit.alignment_score(align_status._alignment_score); |
118 |
> |
|
119 |
> |
if (report_secondary_alignments || !final_report) |
120 |
|
{ |
121 |
< |
best_hits.hits.push_back(hits[i]); |
121 |
> |
best_hits.hits.push_back(hit); |
122 |
|
} |
123 |
|
else |
124 |
|
{ |
114 |
– |
FragmentAlignmentGrade g(hits[i], gtf_junctions); |
115 |
– |
|
125 |
|
// Is the new status better than the current best one? |
126 |
< |
if (best_grade < g) |
126 |
> |
if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0])) |
127 |
|
{ |
128 |
|
best_hits.hits.clear(); |
129 |
< |
best_grade = g; |
121 |
< |
best_hits.hits.push_back(hits[i]); |
129 |
> |
best_hits.hits.push_back(hit); |
130 |
|
} |
131 |
< |
else if (!(g < best_grade)) // is it just as good? |
131 |
> |
else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good? |
132 |
|
{ |
133 |
< |
best_grade.num_alignments++; |
126 |
< |
best_hits.hits.push_back(hits[i]); |
133 |
> |
best_hits.hits.push_back(hit); |
134 |
|
} |
135 |
|
} |
136 |
|
} |
137 |
|
|
138 |
< |
if (report_secondary_alignments && best_hits.hits.size() > 0) |
138 |
> |
if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0) |
139 |
> |
{ |
140 |
> |
sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment()); |
141 |
> |
} |
142 |
> |
|
143 |
> |
if (final_report) |
144 |
|
{ |
145 |
< |
cmp_read_alignment cmp(gtf_junctions); |
146 |
< |
sort(best_hits.hits.begin(), best_hits.hits.end(), cmp); |
135 |
< |
best_grade = FragmentAlignmentGrade(best_hits.hits[0], gtf_junctions); |
136 |
< |
best_grade.num_alignments = best_hits.hits.size(); |
145 |
> |
if (best_hits.hits.size() > max_multihits) |
146 |
> |
best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end()); |
147 |
|
} |
148 |
|
} |
149 |
|
|
150 |
< |
void set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade) |
150 |
> |
bool set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, const JunctionSet& junctions, InsertAlignmentGrade& grade) |
151 |
|
{ |
142 |
– |
// max mate inner distance (genomic) |
143 |
– |
int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev; |
144 |
– |
if (max_mate_inner_dist == -1) |
145 |
– |
max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev; |
146 |
– |
|
152 |
|
bool fusion = false; |
153 |
< |
if (fusion_search) |
153 |
> |
bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING; |
154 |
> |
bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING; |
155 |
> |
if (left_fusion && right_fusion) |
156 |
> |
return false; |
157 |
> |
|
158 |
> |
fusion = left_fusion || right_fusion; |
159 |
> |
if (!fusion && lh.ref_id() != rh.ref_id()) |
160 |
> |
fusion = true; |
161 |
> |
|
162 |
> |
if (!fusion && lh.ref_id() == rh.ref_id()) |
163 |
|
{ |
164 |
< |
bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING; |
151 |
< |
bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING; |
152 |
< |
if (left_fusion && right_fusion) |
153 |
< |
return; |
154 |
< |
|
155 |
< |
fusion = left_fusion || right_fusion; |
156 |
< |
if (!fusion && lh.ref_id() != rh.ref_id()) |
164 |
> |
if (lh.antisense_align() == rh.antisense_align()) |
165 |
|
fusion = true; |
166 |
< |
|
159 |
< |
if (!fusion && lh.ref_id() == rh.ref_id()) |
166 |
> |
else |
167 |
|
{ |
168 |
< |
if (lh.antisense_align() == rh.antisense_align()) |
169 |
< |
fusion = true; |
168 |
> |
int inner_dist = 0; |
169 |
> |
if (lh.antisense_align()) |
170 |
> |
// used rh.left() instead of rh.right() for the cases, |
171 |
> |
// where reads overlap with each other or reads span introns |
172 |
> |
inner_dist = lh.left() - rh.left(); |
173 |
|
else |
174 |
< |
{ |
175 |
< |
int inter_dist = 0; |
176 |
< |
if (lh.antisense_align()) |
177 |
< |
inter_dist = lh.left() - rh.right(); |
168 |
< |
else |
169 |
< |
inter_dist = rh.left() - lh.right(); |
170 |
< |
|
171 |
< |
if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist) |
172 |
< |
fusion = true; |
173 |
< |
} |
174 |
> |
inner_dist = rh.left() - lh.left(); |
175 |
> |
|
176 |
> |
if (inner_dist < 0 || inner_dist > (int)fusion_min_dist) |
177 |
> |
fusion = true; |
178 |
|
} |
179 |
|
} |
180 |
|
|
181 |
< |
grade = InsertAlignmentGrade(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion); |
181 |
> |
// a read contains its partner, in which case the paired mapping will be ignored. |
182 |
> |
if (!fusion) |
183 |
> |
{ |
184 |
> |
if (lh.left() <= rh.left() && lh.right() >= rh.right() && lh.right() - lh.left() > rh.right() - rh.left()) |
185 |
> |
return false; |
186 |
> |
else if (rh.left() <= lh.left() && rh.right() >= lh.right() && rh.right() - rh.left() > lh.right() - lh.left()) |
187 |
> |
return false; |
188 |
> |
} |
189 |
> |
|
190 |
> |
grade = InsertAlignmentGrade(lh, rh, junctions, fusion); |
191 |
> |
|
192 |
> |
return true; |
193 |
|
} |
194 |
|
|
195 |
|
struct cmp_pair_alignment |
196 |
|
{ |
197 |
< |
cmp_pair_alignment(const JunctionSet& gtf_juncs) : |
198 |
< |
_gtf_juncs(>f_juncs) {} |
197 |
> |
cmp_pair_alignment(const JunctionSet& junctions) : |
198 |
> |
_junctions(&junctions) {} |
199 |
|
|
200 |
|
bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const |
201 |
|
{ |
202 |
< |
InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, gl); |
203 |
< |
InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, gr); |
202 |
> |
InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, *_junctions, gl); |
203 |
> |
InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, *_junctions, gr); |
204 |
|
|
205 |
|
bool better = gr < gl; |
206 |
|
bool worse = gl < gr; |
211 |
|
return false; |
212 |
|
} |
213 |
|
|
214 |
< |
const JunctionSet* _gtf_juncs; |
214 |
> |
const JunctionSet* _junctions; |
215 |
|
}; |
216 |
|
|
217 |
|
void pair_best_alignments(const HitsForRead& left_hits, |
218 |
|
const HitsForRead& right_hits, |
219 |
|
InsertAlignmentGrade& best_grade, |
220 |
|
vector<pair<BowtieHit, BowtieHit> >& best_hits, |
221 |
< |
const JunctionSet& gtf_junctions) |
221 |
> |
const JunctionSet& gtf_junctions, |
222 |
> |
const JunctionSet& junctions = empty_junctions, |
223 |
> |
const InsertionSet& insertions = empty_insertions, |
224 |
> |
const DeletionSet& deletions = empty_deletions, |
225 |
> |
const FusionSet& fusions = empty_fusions, |
226 |
> |
const Coverage& coverage = empty_coverage, |
227 |
> |
bool final_report = false) |
228 |
|
{ |
229 |
|
const vector<BowtieHit>& left = left_hits.hits; |
230 |
|
const vector<BowtieHit>& right = right_hits.hits; |
231 |
< |
|
231 |
> |
|
232 |
> |
if (left.size() >= max_multihits * 5 || right.size() >= max_multihits * 5) |
233 |
> |
return; |
234 |
> |
|
235 |
|
for (size_t i = 0; i < left.size(); ++i) |
236 |
|
{ |
237 |
< |
const BowtieHit& lh = left[i]; |
238 |
< |
if (lh.edit_dist() > max_read_mismatches) continue; |
237 |
> |
if (left[i].edit_dist() > max_read_mismatches) continue; |
238 |
> |
|
239 |
> |
BowtieHit lh = left[i]; |
240 |
> |
AlignStatus align_status(lh, gtf_junctions, |
241 |
> |
junctions, insertions, deletions, fusions, coverage); |
242 |
> |
lh.alignment_score(align_status._alignment_score); |
243 |
> |
|
244 |
|
for (size_t j = 0; j < right.size(); ++j) |
245 |
|
{ |
246 |
< |
const BowtieHit& rh = right[j]; |
247 |
< |
if (rh.edit_dist() > max_read_mismatches) continue; |
246 |
> |
if (right[j].edit_dist() > max_read_mismatches) continue; |
247 |
> |
|
248 |
> |
BowtieHit rh = right[j]; |
249 |
> |
AlignStatus align_status(rh, gtf_junctions, |
250 |
> |
junctions, insertions, deletions, fusions, coverage); |
251 |
> |
rh.alignment_score(align_status._alignment_score); |
252 |
> |
InsertAlignmentGrade g; |
253 |
> |
bool allowed; |
254 |
> |
allowed = set_insert_alignment_grade(lh, rh, final_report ? junctions : gtf_junctions, g); |
255 |
|
|
256 |
< |
InsertAlignmentGrade g; set_insert_alignment_grade(lh, rh, g); |
256 |
> |
// daehwan - for debugging purposes |
257 |
> |
#if 0 |
258 |
> |
if (lh.insert_id() == 325708 && !g.is_fusion() && false) |
259 |
> |
{ |
260 |
> |
fprintf(stderr, "lh %d:%d %s score: %d (from %d) NM: %d\n", |
261 |
> |
lh.ref_id(), lh.left(), print_cigar(lh.cigar()).c_str(), |
262 |
> |
lh.alignment_score(), left[i].alignment_score(), lh.edit_dist()); |
263 |
> |
fprintf(stderr, "rh %d:%d %s score: %d (from %d) NM: %d\n", |
264 |
> |
rh.ref_id(), rh.left(), print_cigar(rh.cigar()).c_str(), |
265 |
> |
rh.alignment_score(), right[j].alignment_score(), rh.edit_dist()); |
266 |
> |
fprintf(stderr, "combined score: %d is_fusion(%d)\n", g.align_score(), g.is_fusion()); |
267 |
> |
} |
268 |
> |
#endif |
269 |
> |
|
270 |
> |
if (!allowed) continue; |
271 |
> |
if (!fusion_search && !report_discordant_pair_alignments && g.is_fusion()) continue; |
272 |
|
|
273 |
< |
if (report_secondary_alignments) |
273 |
> |
if (report_secondary_alignments || !final_report) |
274 |
|
{ |
275 |
|
best_hits.push_back(make_pair(lh, rh)); |
276 |
|
} |
286 |
|
else if (!(g < best_grade)) |
287 |
|
{ |
288 |
|
best_hits.push_back(make_pair(lh, rh)); |
238 |
– |
best_grade.num_alignments++; |
289 |
|
} |
290 |
|
} |
291 |
|
} |
292 |
|
} |
293 |
< |
|
294 |
< |
if (report_secondary_alignments && best_hits.size() > 0) |
293 |
> |
|
294 |
> |
if ((report_secondary_alignments || !final_report) && best_hits.size() > 0) |
295 |
|
{ |
296 |
< |
cmp_pair_alignment cmp(gtf_junctions); |
296 |
> |
cmp_pair_alignment cmp(final_report ? junctions : gtf_junctions); |
297 |
|
sort(best_hits.begin(), best_hits.end(), cmp); |
298 |
< |
set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade); |
299 |
< |
best_grade.num_alignments = best_hits.size(); |
298 |
> |
set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, final_report ? junctions : gtf_junctions, best_grade); |
299 |
> |
} |
300 |
> |
|
301 |
> |
if (final_report) |
302 |
> |
{ |
303 |
> |
if (best_hits.size() > max_multihits) |
304 |
> |
best_hits.erase(best_hits.begin() + max_multihits, best_hits.end()); |
305 |
|
} |
306 |
+ |
|
307 |
+ |
best_grade.num_alignments = best_hits.size(); |
308 |
|
} |
309 |
|
|
310 |
|
enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT}; |
390 |
|
const BowtieHit& bh, |
391 |
|
const char* bwt_buf, |
392 |
|
const char* read_alt_name, |
336 |
– |
const FragmentAlignmentGrade& grade, |
393 |
|
FragmentType insert_side, |
394 |
|
int num_hits, |
395 |
|
const BowtieHit* next_hit, |
402 |
|
tokenize(bwt_buf, "\t", sam_toks); |
403 |
|
|
404 |
|
string ref_name = sam_toks[2], ref_name2 = ""; |
405 |
< |
char rebuf2[2048] = {0}; |
350 |
< |
char cigar1[256] = {0}, cigar2[256] = {0}; |
405 |
> |
char cigar1[1024] = {0}, cigar2[1024] = {0}; |
406 |
|
string left_seq, right_seq, left_qual, right_qual; |
407 |
< |
int left = -1, left1 = -1, left2 = -1; |
407 |
> |
int left1 = -1, left2 = -1; |
408 |
|
bool fusion_alignment = false; |
409 |
|
size_t XF_index = 0; |
410 |
|
for (size_t i = 11; i < sam_toks.size(); ++i) |
411 |
|
{ |
412 |
< |
const string& tok = sam_toks[i]; |
412 |
> |
string& tok = sam_toks[i]; |
413 |
|
if (strncmp(tok.c_str(), "XF", 2) == 0) |
414 |
|
{ |
415 |
|
fusion_alignment = true; |
431 |
|
|
432 |
|
break; |
433 |
|
} |
434 |
+ |
|
435 |
+ |
else if (strncmp(tok.c_str(), "AS", 2) == 0) |
436 |
+ |
{ |
437 |
+ |
char AS_score[128] = {0}; |
438 |
+ |
sprintf(AS_score, "AS:i:%d", min(0, bh.alignment_score())); |
439 |
+ |
tok = AS_score; |
440 |
+ |
} |
441 |
|
} |
442 |
|
|
443 |
|
string qname(read_alt_name); |
463 |
|
flag |= 0x100; |
464 |
|
|
465 |
|
int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0; |
466 |
< |
int mapQ=255; |
467 |
< |
if (grade.num_alignments > 1) { |
468 |
< |
double err_prob = 1 - (1.0 / grade.num_alignments); |
466 |
> |
int mapQ = 50; |
467 |
> |
if (num_hits > 1) { |
468 |
> |
double err_prob = 1 - (1.0 / num_hits); |
469 |
|
mapQ = (int)(-10.0 * log(err_prob) / log(10.0)); |
470 |
|
} |
471 |
|
int tlen =atoi(sam_toks[8].c_str()); //TLEN |
472 |
|
int mate_pos=atoi(sam_toks[7].c_str()); |
473 |
|
|
474 |
+ |
string rg_aux = ""; |
475 |
+ |
if (!sam_readgroup_id.empty()) |
476 |
+ |
rg_aux = string("RG:Z:") + sam_readgroup_id; |
477 |
+ |
|
478 |
|
GBamRecord* bamrec=NULL; |
479 |
|
if (fusion_alignment) { |
480 |
|
vector<string> auxdata; |
481 |
|
add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); |
482 |
+ |
if (rg_aux != "") |
483 |
+ |
auxdata.push_back(rg_aux); |
484 |
|
bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ, |
485 |
|
cigar1, sam_toks[6].c_str(), mate_pos, |
486 |
|
tlen, left_seq.c_str(), left_qual.c_str(), &auxdata); |
490 |
|
auxdata.clear(); |
491 |
|
sam_toks[XF_index][5] = '2'; |
492 |
|
add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); |
493 |
+ |
if (rg_aux != "") |
494 |
+ |
auxdata.push_back(rg_aux); |
495 |
|
bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ, |
496 |
|
cigar2, sam_toks[6].c_str(), mate_pos, |
497 |
|
tlen, right_seq.c_str(), right_qual.c_str(), &auxdata); |
500 |
|
} else { |
501 |
|
vector<string> auxdata; |
502 |
|
add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); |
503 |
+ |
if (rg_aux != "") |
504 |
+ |
auxdata.push_back(rg_aux); |
505 |
|
bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ, |
506 |
|
sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos, |
507 |
|
tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata); |
546 |
|
flag |= 0x100; |
547 |
|
|
548 |
|
string ref_name = sam_toks[2], ref_name2 = ""; |
549 |
< |
char rebuf2[2048] = {0}; |
478 |
< |
char cigar1[256] = {0}, cigar2[256] = {0}; |
549 |
> |
char cigar1[1024] = {0}, cigar2[1024] = {0}; |
550 |
|
string left_seq, right_seq, left_qual, right_qual; |
551 |
< |
int left = -1, left1 = -1, left2 = -1; |
551 |
> |
int left1 = -1, left2 = -1; |
552 |
|
bool fusion_alignment = false; |
553 |
|
size_t XF_tok_idx = 11; |
554 |
|
for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx) |
555 |
|
{ |
556 |
< |
const string& tok = sam_toks[XF_tok_idx]; |
556 |
> |
string& tok = sam_toks[XF_tok_idx]; |
557 |
|
if (strncmp(tok.c_str(), "XF", 2) == 0) |
558 |
|
{ |
559 |
|
fusion_alignment = true; |
574 |
|
|
575 |
|
break; |
576 |
|
} |
577 |
+ |
|
578 |
+ |
else if (strncmp(tok.c_str(), "AS", 2) == 0) |
579 |
+ |
{ |
580 |
+ |
char AS_score[128] = {0}; |
581 |
+ |
sprintf(AS_score, "AS:i:%d", min(0, bh.alignment_score())); |
582 |
+ |
tok = AS_score; |
583 |
+ |
} |
584 |
|
} |
585 |
|
|
586 |
|
int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0; |
587 |
< |
int mapQ=255; |
587 |
> |
int mapQ = 50; |
588 |
|
if (grade.num_alignments > 1) { |
589 |
|
double err_prob = 1 - (1.0 / grade.num_alignments); |
590 |
|
mapQ = (int)(-10.0 * log(err_prob) / log(10.0)); |
621 |
|
flag |= 0x0008; |
622 |
|
} |
623 |
|
|
624 |
+ |
string rg_aux = ""; |
625 |
+ |
if (!sam_readgroup_id.empty()) |
626 |
+ |
rg_aux = string("RG:Z:") + sam_readgroup_id; |
627 |
+ |
|
628 |
|
GBamRecord* bamrec=NULL; |
629 |
|
if (fusion_alignment) { |
630 |
|
vector<string> auxdata; |
631 |
|
add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); |
632 |
+ |
if (rg_aux != "") |
633 |
+ |
auxdata.push_back(rg_aux); |
634 |
|
bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ, |
635 |
|
cigar1, sam_toks[6].c_str(), mate_pos, |
636 |
|
tlen, left_seq.c_str(), left_qual.c_str(), &auxdata); |
640 |
|
auxdata.clear(); |
641 |
|
sam_toks[XF_tok_idx][5] = '2'; |
642 |
|
add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); |
643 |
+ |
if (rg_aux != "") |
644 |
+ |
auxdata.push_back(rg_aux); |
645 |
|
bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ, |
646 |
|
cigar2, sam_toks[6].c_str(), mate_pos, |
647 |
|
tlen, right_seq.c_str(), right_qual.c_str(), &auxdata); |
650 |
|
} else { |
651 |
|
vector<string> auxdata; |
652 |
|
add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); |
653 |
+ |
if (rg_aux != "") |
654 |
+ |
auxdata.push_back(rg_aux); |
655 |
|
bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ, |
656 |
|
sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos, |
657 |
|
tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata); |
687 |
|
|
688 |
|
void print_sam_for_single(const RefSequenceTable& rt, |
689 |
|
const HitsForRead& hits, |
602 |
– |
const FragmentAlignmentGrade& grade, |
690 |
|
FragmentType frag_type, |
691 |
< |
Read& read, |
691 |
> |
const string& alt_name, |
692 |
|
GBamWriter& bam_writer, |
693 |
< |
FILE* um_out //write unmapped reads here |
607 |
< |
) |
693 |
> |
boost::random::mt19937& rng) |
694 |
|
{ |
695 |
< |
assert(!read.alt_name.empty()); |
695 |
> |
//assert(!read.alt_name.empty()); |
696 |
|
if (hits.hits.empty()) |
697 |
|
return; |
698 |
|
|
707 |
|
|
708 |
|
size_t primaryHit = 0; |
709 |
|
if (!report_secondary_alignments) |
710 |
< |
primaryHit = random() % hits.hits.size(); |
710 |
> |
primaryHit = rng() % hits.hits.size(); |
711 |
|
|
712 |
|
bool multipleHits = (hits.hits.size() > 1); |
713 |
|
for (i = 0; i < hits.hits.size(); ++i) |
718 |
|
rewrite_sam_record(bam_writer, rt, |
719 |
|
bh, |
720 |
|
bh.hitfile_rec().c_str(), |
721 |
< |
read.alt_name.c_str(), |
636 |
< |
grade, |
721 |
> |
alt_name.c_str(), |
722 |
|
frag_type, |
723 |
|
hits.hits.size(), |
724 |
|
(i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL, |
730 |
|
void print_sam_for_pair(const RefSequenceTable& rt, |
731 |
|
const vector<pair<BowtieHit, BowtieHit> >& best_hits, |
732 |
|
const InsertAlignmentGrade& grade, |
648 |
– |
ReadStream& left_reads_file, |
649 |
– |
ReadStream& right_reads_file, |
733 |
|
GBamWriter& bam_writer, |
734 |
< |
FILE* left_um_out, |
735 |
< |
FILE* right_um_out, |
734 |
> |
const string& left_alt_name, |
735 |
> |
const string& right_alt_name, |
736 |
> |
boost::random::mt19937& rng, |
737 |
|
uint64_t begin_id = 0, |
738 |
|
uint64_t end_id = std::numeric_limits<uint64_t>::max()) |
739 |
|
{ |
756 |
|
sort(index_vector.begin(), index_vector.end(), s); |
757 |
|
|
758 |
|
if (!report_secondary_alignments) |
759 |
< |
primaryHit = random() % right_hits.hits.size(); |
676 |
< |
|
677 |
< |
bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), left_read, |
678 |
< |
reads_format, false, begin_id, end_id, |
679 |
< |
left_um_out, false); |
680 |
< |
|
681 |
< |
bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), right_read, |
682 |
< |
reads_format, false, begin_id, end_id, |
683 |
< |
right_um_out, false); |
759 |
> |
primaryHit = rng() % right_hits.hits.size(); |
760 |
|
|
685 |
– |
assert (got_left_read && got_right_read); |
761 |
|
bool multipleHits = (best_hits.size() > 1); |
762 |
|
for (i = 0; i < best_hits.size(); ++i) |
763 |
|
{ |
769 |
|
rewrite_sam_record(bam_writer, rt, |
770 |
|
right_bh, |
771 |
|
right_bh.hitfile_rec().c_str(), |
772 |
< |
right_read.alt_name.c_str(), |
772 |
> |
right_alt_name.c_str(), |
773 |
|
grade, |
774 |
|
FRAG_RIGHT, |
775 |
|
&left_bh, |
780 |
|
rewrite_sam_record(bam_writer, rt, |
781 |
|
left_bh, |
782 |
|
left_bh.hitfile_rec().c_str(), |
783 |
< |
left_read.alt_name.c_str(), |
783 |
> |
left_alt_name.c_str(), |
784 |
|
grade, |
785 |
|
FRAG_LEFT, |
786 |
|
&right_bh, |
801 |
|
InsertionSet& insertions, |
802 |
|
DeletionSet& deletions) |
803 |
|
{ |
804 |
< |
for (size_t i = 0; i < hits.hits.size(); ++i) |
804 |
> |
for (size_t i = 0; i < hits.hits.size(); ++i) |
805 |
> |
{ |
806 |
> |
const BowtieHit& bh = hits.hits[i]; |
807 |
> |
insertions_from_alignment(bh, insertions); |
808 |
> |
deletions_from_alignment(bh, deletions); |
809 |
> |
} |
810 |
> |
} |
811 |
> |
|
812 |
> |
void update_coverage(const HitsForRead& hits, |
813 |
> |
Coverage& coverage) |
814 |
> |
{ |
815 |
> |
for (size_t i = 0; i < hits.hits.size(); ++i) |
816 |
> |
{ |
817 |
> |
const BowtieHit& hit = hits.hits[i]; |
818 |
> |
const vector<CigarOp>& cigar = hit.cigar(); |
819 |
> |
unsigned int positionInGenome = hit.left(); |
820 |
> |
RefID ref_id = hit.ref_id(); |
821 |
> |
|
822 |
> |
for(size_t c = 0; c < cigar.size(); ++c) |
823 |
|
{ |
824 |
< |
const BowtieHit& bh = hits.hits[i]; |
825 |
< |
insertions_from_alignment(bh, insertions); |
826 |
< |
deletions_from_alignment(bh, deletions); |
824 |
> |
int opcode = cigar[c].opcode; |
825 |
> |
int length = cigar[c].length; |
826 |
> |
switch(opcode) |
827 |
> |
{ |
828 |
> |
case REF_SKIP: |
829 |
> |
case MATCH: |
830 |
> |
case DEL: |
831 |
> |
if (opcode == MATCH) |
832 |
> |
coverage.add_coverage(ref_id, positionInGenome, length); |
833 |
> |
|
834 |
> |
positionInGenome += length; |
835 |
> |
break; |
836 |
> |
case rEF_SKIP: |
837 |
> |
case mATCH: |
838 |
> |
case dEL: |
839 |
> |
positionInGenome -= length; |
840 |
> |
|
841 |
> |
if (opcode == mATCH) |
842 |
> |
coverage.add_coverage(ref_id, positionInGenome + 1, length); |
843 |
> |
break; |
844 |
> |
case FUSION_FF: |
845 |
> |
case FUSION_FR: |
846 |
> |
case FUSION_RF: |
847 |
> |
case FUSION_RR: |
848 |
> |
positionInGenome = length; |
849 |
> |
ref_id = hit.ref_id2(); |
850 |
> |
break; |
851 |
> |
default: |
852 |
> |
break; |
853 |
> |
} |
854 |
|
} |
855 |
+ |
} |
856 |
|
} |
857 |
|
|
858 |
|
|
738 |
– |
FusionSet empty_fusions; |
859 |
|
void update_fusions(const HitsForRead& hits, |
860 |
|
RefSequenceTable& rt, |
861 |
|
FusionSet& fusions, |
862 |
< |
FusionSet& fusions_ref = empty_fusions) |
862 |
> |
const FusionSet& fusions_ref = empty_fusions) |
863 |
|
{ |
864 |
+ |
if (!fusion_search) |
865 |
+ |
return; |
866 |
+ |
|
867 |
|
if (hits.hits.size() > fusion_multireads) |
868 |
|
return; |
869 |
|
|
875 |
|
if (bh.edit_dist() > fusion_read_mismatches) |
876 |
|
continue; |
877 |
|
|
755 |
– |
// daehwan |
756 |
– |
#if 0 |
757 |
– |
vector<Fusion> new_fusions; |
758 |
– |
fusions_from_spliced_hit(bh, new_fusions); |
759 |
– |
|
760 |
– |
for(size_t i = 0; i < new_fusions.size(); ++i) |
761 |
– |
{ |
762 |
– |
Fusion fusion = new_fusions[i]; |
763 |
– |
if (fusion.left == 110343414 && fusion.right == 80211173 && hits.hits.size() > 1) |
764 |
– |
{ |
765 |
– |
for (size_t t = 0; t < hits.hits.size(); ++t) |
766 |
– |
{ |
767 |
– |
const BowtieHit& lh = hits.hits[t]; |
768 |
– |
cout << "insert id: " << lh.insert_id() << endl; |
769 |
– |
cout << (lh.antisense_align() ? "-" : "+") << endl; |
770 |
– |
cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl; |
771 |
– |
cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl; |
772 |
– |
} |
773 |
– |
} |
774 |
– |
} |
775 |
– |
#endif |
776 |
– |
|
878 |
|
fusions_from_alignment(bh, fusions, rt, update_stat); |
879 |
|
|
880 |
|
if (update_stat) |
885 |
|
void update_junctions(const HitsForRead& hits, |
886 |
|
JunctionSet& junctions) |
887 |
|
{ |
888 |
< |
for (size_t i = 0; i < hits.hits.size(); ++i) |
889 |
< |
{ |
890 |
< |
const BowtieHit& bh = hits.hits[i]; |
891 |
< |
junctions_from_alignment(bh, junctions); |
892 |
< |
} |
888 |
> |
for (size_t i = 0; i < hits.hits.size(); ++i) |
889 |
> |
{ |
890 |
> |
const BowtieHit& bh = hits.hits[i]; |
891 |
> |
junctions_from_alignment(bh, junctions); |
892 |
> |
} |
893 |
|
} |
894 |
|
|
895 |
|
// Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file |
929 |
|
hits = remaining; |
930 |
|
} |
931 |
|
|
932 |
+ |
void realign_reads(HitsForRead& hits, |
933 |
+ |
const RefSequenceTable& rt, |
934 |
+ |
const JunctionSet& junctions, |
935 |
+ |
const JunctionSet& rev_junctions, |
936 |
+ |
const InsertionSet& insertions, |
937 |
+ |
const DeletionSet& deletions, |
938 |
+ |
const DeletionSet& rev_deletions, |
939 |
+ |
const FusionSet& fusions) |
940 |
+ |
{ |
941 |
+ |
if (color) |
942 |
+ |
return; |
943 |
+ |
|
944 |
+ |
vector<BowtieHit> additional_hits; |
945 |
+ |
for (size_t i = 0; i < hits.hits.size(); ++i) |
946 |
+ |
{ |
947 |
+ |
BowtieHit& bh = hits.hits[i]; |
948 |
+ |
if (fusion_search && bh.fusion_opcode() != FUSION_NOTHING) |
949 |
+ |
return; |
950 |
+ |
|
951 |
+ |
const vector<CigarOp>& cigars = bh.cigar(); |
952 |
+ |
int pos = bh.left(); |
953 |
+ |
int refid = bh.ref_id(); |
954 |
+ |
for (size_t j = 0; j < cigars.size(); ++j) |
955 |
+ |
{ |
956 |
+ |
const CigarOp& op = cigars[j]; |
957 |
+ |
if (j == 0 || j == cigars.size() - 1) |
958 |
+ |
{ |
959 |
+ |
// let's do this for MATCH case only, |
960 |
+ |
if (op.opcode == MATCH || op.opcode == mATCH) |
961 |
+ |
{ |
962 |
+ |
int left1, left2; |
963 |
+ |
if (op.opcode == MATCH) |
964 |
+ |
{ |
965 |
+ |
left1 = pos; |
966 |
+ |
left2 = pos + op.length - 1; |
967 |
+ |
} |
968 |
+ |
else |
969 |
+ |
{ |
970 |
+ |
left1 = pos - op.length + 1; |
971 |
+ |
left2 = pos; |
972 |
+ |
} |
973 |
+ |
|
974 |
+ |
{ |
975 |
+ |
JunctionSet::const_iterator lb, ub; |
976 |
+ |
JunctionSet temp_junctions; |
977 |
+ |
if (j == 0) |
978 |
+ |
{ |
979 |
+ |
lb = rev_junctions.upper_bound(Junction(refid, left1, 0, true)); |
980 |
+ |
ub = rev_junctions.lower_bound(Junction(refid, left2, left2, false)); |
981 |
+ |
while (lb != ub && lb != rev_junctions.end()) |
982 |
+ |
{ |
983 |
+ |
Junction temp_junction = lb->first; |
984 |
+ |
temp_junction.left = lb->first.right; |
985 |
+ |
temp_junction.right = lb->first.left; |
986 |
+ |
temp_junctions[temp_junction] = lb->second; |
987 |
+ |
++lb; |
988 |
+ |
} |
989 |
+ |
} |
990 |
+ |
|
991 |
+ |
if (j == cigars.size() - 1) |
992 |
+ |
{ |
993 |
+ |
int common_right = left2 + max_report_intron_length; |
994 |
+ |
lb = junctions.upper_bound(Junction(refid, left1, common_right, true)); |
995 |
+ |
ub = junctions.lower_bound(Junction(refid, left2, common_right, false)); |
996 |
+ |
while (lb != ub && lb != junctions.end()) |
997 |
+ |
{ |
998 |
+ |
temp_junctions[lb->first] = lb->second; |
999 |
+ |
++lb; |
1000 |
+ |
} |
1001 |
+ |
} |
1002 |
+ |
|
1003 |
+ |
if (temp_junctions.size() > 10) |
1004 |
+ |
continue; |
1005 |
+ |
|
1006 |
+ |
JunctionSet::const_iterator junc_iter = temp_junctions.begin(); |
1007 |
+ |
for (; junc_iter != temp_junctions.end(); ++junc_iter) |
1008 |
+ |
{ |
1009 |
+ |
Junction junc = junc_iter->first; |
1010 |
+ |
|
1011 |
+ |
/* |
1012 |
+ |
fprintf(stderr, "%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", |
1013 |
+ |
bh.insert_id(), bh.left(), bh.right(), |
1014 |
+ |
print_cigar(bh.cigar()).c_str(), |
1015 |
+ |
bh.alignment_score(), bh.edit_dist(), |
1016 |
+ |
junc.left, junc.right); |
1017 |
+ |
//*/ |
1018 |
+ |
|
1019 |
+ |
int new_left = bh.left(); |
1020 |
+ |
int intron_length = junc.right - junc.left - 1; |
1021 |
+ |
vector<CigarOp> new_cigars; |
1022 |
+ |
bool anchored = false; |
1023 |
+ |
if (j == 0 && bh.left() > (int)junc.left) |
1024 |
+ |
{ |
1025 |
+ |
new_left -= intron_length; |
1026 |
+ |
int before_match_length = junc.left - new_left + 1;; |
1027 |
+ |
int after_match_length = op.length - before_match_length; |
1028 |
+ |
|
1029 |
+ |
if (before_match_length > 0 && after_match_length) |
1030 |
+ |
{ |
1031 |
+ |
anchored = true; |
1032 |
+ |
new_cigars.push_back(CigarOp(MATCH, before_match_length)); |
1033 |
+ |
new_cigars.push_back(CigarOp(REF_SKIP, intron_length)); |
1034 |
+ |
new_cigars.push_back(CigarOp(MATCH, after_match_length)); |
1035 |
+ |
|
1036 |
+ |
new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end()); |
1037 |
+ |
} |
1038 |
+ |
} |
1039 |
+ |
else |
1040 |
+ |
{ |
1041 |
+ |
new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1); |
1042 |
+ |
|
1043 |
+ |
int before_match_length = junc.left - pos + 1; |
1044 |
+ |
int after_match_length = op.length - before_match_length; |
1045 |
+ |
|
1046 |
+ |
if (before_match_length > 0 && after_match_length > 0) |
1047 |
+ |
{ |
1048 |
+ |
anchored = true; |
1049 |
+ |
new_cigars.push_back(CigarOp(MATCH, before_match_length)); |
1050 |
+ |
new_cigars.push_back(CigarOp(REF_SKIP, intron_length)); |
1051 |
+ |
new_cigars.push_back(CigarOp(MATCH, after_match_length)); |
1052 |
+ |
} |
1053 |
+ |
} |
1054 |
+ |
|
1055 |
+ |
BowtieHit new_bh(bh.ref_id(), |
1056 |
+ |
bh.ref_id2(), |
1057 |
+ |
bh.insert_id(), |
1058 |
+ |
new_left, |
1059 |
+ |
new_cigars, |
1060 |
+ |
bh.antisense_align(), |
1061 |
+ |
junc.antisense, |
1062 |
+ |
0, /* edit_dist - needs to be recalculated */ |
1063 |
+ |
0, /* splice_mms - needs to be recalculated */ |
1064 |
+ |
false); |
1065 |
+ |
|
1066 |
+ |
new_bh.seq(bh.seq()); |
1067 |
+ |
new_bh.qual(bh.qual()); |
1068 |
+ |
|
1069 |
+ |
const RefSequenceTable::Sequence* ref_str = rt.get_seq(bh.ref_id()); |
1070 |
+ |
|
1071 |
+ |
if (new_left >= 0 && |
1072 |
+ |
new_bh.right() <= (int)length(*ref_str) && |
1073 |
+ |
anchored) |
1074 |
+ |
{ |
1075 |
+ |
vector<string> aux_fields; |
1076 |
+ |
bowtie_sam_extra(new_bh, rt, aux_fields); |
1077 |
+ |
|
1078 |
+ |
vector<string>::const_iterator aux_iter = aux_fields.begin(); |
1079 |
+ |
for (; aux_iter != aux_fields.end(); ++aux_iter) |
1080 |
+ |
{ |
1081 |
+ |
const string& aux_field = *aux_iter; |
1082 |
+ |
if (strncmp(aux_field.c_str(), "AS", 2) == 0) |
1083 |
+ |
{ |
1084 |
+ |
int alignment_score = atoi(aux_field.c_str() + 5); |
1085 |
+ |
new_bh.alignment_score(alignment_score); |
1086 |
+ |
} |
1087 |
+ |
else if (strncmp(aux_field.c_str(), "XM", 2) == 0) |
1088 |
+ |
{ |
1089 |
+ |
int XM_value = atoi(aux_field.c_str() + 5); |
1090 |
+ |
new_bh.edit_dist(XM_value); |
1091 |
+ |
} |
1092 |
+ |
} |
1093 |
+ |
|
1094 |
+ |
vector<string> sam_toks; |
1095 |
+ |
tokenize(bh.hitfile_rec().c_str(), "\t", sam_toks); |
1096 |
+ |
|
1097 |
+ |
char coord[20] = {0,}; |
1098 |
+ |
sprintf(coord, "%d", new_bh.left() + 1); |
1099 |
+ |
sam_toks[3] = coord; |
1100 |
+ |
sam_toks[5] = print_cigar(new_bh.cigar()); |
1101 |
+ |
for (size_t a = 11; a < sam_toks.size(); ++a) |
1102 |
+ |
{ |
1103 |
+ |
string& sam_tok = sam_toks[a]; |
1104 |
+ |
for (size_t b = 0; b < aux_fields.size(); ++b) |
1105 |
+ |
{ |
1106 |
+ |
const string& aux_tok = aux_fields[b]; |
1107 |
+ |
if (strncmp(sam_tok.c_str(), aux_tok.c_str(), 5) == 0) |
1108 |
+ |
{ |
1109 |
+ |
sam_tok = aux_tok; |
1110 |
+ |
break; |
1111 |
+ |
} |
1112 |
+ |
} |
1113 |
+ |
} |
1114 |
+ |
|
1115 |
+ |
if (!bh.is_spliced()) |
1116 |
+ |
{ |
1117 |
+ |
if (junc.antisense) |
1118 |
+ |
sam_toks.push_back("XS:A:-"); |
1119 |
+ |
else |
1120 |
+ |
sam_toks.push_back("XS:A:+"); |
1121 |
+ |
} |
1122 |
+ |
|
1123 |
+ |
|
1124 |
+ |
string new_rec = ""; |
1125 |
+ |
for (size_t d = 0; d < sam_toks.size(); ++d) |
1126 |
+ |
{ |
1127 |
+ |
new_rec += sam_toks[d]; |
1128 |
+ |
if (d < sam_toks.size() - 1) |
1129 |
+ |
new_rec += "\t"; |
1130 |
+ |
} |
1131 |
+ |
|
1132 |
+ |
new_bh.hitfile_rec(new_rec); |
1133 |
+ |
|
1134 |
+ |
if (new_bh.edit_dist() <= bh.edit_dist()) |
1135 |
+ |
additional_hits.push_back(new_bh); |
1136 |
+ |
|
1137 |
+ |
/* |
1138 |
+ |
fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", |
1139 |
+ |
new_bh.insert_id(), new_bh.left(), new_bh.right(), |
1140 |
+ |
print_cigar(new_bh.cigar()).c_str(), |
1141 |
+ |
new_bh.alignment_score(), new_bh.edit_dist(), |
1142 |
+ |
junc.left, junc.right); |
1143 |
+ |
//*/ |
1144 |
+ |
} |
1145 |
+ |
} |
1146 |
+ |
} |
1147 |
+ |
|
1148 |
+ |
#if 0 |
1149 |
+ |
{ |
1150 |
+ |
DeletionSet::const_iterator lb, ub; |
1151 |
+ |
bool use_rev_deletions = (j == 0); |
1152 |
+ |
const DeletionSet& curr_deletions = (use_rev_deletions ? rev_deletions : deletions); |
1153 |
+ |
if (use_rev_deletions) |
1154 |
+ |
{ |
1155 |
+ |
lb = curr_deletions.upper_bound(Deletion(refid, left1, 0, true)); |
1156 |
+ |
ub = curr_deletions.lower_bound(Deletion(refid, left2, left2, false)); |
1157 |
+ |
} |
1158 |
+ |
else |
1159 |
+ |
{ |
1160 |
+ |
int common_right = left2 + 100; |
1161 |
+ |
lb = curr_deletions.upper_bound(Deletion(refid, left1, common_right, true)); |
1162 |
+ |
ub = curr_deletions.lower_bound(Deletion(refid, left2, common_right, false)); |
1163 |
+ |
} |
1164 |
+ |
|
1165 |
+ |
while (lb != curr_deletions.end() && lb != ub) |
1166 |
+ |
{ |
1167 |
+ |
Deletion del = lb->first; |
1168 |
+ |
if (use_rev_deletions) |
1169 |
+ |
{ |
1170 |
+ |
int temp = del.left; |
1171 |
+ |
del.left = del.right; |
1172 |
+ |
del.right = temp; |
1173 |
+ |
} |
1174 |
+ |
|
1175 |
+ |
// daehwan - for debuggin purposes |
1176 |
+ |
/* |
1177 |
+ |
fprintf(stderr, "(type%d) %d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", |
1178 |
+ |
!use_rev_junctions, |
1179 |
+ |
bh.insert_id(), bh.left(), bh.right(), |
1180 |
+ |
print_cigar(bh.cigar()).c_str(), |
1181 |
+ |
bh.alignment_score(), bh.edit_dist(), |
1182 |
+ |
junc.left, junc.right); |
1183 |
+ |
*/ |
1184 |
+ |
|
1185 |
+ |
int del_length = del.right - del.left - 1; |
1186 |
+ |
int new_left = bh.left(); |
1187 |
+ |
if (j == 0) |
1188 |
+ |
new_left -= del_length; |
1189 |
+ |
|
1190 |
+ |
vector<CigarOp> new_cigars; |
1191 |
+ |
if (j == 0) |
1192 |
+ |
{ |
1193 |
+ |
int before_match_length = del.left - new_left + 1;; |
1194 |
+ |
int after_match_length = op.length - before_match_length; |
1195 |
+ |
|
1196 |
+ |
if (before_match_length > 0) |
1197 |
+ |
new_cigars.push_back(CigarOp(MATCH, before_match_length)); |
1198 |
+ |
new_cigars.push_back(CigarOp(DEL, del_length)); |
1199 |
+ |
if (after_match_length > 0) |
1200 |
+ |
new_cigars.push_back(CigarOp(MATCH, after_match_length)); |
1201 |
+ |
|
1202 |
+ |
new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end()); |
1203 |
+ |
} |
1204 |
+ |
else |
1205 |
+ |
{ |
1206 |
+ |
new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1); |
1207 |
+ |
|
1208 |
+ |
int before_match_length = del.left - pos + 1; |
1209 |
+ |
int after_match_length = op.length - before_match_length; |
1210 |
+ |
|
1211 |
+ |
if (before_match_length > 0) |
1212 |
+ |
new_cigars.push_back(CigarOp(MATCH, before_match_length)); |
1213 |
+ |
new_cigars.push_back(CigarOp(DEL, del_length)); |
1214 |
+ |
if (after_match_length > 0) |
1215 |
+ |
new_cigars.push_back(CigarOp(MATCH, after_match_length)); |
1216 |
+ |
} |
1217 |
+ |
|
1218 |
+ |
BowtieHit new_bh(bh.ref_id(), |
1219 |
+ |
bh.ref_id2(), |
1220 |
+ |
bh.insert_id(), |
1221 |
+ |
new_left, |
1222 |
+ |
new_cigars, |
1223 |
+ |
bh.antisense_align(), |
1224 |
+ |
bh.antisense_splice(), |
1225 |
+ |
0, /* edit_dist - needs to be recalculated */ |
1226 |
+ |
0, /* splice_mms - needs to be recalculated */ |
1227 |
+ |
false); |
1228 |
+ |
|
1229 |
+ |
new_bh.seq(bh.seq()); |
1230 |
+ |
new_bh.qual(bh.qual()); |
1231 |
+ |
|
1232 |
+ |
vector<string> aux_fields; |
1233 |
+ |
bowtie_sam_extra(new_bh, rt, aux_fields); |
1234 |
+ |
|
1235 |
+ |
vector<string>::const_iterator aux_iter = aux_fields.begin(); |
1236 |
+ |
for (; aux_iter != aux_fields.end(); ++aux_iter) |
1237 |
+ |
{ |
1238 |
+ |
const string& aux_field = *aux_iter; |
1239 |
+ |
if (strncmp(aux_field.c_str(), "AS", 2) == 0) |
1240 |
+ |
{ |
1241 |
+ |
int alignment_score = atoi(aux_field.c_str() + 5); |
1242 |
+ |
new_bh.alignment_score(alignment_score); |
1243 |
+ |
} |
1244 |
+ |
else if (strncmp(aux_field.c_str(), "XM", 2) == 0) |
1245 |
+ |
{ |
1246 |
+ |
int XM_value = atoi(aux_field.c_str() + 5); |
1247 |
+ |
new_bh.edit_dist(XM_value); |
1248 |
+ |
} |
1249 |
+ |
} |
1250 |
+ |
|
1251 |
+ |
vector<string> sam_toks; |
1252 |
+ |
tokenize(bh.hitfile_rec().c_str(), "\t", sam_toks); |
1253 |
+ |
|
1254 |
+ |
char coord[20] = {0,}; |
1255 |
+ |
sprintf(coord, "%d", new_bh.left() + 1); |
1256 |
+ |
sam_toks[3] = coord; |
1257 |
+ |
sam_toks[5] = print_cigar(new_bh.cigar()); |
1258 |
+ |
for (size_t a = 11; a < sam_toks.size(); ++a) |
1259 |
+ |
{ |
1260 |
+ |
string& sam_tok = sam_toks[a]; |
1261 |
+ |
for (size_t b = 0; b < aux_fields.size(); ++b) |
1262 |
+ |
{ |
1263 |
+ |
const string& aux_tok = aux_fields[b]; |
1264 |
+ |
if (strncmp(sam_tok.c_str(), aux_tok.c_str(), 5) == 0) |
1265 |
+ |
{ |
1266 |
+ |
sam_tok = aux_tok; |
1267 |
+ |
break; |
1268 |
+ |
} |
1269 |
+ |
} |
1270 |
+ |
} |
1271 |
+ |
|
1272 |
+ |
string new_rec = ""; |
1273 |
+ |
for (size_t d = 0; d < sam_toks.size(); ++d) |
1274 |
+ |
{ |
1275 |
+ |
new_rec += sam_toks[d]; |
1276 |
+ |
if (d < sam_toks.size() - 1) |
1277 |
+ |
new_rec += "\t"; |
1278 |
+ |
} |
1279 |
+ |
|
1280 |
+ |
new_bh.hitfile_rec(new_rec); |
1281 |
+ |
|
1282 |
+ |
if (new_bh.edit_dist() <= bh.edit_dist()) |
1283 |
+ |
additional_hits.push_back(new_bh); |
1284 |
+ |
|
1285 |
+ |
/* |
1286 |
+ |
fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", |
1287 |
+ |
new_bh.insert_id(), new_bh.left(), new_bh.right(), |
1288 |
+ |
print_cigar(new_bh.cigar()).c_str(), |
1289 |
+ |
new_bh.alignment_score(), new_bh.edit_dist(), |
1290 |
+ |
junc.left, junc.right); |
1291 |
+ |
*/ |
1292 |
+ |
|
1293 |
+ |
++lb; |
1294 |
+ |
} |
1295 |
+ |
} |
1296 |
+ |
|
1297 |
+ |
{ |
1298 |
+ |
InsertionSet::const_iterator lb, ub; |
1299 |
+ |
lb = insertions.upper_bound(Insertion(refid, left1, "")); |
1300 |
+ |
ub = insertions.lower_bound(Insertion(refid, left2, "")); |
1301 |
+ |
|
1302 |
+ |
while (lb != insertions.end() && lb != ub) |
1303 |
+ |
{ |
1304 |
+ |
// daehwan - for debugging purposse |
1305 |
+ |
break; |
1306 |
+ |
|
1307 |
+ |
Insertion ins = lb->first; |
1308 |
+ |
|
1309 |
+ |
// daehwan - for debugging purposes |
1310 |
+ |
/* |
1311 |
+ |
fprintf(stderr, "(type%d) %d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", |
1312 |
+ |
!use_rev_junctions, |
1313 |
+ |
bh.insert_id(), bh.left(), bh.right(), |
1314 |
+ |
print_cigar(bh.cigar()).c_str(), |
1315 |
+ |
bh.alignment_score(), bh.edit_dist(), |
1316 |
+ |
junc.left, junc.right); |
1317 |
+ |
*/ |
1318 |
+ |
|
1319 |
+ |
int ins_length = ins.sequence.length(); |
1320 |
+ |
int new_left = bh.left(); |
1321 |
+ |
if (j == 0) |
1322 |
+ |
new_left -= ins_length; |
1323 |
+ |
|
1324 |
+ |
vector<CigarOp> new_cigars; |
1325 |
+ |
if (j == 0) |
1326 |
+ |
{ |
1327 |
+ |
int before_match_length = ins.left - new_left + 1;; |
1328 |
+ |
int after_match_length = op.length - before_match_length - ins_length; |
1329 |
+ |
|
1330 |
+ |
if (before_match_length > 0) |
1331 |
+ |
new_cigars.push_back(CigarOp(MATCH, before_match_length)); |
1332 |
+ |
new_cigars.push_back(CigarOp(INS, ins_length)); |
1333 |
+ |
if (after_match_length > 0) |
1334 |
+ |
new_cigars.push_back(CigarOp(MATCH, after_match_length)); |
1335 |
+ |
|
1336 |
+ |
new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end()); |
1337 |
+ |
} |
1338 |
+ |
else |
1339 |
+ |
{ |
1340 |
+ |
new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1); |
1341 |
+ |
|
1342 |
+ |
int before_match_length = ins.left - pos + 1; |
1343 |
+ |
int after_match_length = op.length - before_match_length - ins_length; |
1344 |
+ |
|
1345 |
+ |
if (before_match_length > 0) |
1346 |
+ |
new_cigars.push_back(CigarOp(MATCH, before_match_length)); |
1347 |
+ |
new_cigars.push_back(CigarOp(INS, ins_length)); |
1348 |
+ |
if (after_match_length > 0) |
1349 |
+ |
new_cigars.push_back(CigarOp(MATCH, after_match_length)); |
1350 |
+ |
} |
1351 |
+ |
|
1352 |
+ |
BowtieHit new_bh(bh.ref_id(), |
1353 |
+ |
bh.ref_id2(), |
1354 |
+ |
bh.insert_id(), |
1355 |
+ |
new_left, |
1356 |
+ |
new_cigars, |
1357 |
+ |
bh.antisense_align(), |
1358 |
+ |
bh.antisense_splice(), |
1359 |
+ |
0, /* edit_dist - needs to be recalculated */ |
1360 |
+ |
0, /* splice_mms - needs to be recalculated */ |
1361 |
+ |
false); |
1362 |
+ |
|
1363 |
+ |
new_bh.seq(bh.seq()); |
1364 |
+ |
new_bh.qual(bh.qual()); |
1365 |
+ |
|
1366 |
+ |
vector<string> aux_fields; |
1367 |
+ |
bowtie_sam_extra(new_bh, rt, aux_fields); |
1368 |
+ |
|
1369 |
+ |
vector<string>::const_iterator aux_iter = aux_fields.begin(); |
1370 |
+ |
for (; aux_iter != aux_fields.end(); ++aux_iter) |
1371 |
+ |
{ |
1372 |
+ |
const string& aux_field = *aux_iter; |
1373 |
+ |
if (strncmp(aux_field.c_str(), "AS", 2) == 0) |
1374 |
+ |
{ |
1375 |
+ |
int alignment_score = atoi(aux_field.c_str() + 5); |
1376 |
+ |
new_bh.alignment_score(alignment_score); |
1377 |
+ |
} |
1378 |
+ |
else if (strncmp(aux_field.c_str(), "XM", 2) == 0) |
1379 |
+ |
{ |
1380 |
+ |
int XM_value = atoi(aux_field.c_str() + 5); |
1381 |
+ |
new_bh.edit_dist(XM_value); |
1382 |
+ |
} |
1383 |
+ |
} |
1384 |
+ |
|
1385 |
+ |
/* |
1386 |
+ |
fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", |
1387 |
+ |
new_bh.insert_id(), new_bh.left(), new_bh.right(), |
1388 |
+ |
print_cigar(new_bh.cigar()).c_str(), |
1389 |
+ |
new_bh.alignment_score(), new_bh.edit_dist(), |
1390 |
+ |
junc.left, junc.right); |
1391 |
+ |
*/ |
1392 |
+ |
|
1393 |
+ |
++lb; |
1394 |
+ |
} |
1395 |
+ |
} |
1396 |
+ |
#endif |
1397 |
+ |
} |
1398 |
+ |
} |
1399 |
+ |
|
1400 |
+ |
switch(op.opcode) |
1401 |
+ |
{ |
1402 |
+ |
case REF_SKIP: |
1403 |
+ |
pos += op.length; |
1404 |
+ |
break; |
1405 |
+ |
case rEF_SKIP: |
1406 |
+ |
pos -= op.length; |
1407 |
+ |
break; |
1408 |
+ |
case MATCH: |
1409 |
+ |
case DEL: |
1410 |
+ |
pos += op.length; |
1411 |
+ |
break; |
1412 |
+ |
case mATCH: |
1413 |
+ |
case dEL: |
1414 |
+ |
pos -= op.length; |
1415 |
+ |
break; |
1416 |
+ |
case FUSION_FF: |
1417 |
+ |
case FUSION_FR: |
1418 |
+ |
case FUSION_RF: |
1419 |
+ |
case FUSION_RR: |
1420 |
+ |
pos = op.length; |
1421 |
+ |
refid = bh.ref_id2(); |
1422 |
+ |
break; |
1423 |
+ |
default: |
1424 |
+ |
break; |
1425 |
+ |
} |
1426 |
+ |
} |
1427 |
+ |
} |
1428 |
+ |
|
1429 |
+ |
hits.hits.insert(hits.hits.end(), additional_hits.begin(), additional_hits.end()); |
1430 |
+ |
|
1431 |
+ |
std::sort(hits.hits.begin(), hits.hits.end()); |
1432 |
+ |
vector<BowtieHit>::iterator new_end = std::unique(hits.hits.begin(), hits.hits.end()); |
1433 |
+ |
hits.hits.erase(new_end, hits.hits.end()); |
1434 |
+ |
} |
1435 |
+ |
|
1436 |
+ |
|
1437 |
|
// events include splice junction, indels, and fusion points. |
1438 |
|
struct ConsensusEventsWorker |
1439 |
|
{ |
1471 |
|
{ |
1472 |
|
HitsForRead best_hits; |
1473 |
|
best_hits.insert_id = curr_left_obs_order; |
868 |
– |
FragmentAlignmentGrade best_grade; |
1474 |
|
|
1475 |
|
// Process hits for left singleton, select best alignments |
1476 |
< |
read_best_alignments(curr_left_hit_group, best_grade, best_hits, *gtf_junctions); |
1476 |
> |
read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions); |
1477 |
> |
update_coverage(best_hits, *coverage); |
1478 |
|
update_junctions(best_hits, *junctions); |
1479 |
+ |
update_insertions_and_deletions(best_hits, *insertions, *deletions); |
1480 |
|
update_fusions(best_hits, *rt, *fusions); |
1481 |
< |
|
1481 |
> |
|
1482 |
|
// Get next hit group |
1483 |
|
l_hs.next_read_hits(curr_left_hit_group); |
1484 |
|
curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); |
1491 |
|
{ |
1492 |
|
HitsForRead best_hits; |
1493 |
|
best_hits.insert_id = curr_right_obs_order; |
887 |
– |
FragmentAlignmentGrade best_grade; |
1494 |
|
if (curr_right_obs_order >= begin_id) |
1495 |
|
{ |
1496 |
|
// Process hit for right singleton, select best alignments |
1497 |
< |
read_best_alignments(curr_right_hit_group, best_grade, best_hits, *gtf_junctions); |
1497 |
> |
read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions); |
1498 |
> |
update_coverage(best_hits, *coverage); |
1499 |
|
update_junctions(best_hits, *junctions); |
1500 |
+ |
update_insertions_and_deletions(best_hits, *insertions, *deletions); |
1501 |
|
update_fusions(best_hits, *rt, *fusions); |
1502 |
|
} |
1503 |
|
|
1521 |
|
best_hits, |
1522 |
|
*gtf_junctions); |
1523 |
|
|
1524 |
< |
HitsForRead single_best_hits; |
1525 |
< |
single_best_hits.insert_id = curr_left_obs_order; |
1524 |
> |
HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order; |
1525 |
> |
HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order; |
1526 |
|
|
1527 |
< |
// this is too much memory copy - we will make it efficient soon |
1528 |
< |
for (size_t i = 0; i < best_hits.size(); ++i) |
1527 |
> |
if (best_hits.size() > 0) |
1528 |
> |
{ |
1529 |
> |
for (size_t i = 0; i < best_hits.size(); ++i) |
1530 |
> |
{ |
1531 |
> |
best_left_hit_group.hits.push_back(best_hits[i].first); |
1532 |
> |
best_right_hit_group.hits.push_back(best_hits[i].second); |
1533 |
> |
} |
1534 |
> |
} |
1535 |
> |
else |
1536 |
|
{ |
1537 |
< |
single_best_hits.hits.push_back(best_hits[i].first); |
1538 |
< |
single_best_hits.hits.push_back(best_hits[i].second); |
1537 |
> |
best_left_hit_group.hits = curr_left_hit_group.hits; |
1538 |
> |
best_right_hit_group.hits = curr_right_hit_group.hits; |
1539 |
|
} |
1540 |
|
|
1541 |
< |
update_junctions(single_best_hits, *junctions); |
1542 |
< |
update_fusions(single_best_hits, *rt, *fusions); |
1541 |
> |
update_coverage(best_left_hit_group, *coverage); |
1542 |
> |
update_junctions(best_left_hit_group, *junctions); |
1543 |
> |
update_insertions_and_deletions(best_left_hit_group, *insertions, *deletions); |
1544 |
> |
update_fusions(best_left_hit_group, *rt, *fusions); |
1545 |
> |
|
1546 |
> |
update_coverage(best_right_hit_group, *coverage); |
1547 |
> |
update_junctions(best_right_hit_group, *junctions); |
1548 |
> |
update_insertions_and_deletions(best_right_hit_group, *insertions, *deletions); |
1549 |
> |
update_fusions(best_right_hit_group, *rt, *fusions); |
1550 |
|
|
1551 |
|
l_hs.next_read_hits(curr_left_hit_group); |
1552 |
|
curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); |
1565 |
|
string left_map_fname; |
1566 |
|
string right_map_fname; |
1567 |
|
|
946 |
– |
JunctionSet* junctions; |
947 |
– |
JunctionSet* gtf_junctions; |
948 |
– |
FusionSet* fusions; |
1568 |
|
RefSequenceTable* rt; |
1569 |
|
|
1570 |
+ |
JunctionSet* gtf_junctions; |
1571 |
+ |
|
1572 |
|
uint64_t begin_id; |
1573 |
|
uint64_t end_id; |
1574 |
|
int64_t left_map_offset; |
1575 |
|
int64_t right_map_offset; |
1576 |
+ |
|
1577 |
+ |
JunctionSet* junctions; |
1578 |
+ |
InsertionSet* insertions; |
1579 |
+ |
DeletionSet* deletions; |
1580 |
+ |
FusionSet* fusions; |
1581 |
+ |
|
1582 |
+ |
Coverage* coverage; |
1583 |
|
}; |
1584 |
|
|
1585 |
|
struct ReportWorker |
1586 |
|
{ |
1587 |
+ |
void write_singleton_alignments(uint64_t curr_obs_order, |
1588 |
+ |
HitsForRead& curr_hit_group, |
1589 |
+ |
ReadStream& reads_file, |
1590 |
+ |
GBamWriter& bam_writer, |
1591 |
+ |
GBamWriter& um_out, |
1592 |
+ |
FragmentType fragment_type) |
1593 |
+ |
{ |
1594 |
+ |
HitsForRead best_hits; |
1595 |
+ |
best_hits.insert_id = curr_obs_order; |
1596 |
+ |
|
1597 |
+ |
realign_reads(curr_hit_group, *rt, *junctions, *rev_junctions, |
1598 |
+ |
*insertions, *deletions, *rev_deletions, *fusions); |
1599 |
+ |
|
1600 |
+ |
exclude_hits_on_filtered_junctions(*junctions, curr_hit_group); |
1601 |
+ |
|
1602 |
+ |
// Process hits for singleton, select best alignments |
1603 |
+ |
const bool final_report = true; |
1604 |
+ |
read_best_alignments(curr_hit_group, best_hits, *gtf_junctions, |
1605 |
+ |
*junctions, *insertions, *deletions, *fusions, *coverage, |
1606 |
+ |
final_report); |
1607 |
+ |
|
1608 |
+ |
if (best_hits.hits.size() > 0) |
1609 |
+ |
{ |
1610 |
+ |
Read read; |
1611 |
+ |
bool got_read = reads_file.getRead(curr_obs_order, read, |
1612 |
+ |
reads_format, false, begin_id, end_id, |
1613 |
+ |
&um_out, false); |
1614 |
+ |
assert (got_read); |
1615 |
+ |
if (got_read) |
1616 |
+ |
{ |
1617 |
+ |
update_junctions(best_hits, *final_junctions); |
1618 |
+ |
update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions); |
1619 |
+ |
update_fusions(best_hits, *rt, *final_fusions, *fusions); |
1620 |
+ |
|
1621 |
+ |
print_sam_for_single(*rt, |
1622 |
+ |
best_hits, |
1623 |
+ |
fragment_type, |
1624 |
+ |
read.alt_name, |
1625 |
+ |
bam_writer, |
1626 |
+ |
rng); |
1627 |
+ |
} |
1628 |
+ |
} |
1629 |
+ |
} |
1630 |
+ |
|
1631 |
|
void operator()() |
1632 |
|
{ |
1633 |
+ |
rng.seed(1); |
1634 |
+ |
|
1635 |
|
ReadTable it; |
1636 |
|
GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str()); |
1637 |
|
|
1638 |
|
ReadStream left_reads_file(left_reads_fname); |
1639 |
|
if (left_reads_file.file() == NULL) |
1640 |
|
err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str()); |
1641 |
< |
|
1641 |
> |
if (left_reads_file.isBam()) { |
1642 |
> |
left_reads_file.use_alt_name(); |
1643 |
> |
left_reads_file.ignoreQC(); |
1644 |
> |
} |
1645 |
|
if (left_reads_offset > 0) |
1646 |
|
left_reads_file.seek(left_reads_offset); |
1647 |
|
|
1648 |
< |
if (!zpacker.empty()) left_um_fname+=".z"; |
1649 |
< |
FZPipe left_um_out; |
1650 |
< |
if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL) |
974 |
< |
err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str()); |
1648 |
> |
//if (!zpacker.empty()) left_um_fname+=".z"; |
1649 |
> |
GBamWriter* left_um_out=new GBamWriter(left_um_fname.c_str(), sam_header.c_str()); |
1650 |
> |
GBamWriter* right_um_out=NULL; |
1651 |
|
|
1652 |
|
ReadStream right_reads_file(right_reads_fname); |
1653 |
|
if (right_reads_offset > 0) |
1654 |
|
right_reads_file.seek(right_reads_offset); |
1655 |
|
|
1656 |
< |
FZPipe right_um_out; |
1657 |
< |
if (right_reads_fname != "") |
1656 |
> |
//FZPipe right_um_out; |
1657 |
> |
if (!right_reads_fname.empty()) |
1658 |
|
{ |
1659 |
< |
if (!zpacker.empty()) right_um_fname+=".z"; |
1660 |
< |
if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL) |
1661 |
< |
err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str()); |
1659 |
> |
if (right_reads_file.isBam()) { |
1660 |
> |
right_reads_file.use_alt_name(); |
1661 |
> |
right_reads_file.ignoreQC(); |
1662 |
> |
right_um_out=new GBamWriter(right_um_fname.c_str(), sam_header.c_str()); |
1663 |
> |
} |
1664 |
> |
//if (!zpacker.empty()) right_um_fname+=".z"; |
1665 |
> |
//if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL) |
1666 |
> |
// err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str()); |
1667 |
|
} |
1668 |
|
|
1669 |
|
vector<BAMHitFactory*> hit_factories; |
1686 |
|
uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); |
1687 |
|
uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); |
1688 |
|
|
1689 |
+ |
const bool final_report = true; |
1690 |
+ |
|
1691 |
|
// While we still have unreported hits... |
1692 |
|
Read l_read; |
1693 |
|
Read r_read; |
1699 |
|
curr_left_obs_order < end_id && |
1700 |
|
curr_left_obs_order != VMAXINT32) |
1701 |
|
{ |
1702 |
< |
HitsForRead best_hits; |
1703 |
< |
best_hits.insert_id = curr_left_obs_order; |
1704 |
< |
FragmentAlignmentGrade grade; |
1705 |
< |
bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read, |
1706 |
< |
reads_format, false, begin_id, end_id, |
1707 |
< |
left_um_out.file, false); |
1708 |
< |
//assert(got_read); |
1026 |
< |
|
1027 |
< |
if (right_reads_file.file()) { |
1028 |
< |
/* |
1029 |
< |
fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(), |
1030 |
< |
l_read.seq.c_str(), l_read.qual.c_str()); |
1031 |
< |
*/ |
1032 |
< |
got_read=right_reads_file.getRead(curr_left_obs_order, r_read, |
1033 |
< |
reads_format, false, begin_id, end_id, |
1034 |
< |
right_um_out.file, true); |
1035 |
< |
//assert(got_read); |
1036 |
< |
} |
1037 |
< |
|
1038 |
< |
exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group); |
1039 |
< |
|
1040 |
< |
// Process hits for left singleton, select best alignments |
1041 |
< |
read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions); |
1042 |
< |
|
1043 |
< |
if (best_hits.hits.size() > max_multihits) |
1044 |
< |
{ |
1045 |
< |
best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end()); |
1046 |
< |
grade.num_alignments = best_hits.hits.size(); |
1047 |
< |
} |
1048 |
< |
|
1049 |
< |
if (best_hits.hits.size() > 0) |
1050 |
< |
{ |
1051 |
< |
update_junctions(best_hits, *final_junctions); |
1052 |
< |
update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions); |
1053 |
< |
update_fusions(best_hits, *rt, *final_fusions, *fusions); |
1054 |
< |
|
1055 |
< |
print_sam_for_single(*rt, |
1056 |
< |
best_hits, |
1057 |
< |
grade, |
1058 |
< |
(right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT), |
1059 |
< |
l_read, |
1060 |
< |
bam_writer, |
1061 |
< |
left_um_out.file); |
1062 |
< |
} |
1702 |
> |
write_singleton_alignments(curr_left_obs_order, |
1703 |
> |
curr_left_hit_group, |
1704 |
> |
left_reads_file, |
1705 |
> |
bam_writer, |
1706 |
> |
*left_um_out, |
1707 |
> |
right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT); |
1708 |
> |
|
1709 |
|
// Get next hit group |
1710 |
|
left_hs.next_read_hits(curr_left_hit_group); |
1711 |
|
curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); |
1716 |
|
curr_right_obs_order < end_id && |
1717 |
|
curr_right_obs_order != VMAXINT32) |
1718 |
|
{ |
1719 |
< |
HitsForRead best_hits; |
1720 |
< |
best_hits.insert_id = curr_right_obs_order; |
1721 |
< |
FragmentAlignmentGrade grade; |
1722 |
< |
|
1723 |
< |
if (curr_right_obs_order >= begin_id) |
1724 |
< |
{ |
1079 |
< |
bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read, |
1080 |
< |
reads_format, false, begin_id, end_id, |
1081 |
< |
right_um_out.file, false); |
1082 |
< |
|
1083 |
< |
//assert(got_read); |
1084 |
< |
/* |
1085 |
< |
fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(), |
1086 |
< |
r_read.seq.c_str(), r_read.qual.c_str()); |
1087 |
< |
*/ |
1088 |
< |
got_read=left_reads_file.getRead(curr_right_obs_order, l_read, |
1089 |
< |
reads_format, false, begin_id, end_id, |
1090 |
< |
left_um_out.file, true); |
1091 |
< |
//assert(got_read); |
1092 |
< |
|
1093 |
< |
exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group); |
1094 |
< |
|
1095 |
< |
// Process hit for right singleton, select best alignments |
1096 |
< |
read_best_alignments(curr_right_hit_group, grade, best_hits, *gtf_junctions); |
1097 |
< |
|
1098 |
< |
if (best_hits.hits.size() > max_multihits) |
1099 |
< |
{ |
1100 |
< |
best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end()); |
1101 |
< |
grade.num_alignments = best_hits.hits.size(); |
1102 |
< |
} |
1103 |
< |
|
1104 |
< |
if (best_hits.hits.size() > 0) |
1105 |
< |
{ |
1106 |
< |
update_junctions(best_hits, *final_junctions); |
1107 |
< |
update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions); |
1108 |
< |
update_fusions(best_hits, *rt, *final_fusions, *fusions); |
1109 |
< |
|
1110 |
< |
print_sam_for_single(*rt, |
1111 |
< |
best_hits, |
1112 |
< |
grade, |
1113 |
< |
FRAG_RIGHT, |
1114 |
< |
r_read, |
1115 |
< |
bam_writer, |
1116 |
< |
right_um_out.file); |
1117 |
< |
} |
1118 |
< |
} |
1719 |
> |
write_singleton_alignments(curr_right_obs_order, |
1720 |
> |
curr_right_hit_group, |
1721 |
> |
right_reads_file, |
1722 |
> |
bam_writer, |
1723 |
> |
*right_um_out, |
1724 |
> |
FRAG_RIGHT); |
1725 |
|
|
1726 |
|
// Get next hit group |
1727 |
|
right_hs.next_read_hits(curr_right_hit_group); |
1734 |
|
curr_left_obs_order < end_id && |
1735 |
|
curr_left_obs_order != VMAXINT32) |
1736 |
|
{ |
1737 |
+ |
realign_reads(curr_left_hit_group, *rt, *junctions, *rev_junctions, |
1738 |
+ |
*insertions, *deletions, *rev_deletions, *fusions); |
1739 |
|
exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group); |
1740 |
+ |
|
1741 |
+ |
realign_reads(curr_right_hit_group, *rt, *junctions, *rev_junctions, |
1742 |
+ |
*insertions, *deletions, *rev_deletions, *fusions); |
1743 |
|
exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group); |
1744 |
< |
if (curr_left_hit_group.hits.empty()) |
1745 |
< |
{ //only right read mapped |
1135 |
< |
//write it in the mapped file with the #MAPPED# flag |
1136 |
< |
|
1137 |
< |
bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read, |
1138 |
< |
reads_format, false, begin_id, end_id, |
1139 |
< |
right_um_out.file, false); |
1140 |
< |
//assert(got_read); |
1141 |
< |
/* |
1142 |
< |
fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(), |
1143 |
< |
r_read.seq.c_str(), r_read.qual.c_str()); |
1144 |
< |
*/ |
1145 |
< |
HitsForRead right_best_hits; |
1146 |
< |
right_best_hits.insert_id = curr_right_obs_order; |
1147 |
< |
|
1148 |
< |
FragmentAlignmentGrade grade; |
1149 |
< |
read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions); |
1744 |
> |
|
1745 |
> |
vector<pair<BowtieHit, BowtieHit> > best_hits; |
1746 |
|
|
1747 |
< |
if (right_best_hits.hits.size() > max_multihits) |
1748 |
< |
{ |
1749 |
< |
right_best_hits.hits.erase(right_best_hits.hits.begin() + max_multihits, right_best_hits.hits.end()); |
1750 |
< |
grade.num_alignments = right_best_hits.hits.size(); |
1155 |
< |
} |
1156 |
< |
|
1157 |
< |
if (right_best_hits.hits.size() > 0) |
1158 |
< |
{ |
1159 |
< |
update_junctions(right_best_hits, *final_junctions); |
1160 |
< |
update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions); |
1161 |
< |
update_fusions(right_best_hits, *rt, *final_fusions, *fusions); |
1162 |
< |
|
1163 |
< |
print_sam_for_single(*rt, |
1164 |
< |
right_best_hits, |
1165 |
< |
grade, |
1166 |
< |
FRAG_RIGHT, |
1167 |
< |
r_read, |
1168 |
< |
bam_writer, |
1169 |
< |
right_um_out.file); |
1170 |
< |
} |
1171 |
< |
} |
1172 |
< |
else if (curr_right_hit_group.hits.empty()) |
1747 |
> |
bool paired_alignments = curr_left_hit_group.hits.size() > 0 && curr_right_hit_group.hits.size() > 0; |
1748 |
> |
InsertAlignmentGrade grade; |
1749 |
> |
|
1750 |
> |
if (paired_alignments) |
1751 |
|
{ |
1752 |
< |
HitsForRead left_best_hits; |
1753 |
< |
left_best_hits.insert_id = curr_left_obs_order; |
1754 |
< |
//only left read mapped |
1755 |
< |
bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read, |
1756 |
< |
reads_format, false, begin_id, end_id, |
1757 |
< |
left_um_out.file, false); |
1758 |
< |
//assert(got_read); |
1759 |
< |
/* |
1760 |
< |
fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(), |
1761 |
< |
l_read.seq.c_str(), l_read.qual.c_str()); |
1184 |
< |
*/ |
1185 |
< |
FragmentAlignmentGrade grade; |
1186 |
< |
// Process hits for left singleton, select best alignments |
1187 |
< |
read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions); |
1752 |
> |
pair_best_alignments(curr_left_hit_group, |
1753 |
> |
curr_right_hit_group, |
1754 |
> |
grade, best_hits, *gtf_junctions, |
1755 |
> |
*junctions, *insertions, *deletions, *fusions, *coverage, |
1756 |
> |
final_report); |
1757 |
> |
|
1758 |
> |
if (best_hits.size() <= 0 || |
1759 |
> |
(grade.fusion && !fusion_search && !report_discordant_pair_alignments)) |
1760 |
> |
paired_alignments = false; |
1761 |
> |
} |
1762 |
|
|
1763 |
< |
if (left_best_hits.hits.size() > max_multihits) |
1763 |
> |
if (paired_alignments) |
1764 |
> |
{ |
1765 |
> |
HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order; |
1766 |
> |
HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order; |
1767 |
> |
|
1768 |
> |
for (size_t i = 0; i < best_hits.size(); ++i) |
1769 |
|
{ |
1770 |
< |
left_best_hits.hits.erase(left_best_hits.hits.begin() + max_multihits, left_best_hits.hits.end()); |
1771 |
< |
grade.num_alignments = left_best_hits.hits.size(); |
1770 |
> |
best_left_hit_group.hits.push_back(best_hits[i].first); |
1771 |
> |
best_right_hit_group.hits.push_back(best_hits[i].second); |
1772 |
|
} |
1773 |
|
|
1774 |
< |
if (left_best_hits.hits.size() > 0) |
1774 |
> |
if (best_hits.size() > 0) |
1775 |
|
{ |
1776 |
< |
update_junctions(left_best_hits, *final_junctions); |
1777 |
< |
update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions); |
1778 |
< |
update_fusions(left_best_hits, *rt, *final_fusions, *fusions); |
1776 |
> |
bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), l_read, |
1777 |
> |
reads_format, false, begin_id, end_id, |
1778 |
> |
left_um_out, false); |
1779 |
|
|
1780 |
< |
print_sam_for_single(*rt, |
1781 |
< |
left_best_hits, |
1782 |
< |
grade, |
1783 |
< |
FRAG_LEFT, |
1784 |
< |
l_read, |
1785 |
< |
bam_writer, |
1786 |
< |
left_um_out.file); |
1780 |
> |
bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), r_read, |
1781 |
> |
reads_format, false, begin_id, end_id, |
1782 |
> |
right_um_out, false); |
1783 |
> |
|
1784 |
> |
assert (got_left_read && got_right_read); |
1785 |
> |
|
1786 |
> |
if (got_left_read && got_right_read) |
1787 |
> |
{ |
1788 |
> |
update_junctions(best_left_hit_group, *final_junctions); |
1789 |
> |
update_insertions_and_deletions(best_left_hit_group, *final_insertions, *final_deletions); |
1790 |
> |
update_fusions(best_left_hit_group, *rt, *final_fusions, *fusions); |
1791 |
> |
|
1792 |
> |
update_junctions(best_right_hit_group, *final_junctions); |
1793 |
> |
update_insertions_and_deletions(best_right_hit_group, *final_insertions, *final_deletions); |
1794 |
> |
update_fusions(best_right_hit_group, *rt, *final_fusions, *fusions); |
1795 |
> |
|
1796 |
> |
pair_support(best_hits, *final_fusions, *fusions); |
1797 |
> |
|
1798 |
> |
print_sam_for_pair(*rt, |
1799 |
> |
best_hits, |
1800 |
> |
grade, |
1801 |
> |
bam_writer, |
1802 |
> |
l_read.alt_name, |
1803 |
> |
r_read.alt_name, |
1804 |
> |
rng, |
1805 |
> |
begin_id, |
1806 |
> |
end_id); |
1807 |
> |
} |
1808 |
|
} |
1809 |
|
} |
1810 |
|
else |
1811 |
|
{ |
1812 |
< |
vector<pair<BowtieHit, BowtieHit> > best_hits; |
1213 |
< |
|
1214 |
< |
InsertAlignmentGrade grade; |
1215 |
< |
pair_best_alignments(curr_left_hit_group, |
1216 |
< |
curr_right_hit_group, |
1217 |
< |
grade, |
1218 |
< |
best_hits, |
1219 |
< |
*gtf_junctions); |
1220 |
< |
|
1221 |
< |
if (best_hits.size() > max_multihits) |
1812 |
> |
if (curr_left_hit_group.hits.size() > 0) |
1813 |
|
{ |
1814 |
< |
best_hits.erase(best_hits.begin() + max_multihits, best_hits.end()); |
1815 |
< |
grade.num_alignments = best_hits.size(); |
1816 |
< |
} |
1817 |
< |
|
1818 |
< |
//hits for both left and right reads |
1819 |
< |
HitsForRead single_best_hits; |
1229 |
< |
single_best_hits.insert_id = curr_left_obs_order; |
1230 |
< |
for (size_t i = 0; i < best_hits.size(); ++i) |
1231 |
< |
{ |
1232 |
< |
single_best_hits.hits.push_back(best_hits[i].first); |
1233 |
< |
single_best_hits.hits.push_back(best_hits[i].second); |
1814 |
> |
write_singleton_alignments(curr_left_obs_order, |
1815 |
> |
curr_left_hit_group, |
1816 |
> |
left_reads_file, |
1817 |
> |
bam_writer, |
1818 |
> |
*left_um_out, |
1819 |
> |
(right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT)); |
1820 |
|
} |
1821 |
|
|
1822 |
< |
if (single_best_hits.hits.size() > 0) |
1823 |
< |
{ |
1824 |
< |
update_junctions(single_best_hits, *final_junctions); |
1825 |
< |
update_insertions_and_deletions(single_best_hits, *final_insertions, *final_deletions); |
1826 |
< |
|
1827 |
< |
pair_support(best_hits, *final_fusions, *fusions); |
1828 |
< |
update_fusions(single_best_hits, *rt, *final_fusions, *fusions); |
1829 |
< |
|
1830 |
< |
print_sam_for_pair(*rt, |
1245 |
< |
best_hits, |
1246 |
< |
grade, |
1247 |
< |
left_reads_file, |
1248 |
< |
right_reads_file, |
1249 |
< |
bam_writer, |
1250 |
< |
left_um_out.file, |
1251 |
< |
right_um_out.file, |
1252 |
< |
begin_id, |
1253 |
< |
end_id); |
1822 |
> |
if (curr_right_hit_group.hits.size() > 0) |
1823 |
> |
{ //only right read mapped |
1824 |
> |
//write it in the mapped file with the #MAPPED# flag |
1825 |
> |
write_singleton_alignments(curr_right_obs_order, |
1826 |
> |
curr_right_hit_group, |
1827 |
> |
right_reads_file, |
1828 |
> |
bam_writer, |
1829 |
> |
*right_um_out, |
1830 |
> |
FRAG_RIGHT); |
1831 |
|
} |
1832 |
|
} |
1833 |
|
|
1839 |
|
} |
1840 |
|
} //while we still have unreported hits.. |
1841 |
|
//print the remaining unmapped reads at the end of each reads' stream |
1842 |
+ |
|
1843 |
|
left_reads_file.getRead(VMAXINT32, l_read, |
1844 |
|
reads_format, false, begin_id, end_id, |
1845 |
< |
left_um_out.file, false); |
1845 |
> |
left_um_out, false); |
1846 |
|
if (right_reads_file.file()) |
1847 |
|
right_reads_file.getRead(VMAXINT32, r_read, |
1848 |
|
reads_format, false, begin_id, end_id, |
1849 |
< |
right_um_out.file, false); |
1849 |
> |
right_um_out, false); |
1850 |
> |
|
1851 |
> |
|
1852 |
> |
// pclose (pipe close), which waits for a process to end, seems to conflict with boost::thread::join somehow, |
1853 |
> |
// resulting in deadlock like behavior. |
1854 |
> |
delete left_um_out; |
1855 |
> |
delete right_um_out; |
1856 |
|
|
1273 |
– |
left_um_out.close(); |
1274 |
– |
right_um_out.close(); |
1857 |
|
|
1858 |
|
for (size_t i = 0; i < hit_factories.size(); ++i) |
1859 |
|
delete hit_factories[i]; |
1873 |
|
string right_um_fname; |
1874 |
|
|
1875 |
|
JunctionSet* gtf_junctions; |
1876 |
+ |
|
1877 |
|
JunctionSet* junctions; |
1878 |
+ |
JunctionSet* rev_junctions; |
1879 |
+ |
InsertionSet* insertions; |
1880 |
+ |
DeletionSet* deletions; |
1881 |
+ |
DeletionSet* rev_deletions; |
1882 |
|
FusionSet* fusions; |
1883 |
+ |
Coverage* coverage; |
1884 |
+ |
|
1885 |
|
JunctionSet* final_junctions; |
1886 |
|
InsertionSet* final_insertions; |
1887 |
|
DeletionSet* final_deletions; |
1895 |
|
int64_t left_map_offset; |
1896 |
|
int64_t right_reads_offset; |
1897 |
|
int64_t right_map_offset; |
1898 |
+ |
|
1899 |
+ |
boost::random::mt19937 rng; |
1900 |
|
}; |
1901 |
|
|
1902 |
|
void driver(const string& bam_output_fname, |
1914 |
|
num_threads = 1; |
1915 |
|
|
1916 |
|
RefSequenceTable rt(sam_header, true); |
1917 |
< |
|
1327 |
< |
if (fusion_search) |
1328 |
< |
get_seqs(ref_stream, rt, true); |
1917 |
> |
get_seqs(ref_stream, rt, true); |
1918 |
|
|
1919 |
|
srandom(1); |
1920 |
|
|
1921 |
|
JunctionSet gtf_junctions; |
1922 |
|
if (!gtf_juncs.empty()) |
1923 |
|
{ |
1924 |
< |
char splice_buf[2048]; |
1924 |
> |
char splice_buf[4096]; |
1925 |
|
FILE* splice_coords = fopen(gtf_juncs.c_str(), "r"); |
1926 |
|
if (splice_coords) |
1927 |
|
{ |
1948 |
|
uint32_t right_coord = atoi(scan_right_coord); |
1949 |
|
bool antisense = *orientation == '-'; |
1950 |
|
|
1951 |
< |
// add 1 to left_coord to meet BED format |
1952 |
< |
gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats())); |
1951 |
> |
JunctionStats junction_stat; |
1952 |
> |
junction_stat.gtf_match = true; |
1953 |
> |
junction_stat.accepted = true; |
1954 |
> |
|
1955 |
> |
gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), junction_stat)); |
1956 |
|
} |
1957 |
|
} |
1958 |
|
fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str()); |
1976 |
|
} |
1977 |
|
|
1978 |
|
JunctionSet vjunctions[num_threads]; |
1979 |
+ |
InsertionSet vinsertions[num_threads]; |
1980 |
+ |
DeletionSet vdeletions[num_threads]; |
1981 |
|
FusionSet vfusions[num_threads]; |
1982 |
+ |
Coverage vcoverages[num_threads]; |
1983 |
+ |
|
1984 |
|
vector<boost::thread*> threads; |
1985 |
|
for (int i = 0; i < num_threads; ++i) |
1986 |
|
{ |
1988 |
|
|
1989 |
|
worker.left_map_fname = left_map_fname; |
1990 |
|
worker.right_map_fname = right_map_fname; |
1991 |
+ |
worker.rt = &rt; |
1992 |
+ |
worker.gtf_junctions = >f_junctions; |
1993 |
+ |
|
1994 |
|
worker.junctions = &vjunctions[i]; |
1995 |
+ |
worker.insertions = &vinsertions[i]; |
1996 |
+ |
worker.deletions = &vdeletions[i]; |
1997 |
|
worker.fusions = &vfusions[i]; |
1998 |
< |
worker.gtf_junctions = >f_junctions; |
1398 |
< |
worker.rt = &rt; |
1998 |
> |
worker.coverage = &vcoverages[i]; |
1999 |
|
|
2000 |
|
worker.right_map_offset = 0; |
2001 |
|
|
2023 |
|
} |
2024 |
|
|
2025 |
|
for (size_t i = 0; i < threads.size(); ++i) |
2026 |
< |
{ |
2026 |
> |
{ |
2027 |
|
threads[i]->join(); |
2028 |
|
delete threads[i]; |
2029 |
|
threads[i] = NULL; |
2030 |
|
} |
2031 |
|
threads.clear(); |
2032 |
|
|
2033 |
< |
JunctionSet junctions = vjunctions[0]; |
2034 |
< |
FusionSet fusions = vfusions[0]; |
2035 |
< |
for (size_t i = 1; i < num_threads; ++i) |
2033 |
> |
JunctionSet& junctions = vjunctions[0]; |
2034 |
> |
InsertionSet& insertions = vinsertions[0]; |
2035 |
> |
DeletionSet& deletions = vdeletions[0]; |
2036 |
> |
FusionSet& fusions = vfusions[0]; |
2037 |
> |
Coverage& coverage = vcoverages[0]; |
2038 |
> |
for (int i = 1; i < num_threads; ++i) |
2039 |
|
{ |
2040 |
|
merge_with(junctions, vjunctions[i]); |
2041 |
|
vjunctions[i].clear(); |
2042 |
|
|
2043 |
+ |
merge_with(insertions, vinsertions[i]); |
2044 |
+ |
vinsertions[i].clear(); |
2045 |
+ |
|
2046 |
+ |
merge_with(deletions, vdeletions[i]); |
2047 |
+ |
vdeletions[i].clear(); |
2048 |
+ |
|
2049 |
|
merge_with(fusions, vfusions[i]); |
2050 |
|
vfusions[i].clear(); |
2051 |
+ |
|
2052 |
+ |
coverage.merge_with(vcoverages[i]); |
2053 |
+ |
vcoverages[i].clear(); |
2054 |
+ |
} |
2055 |
+ |
|
2056 |
+ |
merge_with(junctions, gtf_junctions); |
2057 |
+ |
|
2058 |
+ |
coverage.calculate_coverage(); |
2059 |
+ |
|
2060 |
+ |
JunctionSet rev_junctions; |
2061 |
+ |
JunctionSet::const_iterator junction_iter = junctions.begin(); |
2062 |
+ |
for (; junction_iter != junctions.end(); ++junction_iter) |
2063 |
+ |
{ |
2064 |
+ |
const Junction& junction = junction_iter->first; |
2065 |
+ |
Junction rev_junction = junction; |
2066 |
+ |
rev_junction.left = junction.right; |
2067 |
+ |
rev_junction.right = junction.left; |
2068 |
+ |
rev_junctions[rev_junction] = junction_iter->second; |
2069 |
|
} |
2070 |
|
|
2071 |
+ |
DeletionSet rev_deletions; |
2072 |
+ |
#if 0 |
2073 |
+ |
DeletionSet::const_iterator deletion_iter = deletions.begin(); |
2074 |
+ |
for (; deletion_iter != deletions.end(); ++deletion_iter) |
2075 |
+ |
{ |
2076 |
+ |
const Deletion& deletion = deletion_iter->first; |
2077 |
+ |
Deletion rev_deletion = deletion; |
2078 |
+ |
rev_deletion.left = deletion.right; |
2079 |
+ |
rev_deletion.right = deletion.left; |
2080 |
+ |
rev_deletions[rev_deletion] = deletion_iter->second; |
2081 |
+ |
} |
2082 |
+ |
#endif |
2083 |
+ |
|
2084 |
|
size_t num_unfiltered_juncs = junctions.size(); |
2085 |
|
fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs); |
2086 |
|
|
2087 |
|
// Read hits, extract junctions, and toss the ones that arent strongly enough supported. |
1448 |
– |
|
2088 |
|
filter_junctions(junctions, gtf_junctions); |
2089 |
|
|
2090 |
|
//size_t num_juncs_after_filter = junctions.size(); |
2092 |
|
// num_unfiltered_juncs - num_juncs_after_filter); |
2093 |
|
|
2094 |
|
/* |
2095 |
< |
size_t small_overhangs = 0; |
2096 |
< |
for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i) |
2095 |
> |
size_t small_overhangs = 0; |
2096 |
> |
for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i) |
2097 |
|
{ |
2098 |
< |
if (i->second.accepted && |
2099 |
< |
(i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len)) |
2100 |
< |
{ |
2101 |
< |
small_overhangs++; |
2102 |
< |
} |
2098 |
> |
if (i->second.accepted && |
2099 |
> |
(i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len)) |
2100 |
> |
{ |
2101 |
> |
small_overhangs++; |
2102 |
> |
} |
2103 |
|
} |
2104 |
|
|
2105 |
< |
if (small_overhangs >0) |
2106 |
< |
fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs); |
2105 |
> |
if (small_overhangs >0) |
2106 |
> |
fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs); |
2107 |
|
*/ |
2108 |
|
|
2109 |
|
JunctionSet vfinal_junctions[num_threads]; |
2115 |
|
{ |
2116 |
|
ReportWorker worker; |
2117 |
|
|
2118 |
+ |
worker.sam_header_fname = sam_header; |
2119 |
|
char filename[1024] = {0}; |
2120 |
|
sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i); |
2121 |
|
worker.bam_output_fname = filename; |
2122 |
< |
worker.sam_header_fname = sam_header; |
2123 |
< |
sprintf(filename, "%s/unmapped_left%d.fq", output_dir.c_str(), i); |
2124 |
< |
worker.left_um_fname = filename; |
2122 |
> |
string tmpoutdir=getFdir(worker.bam_output_fname); |
2123 |
> |
worker.left_um_fname = tmpoutdir; |
2124 |
> |
sprintf(filename, "unmapped_left_%d.bam", i); |
2125 |
> |
worker.left_um_fname+=filename; |
2126 |
|
|
2127 |
|
if (right_reads_fname != "") |
2128 |
|
{ |
2129 |
< |
sprintf(filename, "%s/unmapped_right%d.fq", output_dir.c_str(), i); |
2130 |
< |
worker.right_um_fname = filename; |
2129 |
> |
sprintf(filename, "unmapped_right_%d.bam", i); |
2130 |
> |
worker.right_um_fname = tmpoutdir; |
2131 |
> |
worker.right_um_fname += filename; |
2132 |
|
} |
2133 |
|
|
2134 |
|
worker.left_reads_fname = left_reads_fname; |
2138 |
|
|
2139 |
|
worker.gtf_junctions = >f_junctions; |
2140 |
|
worker.junctions = &junctions; |
2141 |
+ |
worker.rev_junctions = &rev_junctions; |
2142 |
+ |
worker.insertions = &insertions; |
2143 |
+ |
worker.deletions = &deletions; |
2144 |
+ |
worker.rev_deletions = &rev_deletions; |
2145 |
|
worker.fusions = &fusions; |
2146 |
+ |
worker.coverage = &coverage; |
2147 |
+ |
|
2148 |
|
worker.final_junctions = &vfinal_junctions[i]; |
2149 |
|
worker.final_insertions = &vfinal_insertions[i]; |
2150 |
|
worker.final_deletions = &vfinal_deletions[i]; |
2194 |
|
InsertionSet final_insertions = vfinal_insertions[0]; |
2195 |
|
DeletionSet final_deletions = vfinal_deletions[0]; |
2196 |
|
FusionSet final_fusions = vfinal_fusions[0]; |
2197 |
< |
for (size_t i = 1; i < num_threads; ++i) |
2197 |
> |
for (int i = 1; i < num_threads; ++i) |
2198 |
|
{ |
2199 |
|
merge_with(final_junctions, vfinal_junctions[i]); |
2200 |
|
vfinal_junctions[i].clear(); |
2330 |
|
|
2331 |
|
string left_reads_filename = argv[optind++]; |
2332 |
|
string unzcmd=getUnpackCmd(left_reads_filename, false); |
1685 |
– |
|
2333 |
|
string right_map_filename; |
2334 |
|
string right_reads_filename; |
2335 |
|
|