27 |
|
#include "bwt_map.h" |
28 |
|
#include "tokenize.h" |
29 |
|
#include "reads.h" |
30 |
< |
|
31 |
< |
using namespace std; |
30 |
> |
#include "align_status.h" |
31 |
|
|
32 |
|
void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness) |
33 |
|
{ |
245 |
|
unsigned char splice_mms, |
246 |
|
bool end) |
247 |
|
{ |
248 |
< |
uint64_t insert_id = _insert_table.get_id(insert_name); |
249 |
< |
|
250 |
< |
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
251 |
< |
uint32_t reference_id2 = reference_id; |
252 |
< |
|
253 |
< |
if (ref_name2.length() > 0) |
254 |
< |
reference_id2 = _ref_table.get_id(ref_name2, NULL, 0); |
255 |
< |
|
256 |
< |
return BowtieHit(reference_id, |
257 |
< |
reference_id2, |
258 |
< |
insert_id, |
259 |
< |
left, |
260 |
< |
cigar, |
261 |
< |
antisense_aln, |
262 |
< |
antisense_splice, |
263 |
< |
edit_dist, |
264 |
< |
splice_mms, |
265 |
< |
end); |
248 |
> |
uint64_t insert_id = _insert_table.get_id(insert_name); |
249 |
> |
|
250 |
> |
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
251 |
> |
uint32_t reference_id2 = reference_id; |
252 |
> |
|
253 |
> |
if (ref_name2.length() > 0) |
254 |
> |
reference_id2 = _ref_table.get_id(ref_name2, NULL, 0); |
255 |
> |
|
256 |
> |
return BowtieHit(reference_id, |
257 |
> |
reference_id2, |
258 |
> |
insert_id, |
259 |
> |
left, |
260 |
> |
cigar, |
261 |
> |
antisense_aln, |
262 |
> |
antisense_splice, |
263 |
> |
edit_dist, |
264 |
> |
splice_mms, |
265 |
> |
end); |
266 |
|
} |
267 |
|
|
268 |
|
BowtieHit HitFactory::create_hit(const string& insert_name, |
273 |
|
unsigned char edit_dist, |
274 |
|
bool end) |
275 |
|
{ |
276 |
< |
uint64_t insert_id = _insert_table.get_id(insert_name); |
277 |
< |
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
278 |
< |
|
279 |
< |
return BowtieHit(reference_id, |
280 |
< |
reference_id, |
281 |
< |
insert_id, |
282 |
< |
left, |
283 |
< |
read_len, |
284 |
< |
antisense_aln, |
285 |
< |
edit_dist, |
286 |
< |
end); |
287 |
< |
} |
288 |
< |
|
289 |
< |
bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
290 |
< |
BowtieHit& bh, |
291 |
< |
bool strip_slash, |
292 |
< |
char* name_out, |
293 |
< |
char* name_tags, |
294 |
< |
char* seq, |
295 |
< |
char* qual) |
276 |
> |
uint64_t insert_id = _insert_table.get_id(insert_name); |
277 |
> |
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
278 |
> |
|
279 |
> |
return BowtieHit(reference_id, |
280 |
> |
reference_id, |
281 |
> |
insert_id, |
282 |
> |
left, |
283 |
> |
read_len, |
284 |
> |
antisense_aln, |
285 |
> |
edit_dist, |
286 |
> |
end); |
287 |
> |
} |
288 |
> |
|
289 |
> |
bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
290 |
> |
BowtieHit& bh, |
291 |
> |
bool strip_slash, |
292 |
> |
char* name_out, |
293 |
> |
char* name_tags, |
294 |
> |
char* seq, |
295 |
> |
char* qual) |
296 |
|
{ |
297 |
< |
if (!orig_bwt_buf || !*orig_bwt_buf) |
298 |
< |
return false; |
300 |
< |
|
301 |
< |
static const int buf_size = 2048; |
302 |
< |
|
303 |
< |
char bwt_buf[buf_size]; |
304 |
< |
strcpy(bwt_buf, orig_bwt_buf); |
305 |
< |
//const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s"; |
297 |
> |
if (!orig_bwt_buf || !*orig_bwt_buf) |
298 |
> |
return false; |
299 |
|
|
300 |
< |
char orientation; |
308 |
< |
|
309 |
< |
//int bwtf_ret = 0; |
310 |
< |
//uint32_t seqid = 0; |
311 |
< |
|
312 |
< |
char text_name[buf_size]; |
313 |
< |
unsigned int text_offset; |
314 |
< |
|
315 |
< |
|
316 |
< |
//unsigned int other_occs; |
317 |
< |
char mismatches[buf_size]; |
318 |
< |
//memset(mismatches, 0, sizeof(mismatches)); |
319 |
< |
|
320 |
< |
const char* buf = bwt_buf; |
321 |
< |
char* name = get_token((char**)&buf,"\t"); |
322 |
< |
char* orientation_str = get_token((char**)&buf,"\t"); |
323 |
< |
char* text_name_str = get_token((char**)&buf,"\t"); |
324 |
< |
|
325 |
< |
strcpy(text_name, text_name_str); |
326 |
< |
|
327 |
< |
char* text_offset_str = get_token((char**)&buf,"\t"); |
328 |
< |
char* seq_str = get_token((char**)&buf,"\t"); |
329 |
< |
if (seq) |
330 |
< |
strcpy(seq, seq_str); |
331 |
< |
|
332 |
< |
const char* qual_str = get_token((char**)&buf,"\t"); |
333 |
< |
if (qual) |
334 |
< |
strcpy(qual, qual_str); |
335 |
< |
|
336 |
< |
/*const char* other_occs_str =*/ get_token((char**)&buf, "\t"); |
337 |
< |
mismatches[0] = 0; |
338 |
< |
char* mismatches_str = get_token((char**)&buf, "\t"); |
339 |
< |
if (mismatches_str) |
340 |
< |
strcpy(mismatches, mismatches_str); |
341 |
< |
|
342 |
< |
orientation = orientation_str[0]; |
343 |
< |
text_offset = atoi(text_offset_str); |
300 |
> |
static const int buf_size = 2048; |
301 |
|
|
302 |
< |
bool end = true; |
303 |
< |
unsigned int seg_offset = 0; |
304 |
< |
unsigned int seg_num = 0; |
348 |
< |
unsigned int num_segs = 0; |
349 |
< |
|
350 |
< |
// Copy the tag out of the name field before we might wipe it out |
351 |
< |
char* pipe = strrchr(name, '|'); |
352 |
< |
if (pipe) |
353 |
< |
{ |
354 |
< |
if (name_tags) |
355 |
< |
strcpy(name_tags, pipe); |
302 |
> |
char bwt_buf[buf_size]; |
303 |
> |
strcpy(bwt_buf, orig_bwt_buf); |
304 |
> |
//const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s"; |
305 |
|
|
306 |
< |
char* tag_buf = pipe + 1; |
307 |
< |
if (strchr(tag_buf, ':')) |
308 |
< |
{ |
309 |
< |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
310 |
< |
if (seg_num + 1 == num_segs) |
311 |
< |
end = true; |
312 |
< |
else |
313 |
< |
end = false; |
314 |
< |
} |
315 |
< |
|
316 |
< |
*pipe = 0; |
306 |
> |
char orientation; |
307 |
> |
|
308 |
> |
//int bwtf_ret = 0; |
309 |
> |
//uint32_t seqid = 0; |
310 |
> |
|
311 |
> |
char text_name[buf_size]; |
312 |
> |
unsigned int text_offset; |
313 |
> |
|
314 |
> |
|
315 |
> |
//unsigned int other_occs; |
316 |
> |
char mismatches[buf_size]; |
317 |
> |
//memset(mismatches, 0, sizeof(mismatches)); |
318 |
> |
|
319 |
> |
const char* buf = bwt_buf; |
320 |
> |
char* name = get_token((char**)&buf,"\t"); |
321 |
> |
char* orientation_str = get_token((char**)&buf,"\t"); |
322 |
> |
char* text_name_str = get_token((char**)&buf,"\t"); |
323 |
> |
|
324 |
> |
strcpy(text_name, text_name_str); |
325 |
> |
|
326 |
> |
char* text_offset_str = get_token((char**)&buf,"\t"); |
327 |
> |
char* seq_str = get_token((char**)&buf,"\t"); |
328 |
> |
if (seq) |
329 |
> |
strcpy(seq, seq_str); |
330 |
> |
|
331 |
> |
const char* qual_str = get_token((char**)&buf,"\t"); |
332 |
> |
if (qual) |
333 |
> |
strcpy(qual, qual_str); |
334 |
> |
|
335 |
> |
/*const char* other_occs_str =*/ get_token((char**)&buf, "\t"); |
336 |
> |
mismatches[0] = 0; |
337 |
> |
char* mismatches_str = get_token((char**)&buf, "\t"); |
338 |
> |
if (mismatches_str) |
339 |
> |
strcpy(mismatches, mismatches_str); |
340 |
> |
|
341 |
> |
orientation = orientation_str[0]; |
342 |
> |
text_offset = atoi(text_offset_str); |
343 |
> |
|
344 |
> |
bool end = true; |
345 |
> |
unsigned int seg_offset = 0; |
346 |
> |
unsigned int seg_num = 0; |
347 |
> |
unsigned int num_segs = 0; |
348 |
> |
|
349 |
> |
// Copy the tag out of the name field before we might wipe it out |
350 |
> |
char* pipe = strrchr(name, '|'); |
351 |
> |
if (pipe) |
352 |
> |
{ |
353 |
> |
if (name_tags) |
354 |
> |
strcpy(name_tags, pipe); |
355 |
> |
|
356 |
> |
char* tag_buf = pipe + 1; |
357 |
> |
if (strchr(tag_buf, ':')) |
358 |
> |
{ |
359 |
> |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
360 |
> |
if (seg_num + 1 == num_segs) |
361 |
> |
end = true; |
362 |
> |
else |
363 |
> |
end = false; |
364 |
|
} |
369 |
– |
// Stripping the slash and number following it gives the insert name |
370 |
– |
char* slash = strrchr(name, '/'); |
371 |
– |
if (strip_slash && slash) |
372 |
– |
*slash = 0; |
373 |
– |
|
374 |
– |
size_t read_len = strlen(seq_str); |
375 |
– |
|
376 |
– |
// Add this alignment to the table of hits for this half of the |
377 |
– |
// Bowtie map |
365 |
|
|
366 |
< |
//vector<string> mismatch_toks; |
367 |
< |
char* pch = strtok (mismatches,","); |
368 |
< |
unsigned char num_mismatches = 0; |
369 |
< |
while (pch != NULL) |
366 |
> |
*pipe = 0; |
367 |
> |
} |
368 |
> |
// Stripping the slash and number following it gives the insert name |
369 |
> |
char* slash = strrchr(name, '/'); |
370 |
> |
if (strip_slash && slash) |
371 |
> |
*slash = 0; |
372 |
> |
|
373 |
> |
size_t read_len = strlen(seq_str); |
374 |
> |
|
375 |
> |
// Add this alignment to the table of hits for this half of the |
376 |
> |
// Bowtie map |
377 |
> |
|
378 |
> |
//vector<string> mismatch_toks; |
379 |
> |
char* pch = strtok (mismatches,","); |
380 |
> |
unsigned char num_mismatches = 0; |
381 |
> |
while (pch != NULL) |
382 |
> |
{ |
383 |
> |
char* colon = strchr(pch, ':'); |
384 |
> |
if (colon) |
385 |
|
{ |
386 |
< |
char* colon = strchr(pch, ':'); |
385 |
< |
if (colon) |
386 |
< |
{ |
387 |
< |
num_mismatches++; |
388 |
< |
} |
389 |
< |
//mismatch_toks.push_back(pch); |
390 |
< |
pch = strtok (NULL, ","); |
386 |
> |
num_mismatches++; |
387 |
|
} |
388 |
< |
|
389 |
< |
bh = create_hit(name, |
390 |
< |
text_name, |
391 |
< |
text_offset, |
392 |
< |
(int)read_len, |
393 |
< |
orientation == '-', |
394 |
< |
num_mismatches, |
395 |
< |
end); |
396 |
< |
|
397 |
< |
return true; |
388 |
> |
//mismatch_toks.push_back(pch); |
389 |
> |
pch = strtok (NULL, ","); |
390 |
> |
} |
391 |
> |
|
392 |
> |
bh = create_hit(name, |
393 |
> |
text_name, |
394 |
> |
text_offset, |
395 |
> |
(int)read_len, |
396 |
> |
orientation == '-', |
397 |
> |
num_mismatches, |
398 |
> |
end); |
399 |
> |
|
400 |
> |
return true; |
401 |
|
} |
402 |
|
|
403 |
|
int anchor_mismatch = 0; |
1119 |
|
cigar.push_back(op); |
1120 |
|
} |
1121 |
|
|
1122 |
< |
int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches, |
1123 |
< |
int &left, int spl_start, int spl_len, CigarOpCode spl_code) { |
1124 |
< |
//merge the original 'cigar' with the new insert/gap operation |
1125 |
< |
//at position spl_start and place the result into splcigar; |
1126 |
< |
//TODO: ideally this should also get and rebuild the MD string (alignment mismatches) |
1127 |
< |
int spl_mismatches=0; |
1128 |
< |
//return value: mismatches in the insert region for INS case, |
1129 |
< |
//or number of mismatches in the anchor region |
1130 |
< |
//return -1 if somehow the hit seems bad |
1131 |
< |
|
1132 |
< |
//these offsets are relative to the beginning of alignment on reference |
1133 |
< |
int spl_ofs=abs(spl_start-left); //relative position of splice op |
1134 |
< |
int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op |
1135 |
< |
CigarOp gapop(spl_code, spl_len); //for DEL, REF_SKIP, FUSIONS |
1136 |
< |
if (spl_code==INS) |
1137 |
< |
spl_ofs_end += spl_len; |
1138 |
< |
int ref_ofs=0; //working offset on reference |
1139 |
< |
int read_ofs=0; //working offset on the read, relative to the leftmost aligned base |
1140 |
< |
bool xfound=false; |
1141 |
< |
//if (left<=spl_start+spl_len) { |
1142 |
< |
if (spl_ofs_end>0) { |
1143 |
< |
int prev_opcode=0; |
1144 |
< |
int prev_oplen=0; |
1145 |
< |
for (size_t c = 0 ; c < cigar.size(); ++c) |
1122 |
> |
bool spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches, |
1123 |
> |
int &left, int spl_start, int spl_len, CigarOpCode spl_code, int& spl_mismatches) { |
1124 |
> |
//merge the original 'cigar' with the new insert/gap operation |
1125 |
> |
//at position spl_start and place the result into splcigar; |
1126 |
> |
//TODO: ideally this should also get and rebuild the MD string (alignment mismatches) |
1127 |
> |
|
1128 |
> |
//return value: mismatches in the insert region for INS case, |
1129 |
> |
//or number of mismatches in the anchor region |
1130 |
> |
//return -1 if somehow the hit seems bad |
1131 |
> |
|
1132 |
> |
//these offsets are relative to the beginning of alignment on reference |
1133 |
> |
int spl_ofs=spl_start-left; //relative position of splice op |
1134 |
> |
if (spl_code == FUSION_FF || spl_code == FUSION_FR || spl_code == FUSION_RF || spl_code == FUSION_RR) |
1135 |
> |
spl_ofs = abs(spl_ofs); |
1136 |
> |
int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op |
1137 |
> |
CigarOp gapop(spl_code, spl_len); //for DEL, REF_SKIP, FUSIONS |
1138 |
> |
if (spl_code==INS) |
1139 |
> |
spl_ofs_end += spl_len; |
1140 |
> |
|
1141 |
> |
int ref_ofs=0; //working offset on reference |
1142 |
> |
int read_ofs=0; //working offset on the read, relative to the leftmost aligned base |
1143 |
> |
bool xfound=false; |
1144 |
> |
//if (left<=spl_start+spl_len) { |
1145 |
> |
if (spl_ofs_end>0) { |
1146 |
> |
int prev_opcode=0; |
1147 |
> |
int prev_oplen=0; |
1148 |
> |
for (size_t c = 0 ; c < cigar.size(); ++c) |
1149 |
|
{ |
1150 |
|
int prev_read_ofs=read_ofs; |
1151 |
|
int cur_op_ofs=ref_ofs; |
1186 |
|
} |
1187 |
|
|
1188 |
|
if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) { |
1189 |
< |
if (cur_op_ofs==spl_ofs_end && spl_code!=INS) |
1190 |
< |
{ |
1191 |
< |
xfound=true; |
1192 |
< |
//we have to insert the gap here first |
1193 |
< |
cigar_add(splcigar, gapop); |
1194 |
< |
//also, check |
1195 |
< |
} |
1196 |
< |
cigar_add(splcigar, cigar[c]); |
1197 |
< |
} |
1189 |
> |
if (cur_op_ofs==spl_ofs_end) { |
1190 |
> |
if (spl_code!=INS) { |
1191 |
> |
if (cur_opcode!=INS) { |
1192 |
> |
xfound=true; |
1193 |
> |
//we have to insert the gap here first |
1194 |
> |
cigar_add(splcigar, gapop); |
1195 |
> |
//also, check |
1196 |
> |
} |
1197 |
> |
} |
1198 |
> |
} |
1199 |
> |
|
1200 |
> |
CigarOp op(cigar[c]); |
1201 |
> |
if (xfound) |
1202 |
> |
{ |
1203 |
> |
if (spl_code == FUSION_FR || spl_code == FUSION_RR) |
1204 |
> |
{ |
1205 |
> |
if (op.opcode == MATCH) |
1206 |
> |
op.opcode = mATCH; |
1207 |
> |
else if (op.opcode == INS) |
1208 |
> |
op.opcode = iNS; |
1209 |
> |
else if (op.opcode == DEL) |
1210 |
> |
op.opcode = dEL; |
1211 |
> |
else if (op.opcode == REF_SKIP) |
1212 |
> |
op.opcode = rEF_SKIP; |
1213 |
> |
} |
1214 |
> |
} |
1215 |
> |
else |
1216 |
> |
{ |
1217 |
> |
if (spl_code == FUSION_RF || spl_code == FUSION_RR) |
1218 |
> |
{ |
1219 |
> |
if (op.opcode == MATCH) |
1220 |
> |
op.opcode = mATCH; |
1221 |
> |
else if (op.opcode == INS) |
1222 |
> |
op.opcode = iNS; |
1223 |
> |
else if (op.opcode == DEL) |
1224 |
> |
op.opcode = dEL; |
1225 |
> |
else if (op.opcode == REF_SKIP) |
1226 |
> |
op.opcode = rEF_SKIP; |
1227 |
> |
} |
1228 |
> |
} |
1229 |
> |
|
1230 |
> |
cigar_add(splcigar, op); |
1231 |
> |
} |
1232 |
|
else //if (ref_ofs>spl_ofs) { |
1233 |
|
{ //op intersection |
1234 |
|
xfound=true; |
1262 |
|
CigarOp op(cigar[c]); |
1263 |
|
CigarOpCode opcode = op.opcode; |
1264 |
|
op.length=spl_ofs-cur_op_ofs; |
1265 |
< |
if (gapop.opcode == FUSION_RF || gapop.opcode == FUSION_RR) |
1265 |
> |
if (spl_code == FUSION_RF || spl_code == FUSION_RR) |
1266 |
|
{ |
1267 |
|
if (opcode == MATCH) |
1268 |
|
op.opcode = mATCH; |
1278 |
|
cigar_add(splcigar, gapop); |
1279 |
|
|
1280 |
|
op.opcode = opcode; |
1281 |
< |
if (gapop.opcode == FUSION_FR || gapop.opcode == FUSION_RR) |
1281 |
> |
if (spl_code == FUSION_FR || spl_code == FUSION_RR) |
1282 |
|
{ |
1283 |
|
if (opcode == MATCH) |
1284 |
|
op.opcode = mATCH; |
1313 |
|
//return spl_mismatches; |
1314 |
|
// } |
1315 |
|
|
1316 |
< |
return spl_mismatches; |
1316 |
> |
return xfound; |
1317 |
|
} |
1318 |
|
|
1319 |
|
bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1456 |
|
|
1457 |
|
vector<CigarOp> splcigar; |
1458 |
|
//this also updates left to the adjusted genomic coordinates |
1459 |
< |
int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, |
1460 |
< |
left, left_splice_pos+1, insertedSequence.length(), INS); |
1459 |
> |
int spl_num_mismatches=0; |
1460 |
> |
bool overlapped = spliceCigar(splcigar, samcigar, mismatches, |
1461 |
> |
left, left_splice_pos+1, insertedSequence.length(), |
1462 |
> |
INS, spl_num_mismatches); |
1463 |
> |
|
1464 |
> |
if (!overlapped) |
1465 |
> |
return false; |
1466 |
|
|
1467 |
|
if (spl_num_mismatches<0) return false; |
1468 |
|
num_mismatches-=spl_num_mismatches; |
1592 |
|
|
1593 |
|
CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP; |
1594 |
|
|
1595 |
< |
int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left, |
1596 |
< |
left_splice_pos+1, gap_len, opcode); |
1595 |
> |
int spl_num_mismatches=0; |
1596 |
> |
bool overlapped = spliceCigar(splcigar, samcigar, mismatches, left, |
1597 |
> |
left_splice_pos+1, gap_len, opcode, spl_num_mismatches); |
1598 |
> |
|
1599 |
> |
if (!overlapped) |
1600 |
> |
return false; |
1601 |
|
|
1602 |
|
if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches) |
1603 |
|
return false; |
1638 |
|
|
1639 |
|
|
1640 |
|
bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1641 |
< |
BowtieHit& bh, bool strip_slash, |
1642 |
< |
char* name_out, char* name_tags, |
1643 |
< |
char* seq, char* qual) { |
1641 |
> |
BowtieHit& bh, bool strip_slash, |
1642 |
> |
char* name_out, char* name_tags, |
1643 |
> |
char* seq, char* qual) { |
1644 |
|
if (_sam_header==NULL) |
1645 |
|
err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!"); |
1646 |
|
const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf; |
1742 |
|
num_hits = bam_aux2i(ptr); |
1743 |
|
} |
1744 |
|
|
1745 |
+ |
int alignment_score = 0; |
1746 |
+ |
bool has_alignment_score = false; |
1747 |
+ |
ptr = bam_aux_get(hit_buf, "AS"); |
1748 |
+ |
if (ptr) { |
1749 |
+ |
alignment_score = bam_aux2i(ptr); |
1750 |
+ |
has_alignment_score = true; |
1751 |
+ |
} |
1752 |
+ |
|
1753 |
|
string text_name = _sam_header->target_name[target_id]; |
1754 |
|
string text_name2 = ""; |
1755 |
|
|
1829 |
|
p_cig = t + 1; |
1830 |
|
cigar.push_back(CigarOp(opcode, length)); |
1831 |
|
|
1832 |
< |
if(opcode == INS) { |
1832 |
> |
if (opcode == INS) |
1833 |
|
num_mismatches -= length; |
1834 |
< |
} |
1782 |
< |
else if(opcode == DEL) { |
1834 |
> |
else if (opcode == DEL) |
1835 |
|
num_mismatches -= length; |
1836 |
< |
} |
1836 |
> |
|
1837 |
> |
if (!has_alignment_score) |
1838 |
> |
{ |
1839 |
> |
if (opcode == INS) |
1840 |
> |
alignment_score -= (bowtie2_read_gap_open * bowtie2_read_gap_cont * length); |
1841 |
> |
else if(opcode == DEL) |
1842 |
> |
alignment_score -= (bowtie2_ref_gap_open * bowtie2_ref_gap_cont * length); |
1843 |
> |
} |
1844 |
|
|
1845 |
|
/* |
1846 |
|
* update fusion direction. |
1872 |
|
int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT; |
1873 |
|
if (length <= 0) |
1874 |
|
{ |
1875 |
< |
fprintf (stderr, "BAM error: CIGAR op has zero length\n"); |
1875 |
> |
fprintf (stderr, "insert_id: %s - BAM error: CIGAR op has zero length\n", qname); |
1876 |
|
return false; |
1877 |
|
} |
1878 |
|
|
1909 |
|
* record to reflect this. Therefore, subtract out |
1910 |
|
* the mismatches due to in/dels |
1911 |
|
*/ |
1912 |
< |
if(opcode == INS){ |
1912 |
> |
if (opcode == INS) |
1913 |
|
num_mismatches -= length; |
1914 |
< |
} |
1856 |
< |
else if(opcode == DEL){ |
1914 |
> |
else if (opcode == DEL) |
1915 |
|
num_mismatches -= length; |
1916 |
< |
} |
1916 |
> |
|
1917 |
> |
if (!has_alignment_score) |
1918 |
> |
{ |
1919 |
> |
if (opcode == INS) |
1920 |
> |
alignment_score -= (bowtie2_read_gap_open * bowtie2_read_gap_cont * length); |
1921 |
> |
else if(opcode == DEL) |
1922 |
> |
alignment_score -= (bowtie2_ref_gap_open * bowtie2_ref_gap_cont * length); |
1923 |
> |
} |
1924 |
|
} |
1925 |
|
} |
1926 |
|
|
1927 |
+ |
if (!has_alignment_score) |
1928 |
+ |
alignment_score -= (num_mismatches * (bowtie2_max_penalty + bowtie2_min_penalty) / 2); |
1929 |
|
|
1930 |
|
string mrnm; |
1931 |
|
if (mate_target_id >= 0) { |
1932 |
|
if (mate_target_id == target_id) { |
1866 |
– |
//mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id]; |
1933 |
|
mrnm = _sam_header->target_name[mate_target_id]; |
1934 |
|
} |
1935 |
|
else { |
1870 |
– |
//fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n"); |
1936 |
|
return false; |
1937 |
|
} |
1938 |
|
} |
1965 |
|
0, |
1966 |
|
end); |
1967 |
|
} |
1968 |
+ |
|
1969 |
+ |
bh.alignment_score(alignment_score); |
1970 |
|
|
1971 |
|
return true; |
1972 |
|
} |
2168 |
|
|
2169 |
|
vector<CigarOp> splcigar; |
2170 |
|
//this also updates left to the adjusted genomic coordinates |
2171 |
< |
int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, |
2172 |
< |
left, left_splice_pos+1, insertedSequence.length(), INS); |
2171 |
> |
int spl_num_mismatches=0; |
2172 |
> |
bool overlapped=spliceCigar(splcigar, samcigar, mismatches, |
2173 |
> |
left, left_splice_pos+1, insertedSequence.length(), INS, spl_num_mismatches); |
2174 |
> |
|
2175 |
> |
if (!overlapped) |
2176 |
> |
return false; |
2177 |
|
|
2178 |
|
if (spl_num_mismatches<0) return false; |
2179 |
|
num_mismatches-=spl_num_mismatches; |
2187 |
|
num_mismatches, |
2188 |
|
0, |
2189 |
|
end); |
2190 |
+ |
|
2191 |
+ |
// daehwan - remove this |
2192 |
+ |
if (samcigar.size() > 1 && false) |
2193 |
+ |
{ |
2194 |
+ |
cout << text_name << "\t" << left << endl |
2195 |
+ |
<< print_cigar(samcigar) << endl |
2196 |
+ |
<< print_cigar(splcigar) << endl; |
2197 |
+ |
} |
2198 |
|
|
2199 |
|
return true; |
2200 |
|
} //"ins" |
2246 |
|
left_splice_pos -= 1; |
2247 |
|
else |
2248 |
|
left_splice_pos += 1; |
2249 |
< |
|
2250 |
< |
int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left, |
2251 |
< |
left_splice_pos, gap_len, opcode); |
2249 |
> |
|
2250 |
> |
int spl_num_mismatches=0; |
2251 |
> |
bool overlapped=spliceCigar(splcigar, samcigar, mismatches, left, |
2252 |
> |
left_splice_pos, gap_len, opcode, spl_num_mismatches); |
2253 |
> |
|
2254 |
> |
if (!overlapped) |
2255 |
> |
return false; |
2256 |
|
|
2257 |
|
if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches) |
2258 |
|
return false; |
2283 |
|
spl_num_mismatches, |
2284 |
|
end); |
2285 |
|
|
2286 |
+ |
// daehwan - remove this |
2287 |
+ |
if (samcigar.size() > 1 && false) |
2288 |
+ |
{ |
2289 |
+ |
cout << text_name << "\t" << left << endl |
2290 |
+ |
<< print_cigar(samcigar) << endl |
2291 |
+ |
<< print_cigar(splcigar) << endl; |
2292 |
+ |
} |
2293 |
+ |
|
2294 |
|
return true; |
2295 |
|
} |
2296 |
|
} //parse splice data |
2421 |
|
} |
2422 |
|
} |
2423 |
|
|
2333 |
– |
void print_hit(FILE* fout, |
2334 |
– |
const char* read_name, |
2335 |
– |
const BowtieHit& bh, |
2336 |
– |
const char* ref_name, |
2337 |
– |
const char* ref_name2, |
2338 |
– |
const char* sequence, |
2339 |
– |
const char* qualities) |
2340 |
– |
{ |
2341 |
– |
string seq; |
2342 |
– |
string quals; |
2343 |
– |
if (sequence) |
2344 |
– |
{ |
2345 |
– |
seq = sequence; |
2346 |
– |
quals = qualities; |
2347 |
– |
seq.resize(bh.read_len()); |
2348 |
– |
quals.resize(bh.read_len()); |
2349 |
– |
} |
2350 |
– |
else |
2351 |
– |
{ |
2352 |
– |
seq = "*"; |
2353 |
– |
} |
2354 |
– |
|
2355 |
– |
if (qualities) |
2356 |
– |
{ |
2357 |
– |
quals = qualities; |
2358 |
– |
quals.resize(bh.read_len()); |
2359 |
– |
} |
2360 |
– |
else |
2361 |
– |
{ |
2362 |
– |
quals = "*"; |
2363 |
– |
} |
2364 |
– |
|
2365 |
– |
uint32_t sam_flag = 0; |
2366 |
– |
if (bh.antisense_align()) |
2367 |
– |
{ |
2368 |
– |
sam_flag |= 0x0010; // BAM_FREVERSE |
2369 |
– |
} |
2370 |
– |
|
2371 |
– |
int left = bh.left(); |
2372 |
– |
|
2373 |
– |
uint32_t map_quality = 255; |
2374 |
– |
char cigar[256] = {0}; |
2375 |
– |
string mate_ref_name = "*"; |
2376 |
– |
uint32_t mate_pos = 0; |
2377 |
– |
uint32_t insert_size = 0; |
2378 |
– |
|
2379 |
– |
const vector<CigarOp>& bh_cigar = bh.cigar(); |
2380 |
– |
|
2381 |
– |
/* |
2382 |
– |
* In addition to calculating the cigar string, |
2383 |
– |
* we need to figure out how many in/dels are in the |
2384 |
– |
* sequence, so that we can give the correct |
2385 |
– |
* value for the NM tag |
2386 |
– |
*/ |
2387 |
– |
int indel_distance = 0; |
2388 |
– |
CigarOpCode fusion_dir = FUSION_NOTHING; |
2389 |
– |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
2390 |
– |
{ |
2391 |
– |
const CigarOp& op = bh_cigar[c]; |
2392 |
– |
|
2393 |
– |
char ibuf[64]; |
2394 |
– |
sprintf(ibuf, "%d", op.length); |
2395 |
– |
switch(op.opcode) |
2396 |
– |
{ |
2397 |
– |
case MATCH: |
2398 |
– |
case mATCH: |
2399 |
– |
strcat(cigar, ibuf); |
2400 |
– |
if (bh_cigar[c].opcode == MATCH) |
2401 |
– |
strcat(cigar, "M"); |
2402 |
– |
else |
2403 |
– |
strcat(cigar, "m"); |
2404 |
– |
break; |
2405 |
– |
case INS: |
2406 |
– |
case iNS: |
2407 |
– |
strcat(cigar, ibuf); |
2408 |
– |
if (bh_cigar[c].opcode == INS) |
2409 |
– |
strcat(cigar, "I"); |
2410 |
– |
else |
2411 |
– |
strcat(cigar, "i"); |
2412 |
– |
indel_distance += bh_cigar[c].length; |
2413 |
– |
break; |
2414 |
– |
case DEL: |
2415 |
– |
case dEL: |
2416 |
– |
strcat(cigar, ibuf); |
2417 |
– |
if (bh_cigar[c].opcode == DEL) |
2418 |
– |
strcat(cigar, "D"); |
2419 |
– |
else |
2420 |
– |
strcat(cigar, "d"); |
2421 |
– |
indel_distance += bh_cigar[c].length; |
2422 |
– |
break; |
2423 |
– |
case REF_SKIP: |
2424 |
– |
case rEF_SKIP: |
2425 |
– |
strcat(cigar, ibuf); |
2426 |
– |
if (bh_cigar[c].opcode == REF_SKIP) |
2427 |
– |
strcat(cigar, "N"); |
2428 |
– |
else |
2429 |
– |
strcat(cigar, "n"); |
2430 |
– |
break; |
2431 |
– |
case FUSION_FF: |
2432 |
– |
case FUSION_FR: |
2433 |
– |
case FUSION_RF: |
2434 |
– |
case FUSION_RR: |
2435 |
– |
fusion_dir = op.opcode; |
2436 |
– |
sprintf(ibuf, "%d", bh_cigar[c].length + 1); |
2437 |
– |
strcat(cigar, ibuf); |
2438 |
– |
strcat(cigar, "F"); |
2439 |
– |
break; |
2440 |
– |
default: |
2441 |
– |
break; |
2442 |
– |
} |
2443 |
– |
} |
2444 |
– |
|
2445 |
– |
char cigar1[256] = {0}, cigar2[256] = {0}; |
2446 |
– |
string left_seq, right_seq, left_qual, right_qual; |
2447 |
– |
int left1 = -1, left2 = -1; |
2448 |
– |
extract_partial_hits(bh, seq, quals, |
2449 |
– |
cigar1, cigar2, left_seq, right_seq, |
2450 |
– |
left_qual, right_qual, left1, left2); |
2451 |
– |
|
2452 |
– |
|
2453 |
– |
bool containsSplice = false; |
2454 |
– |
for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr) |
2455 |
– |
{ |
2456 |
– |
if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP) |
2457 |
– |
{ |
2458 |
– |
containsSplice = true; |
2459 |
– |
break; |
2460 |
– |
} |
2461 |
– |
} |
2462 |
– |
|
2463 |
– |
if (fusion_dir != FUSION_NOTHING) |
2464 |
– |
{ |
2465 |
– |
fprintf(fout, |
2466 |
– |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t", |
2467 |
– |
read_name, |
2468 |
– |
sam_flag, |
2469 |
– |
ref_name, |
2470 |
– |
left1 + 1, |
2471 |
– |
map_quality, |
2472 |
– |
cigar1, |
2473 |
– |
mate_ref_name.c_str(), |
2474 |
– |
mate_pos, |
2475 |
– |
insert_size, |
2476 |
– |
left_seq.c_str(), |
2477 |
– |
left_qual.c_str(), |
2478 |
– |
bh.edit_dist() + indel_distance); |
2479 |
– |
|
2480 |
– |
if (containsSplice) |
2481 |
– |
{ |
2482 |
– |
fprintf(fout, |
2483 |
– |
"XS:A:%c\t", |
2484 |
– |
bh.antisense_splice() ? '-' : '+'); |
2485 |
– |
} |
2486 |
– |
|
2487 |
– |
fprintf(fout, |
2488 |
– |
"FR:Z:1 %s-%s %d %s %s %s\n", |
2489 |
– |
ref_name, |
2490 |
– |
ref_name2, |
2491 |
– |
left + 1, |
2492 |
– |
cigar, |
2493 |
– |
seq.c_str(), |
2494 |
– |
quals.c_str()); |
2495 |
– |
|
2496 |
– |
fprintf(fout, |
2497 |
– |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t", |
2498 |
– |
read_name, |
2499 |
– |
sam_flag, |
2500 |
– |
ref_name2, |
2501 |
– |
left2 + 1, |
2502 |
– |
map_quality, |
2503 |
– |
cigar2, |
2504 |
– |
mate_ref_name.c_str(), |
2505 |
– |
mate_pos, |
2506 |
– |
insert_size, |
2507 |
– |
right_seq.c_str(), |
2508 |
– |
right_qual.c_str(), |
2509 |
– |
bh.edit_dist() + indel_distance); |
2510 |
– |
|
2511 |
– |
if (containsSplice) |
2512 |
– |
{ |
2513 |
– |
fprintf(fout, |
2514 |
– |
"XS:A:%c\t", |
2515 |
– |
bh.antisense_splice() ? '-' : '+'); |
2516 |
– |
} |
2517 |
– |
|
2518 |
– |
fprintf(fout, |
2519 |
– |
"FR:Z:2 %s-%s %d %s %s %s\n", |
2520 |
– |
ref_name, |
2521 |
– |
ref_name2, |
2522 |
– |
left + 1, |
2523 |
– |
cigar, |
2524 |
– |
seq.c_str(), |
2525 |
– |
quals.c_str()); |
2526 |
– |
} |
2527 |
– |
else |
2528 |
– |
{ |
2529 |
– |
fprintf(fout, |
2530 |
– |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t", |
2531 |
– |
read_name, |
2532 |
– |
sam_flag, |
2533 |
– |
ref_name, |
2534 |
– |
left + 1, |
2535 |
– |
map_quality, |
2536 |
– |
cigar, |
2537 |
– |
mate_ref_name.c_str(), |
2538 |
– |
mate_pos, |
2539 |
– |
insert_size, |
2540 |
– |
seq.c_str(), |
2541 |
– |
quals.c_str(), |
2542 |
– |
bh.edit_dist() + indel_distance); |
2543 |
– |
|
2544 |
– |
if (containsSplice) |
2545 |
– |
{ |
2546 |
– |
fprintf(fout, |
2547 |
– |
"XS:A:%c\t", |
2548 |
– |
bh.antisense_splice() ? '-' : '+'); |
2549 |
– |
} |
2550 |
– |
|
2551 |
– |
fprintf(fout, "\n"); |
2552 |
– |
} |
2553 |
– |
} |
2554 |
– |
|
2424 |
|
void print_bamhit(GBamWriter& wbam, |
2425 |
|
const char* read_name, |
2426 |
|
const BowtieHit& bh, |
2428 |
|
const char* ref_name2, |
2429 |
|
const char* sequence, |
2430 |
|
const char* qualities, |
2431 |
< |
bool from_bowtie) |
2431 |
> |
bool from_bowtie, |
2432 |
> |
const vector<string>* extra_fields) |
2433 |
|
{ |
2434 |
|
string seq; |
2435 |
|
string quals; |
2553 |
|
} |
2554 |
|
|
2555 |
|
vector<string> auxdata; |
2556 |
+ |
if (extra_fields) |
2557 |
+ |
auxdata.insert(auxdata.end(), extra_fields->begin(), extra_fields->end()); |
2558 |
+ |
|
2559 |
|
if (!sam_readgroup_id.empty()) |
2560 |
|
{ |
2561 |
|
string nm("RG:Z:"); |
2568 |
|
auxdata.push_back(nm); |
2569 |
|
|
2570 |
|
if (containsSplice) { |
2571 |
< |
nm="XS:A:"; |
2572 |
< |
nm+=(char)(bh.antisense_splice() ? '-' : '+'); |
2573 |
< |
auxdata.push_back(nm); |
2571 |
> |
// do not add more than once |
2572 |
> |
bool XS_found = false; |
2573 |
> |
for (size_t i = 0; i < auxdata.size(); ++i) |
2574 |
> |
{ |
2575 |
> |
if (auxdata[i].substr(0, 2) == "XS") |
2576 |
> |
{ |
2577 |
> |
XS_found = true; |
2578 |
> |
break; |
2579 |
> |
} |
2580 |
> |
} |
2581 |
> |
|
2582 |
> |
if (!XS_found) |
2583 |
> |
{ |
2584 |
> |
nm="XS:A:"; |
2585 |
> |
nm+=(char)(bh.antisense_splice() ? '-' : '+'); |
2586 |
> |
auxdata.push_back(nm); |
2587 |
> |
} |
2588 |
|
} |
2589 |
|
|
2590 |
|
if (fusion_dir != FUSION_NOTHING) |
2617 |
|
quals.c_str()); |
2618 |
|
auxdata.back() = XF; |
2619 |
|
|
2620 |
< |
brec = wbam.new_record(read_name, sam_flag, ref_name, left2 + 1, map_quality, |
2620 |
> |
brec = wbam.new_record(read_name, sam_flag, ref_name2, left2 + 1, map_quality, |
2621 |
|
cigar2, mate_ref_name.c_str(), mate_pos, |
2622 |
|
insert_size, right_seq.c_str(), right_qual.c_str(), &auxdata); |
2623 |
|
|
2889 |
|
} |
2890 |
|
|
2891 |
|
|
2892 |
< |
bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt) |
2892 |
> |
bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt, bool bDebug) |
2893 |
|
{ |
2894 |
|
RefSequenceTable::Sequence* ref_str1 = rt.get_seq(_ref_id); |
2895 |
|
RefSequenceTable::Sequence* ref_str2 = rt.get_seq(_ref_id2); |
2896 |
|
|
2897 |
|
if (!ref_str1 || !ref_str2) |
2898 |
|
return false; |
2899 |
+ |
|
2900 |
+ |
if (bDebug) |
2901 |
+ |
{ |
2902 |
+ |
cout << "check_editdist_consistency" << endl |
2903 |
+ |
<< "insert id: " << _insert_id << endl; |
2904 |
+ |
} |
2905 |
|
|
2906 |
|
RefSequenceTable::Sequence* ref_str = ref_str1; |
2907 |
|
|
2908 |
|
size_t pos_seq = 0; |
2909 |
|
size_t pos_ref = _left; |
2910 |
|
size_t mismatch = 0; |
2911 |
+ |
size_t N_mismatch = 0; |
2912 |
|
bool bSawFusion = false; |
2913 |
|
for (size_t i = 0; i < _cigar.size(); ++i) |
2914 |
|
{ |
2920 |
|
seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length); |
2921 |
|
for (size_t j = 0; j < cigar.length; ++j) |
2922 |
|
{ |
2923 |
< |
if (_seq[pos_seq] != ref_seq[j]) |
2923 |
> |
seqan::Dna5 ref_nt = _seq[pos_seq]; |
2924 |
> |
if (ref_nt != ref_seq[j]) |
2925 |
|
++mismatch; |
2926 |
+ |
|
2927 |
+ |
if (ref_nt == ref_seq[j] && ref_nt == 'N') |
2928 |
+ |
++N_mismatch; |
2929 |
+ |
|
2930 |
+ |
if (bDebug) |
2931 |
+ |
cout << pos_seq << "\t" << ref_nt << " vs. " << ref_seq[j] << "\t" << mismatch << endl; |
2932 |
+ |
|
2933 |
|
++pos_seq; |
2934 |
|
} |
2935 |
|
|
2939 |
|
case mATCH: |
2940 |
|
{ |
2941 |
|
seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1); |
2942 |
< |
seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>()); |
3041 |
< |
seqan::reverseInPlace(ref_seq); |
2942 |
> |
seqan::reverseComplement(ref_seq); |
2943 |
|
|
2944 |
|
for (size_t j = 0; j < cigar.length; ++j) |
2945 |
|
{ |
2946 |
< |
if (_seq[pos_seq] != ref_seq[j]) |
2946 |
> |
seqan::Dna5 ref_nt = _seq[pos_seq]; |
2947 |
> |
if (ref_nt != ref_seq[j]) |
2948 |
|
++mismatch; |
2949 |
+ |
|
2950 |
+ |
if (ref_nt == ref_seq[j] && ref_nt == 'N') |
2951 |
+ |
++N_mismatch; |
2952 |
+ |
|
2953 |
+ |
if (bDebug) |
2954 |
+ |
cout << pos_seq << "\t" << ref_nt << " vs. " << ref_seq[j] << "\t" << mismatch << endl; |
2955 |
+ |
|
2956 |
|
++pos_seq; |
2957 |
|
} |
2958 |
|
|
3001 |
|
} |
3002 |
|
} |
3003 |
|
|
3004 |
< |
return mismatch == _edit_dist; |
3004 |
> |
if (bDebug) |
3005 |
> |
cout << "mismatch (real) vs. (calculated):" << mismatch << " vs. " << (int)_edit_dist << endl; |
3006 |
> |
|
3007 |
> |
return mismatch == _edit_dist || mismatch + N_mismatch == _edit_dist; |
3008 |
> |
} |
3009 |
> |
|
3010 |
> |
void bowtie_sam_extra(const BowtieHit& bh, const RefSequenceTable& rt, vector<string>& fields) |
3011 |
> |
{ |
3012 |
> |
RefSequenceTable::Sequence* ref_str1 = rt.get_seq(bh.ref_id()); |
3013 |
> |
RefSequenceTable::Sequence* ref_str2 = rt.get_seq(bh.ref_id2()); |
3014 |
> |
|
3015 |
> |
if (!ref_str1 || !ref_str2) |
3016 |
> |
return; |
3017 |
> |
|
3018 |
> |
RefSequenceTable::Sequence* ref_str = ref_str1; |
3019 |
> |
|
3020 |
> |
size_t pos_seq = 0; |
3021 |
> |
size_t pos_mismatch = 0; |
3022 |
> |
size_t pos_ref = bh.left(); |
3023 |
> |
size_t mismatch = 0; |
3024 |
> |
size_t N_mismatch = 0; |
3025 |
> |
size_t num_gap_opens = 0; |
3026 |
> |
size_t num_gap_conts = 0; |
3027 |
> |
bool bSawFusion = false; |
3028 |
> |
|
3029 |
> |
int AS_score = 0; |
3030 |
> |
|
3031 |
> |
const vector<CigarOp>& cigars = bh.cigar(); |
3032 |
> |
const string& seq = bh.seq(); |
3033 |
> |
const string& qual = bh.qual(); |
3034 |
> |
|
3035 |
> |
string AS = "AS:i:"; |
3036 |
> |
string MD = "MD:Z:"; |
3037 |
> |
|
3038 |
> |
for (size_t i = 0; i < cigars.size(); ++i) |
3039 |
> |
{ |
3040 |
> |
CigarOp cigar = cigars[i]; |
3041 |
> |
switch(cigar.opcode) |
3042 |
> |
{ |
3043 |
> |
case MATCH: |
3044 |
> |
case mATCH: |
3045 |
> |
{ |
3046 |
> |
seqan::Dna5String ref_seq; |
3047 |
> |
if (cigar.opcode == MATCH) |
3048 |
> |
{ |
3049 |
> |
ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length); |
3050 |
> |
pos_ref += cigar.length; |
3051 |
> |
} |
3052 |
> |
else |
3053 |
> |
{ |
3054 |
> |
ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1); |
3055 |
> |
seqan::reverseComplement(ref_seq); |
3056 |
> |
pos_ref -= cigar.length; |
3057 |
> |
} |
3058 |
> |
|
3059 |
> |
for (size_t j = 0; j < cigar.length; ++j) |
3060 |
> |
{ |
3061 |
> |
seqan::Dna5 ref_nt = ref_seq[j]; |
3062 |
> |
if (seq[pos_seq] != ref_nt) |
3063 |
> |
{ |
3064 |
> |
++mismatch; |
3065 |
> |
|
3066 |
> |
if (pos_seq < qual.length()) |
3067 |
> |
{ |
3068 |
> |
if (seq[pos_seq] == 'N' || ref_nt == 'N') |
3069 |
> |
{ |
3070 |
> |
AS_score -= (int)bowtie2_penalty_for_N; |
3071 |
> |
} |
3072 |
> |
else |
3073 |
> |
{ |
3074 |
> |
float penalty = bowtie2_min_penalty + (bowtie2_max_penalty - bowtie2_min_penalty) * min((int)(qual[pos_seq] - '!'), 40) / 40.0; |
3075 |
> |
AS_score -= (int)penalty; |
3076 |
> |
} |
3077 |
> |
} |
3078 |
> |
|
3079 |
> |
str_appendInt(MD, (int)pos_mismatch); |
3080 |
> |
MD.push_back((char)ref_nt); |
3081 |
> |
pos_mismatch = 0; |
3082 |
> |
} |
3083 |
> |
else |
3084 |
> |
{ |
3085 |
> |
if (ref_nt == 'N') |
3086 |
> |
{ |
3087 |
> |
++N_mismatch; |
3088 |
> |
AS_score -= (int)bowtie2_penalty_for_N; |
3089 |
> |
} |
3090 |
> |
|
3091 |
> |
++pos_mismatch; |
3092 |
> |
} |
3093 |
> |
|
3094 |
> |
++pos_seq; |
3095 |
> |
} |
3096 |
> |
} |
3097 |
> |
break; |
3098 |
> |
|
3099 |
> |
case INS: |
3100 |
> |
case iNS: |
3101 |
> |
{ |
3102 |
> |
pos_seq += cigar.length; |
3103 |
> |
|
3104 |
> |
AS_score -= bowtie2_read_gap_open; |
3105 |
> |
AS_score -= (int)(bowtie2_read_gap_cont * cigar.length); |
3106 |
> |
|
3107 |
> |
num_gap_opens += 1; |
3108 |
> |
num_gap_conts += cigar.length; |
3109 |
> |
} |
3110 |
> |
break; |
3111 |
> |
|
3112 |
> |
case DEL: |
3113 |
> |
case dEL: |
3114 |
> |
{ |
3115 |
> |
AS_score -= bowtie2_ref_gap_open; |
3116 |
> |
AS_score -= (int)(bowtie2_ref_gap_cont * cigar.length); |
3117 |
> |
|
3118 |
> |
num_gap_opens += 1; |
3119 |
> |
num_gap_conts += cigar.length; |
3120 |
> |
|
3121 |
> |
seqan::Dna5String ref_seq; |
3122 |
> |
if (cigar.opcode == DEL) |
3123 |
> |
{ |
3124 |
> |
ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length); |
3125 |
> |
pos_ref += cigar.length; |
3126 |
> |
} |
3127 |
> |
else |
3128 |
> |
{ |
3129 |
> |
ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1); |
3130 |
> |
seqan::reverseComplement(ref_seq); |
3131 |
> |
pos_ref -= cigar.length; |
3132 |
> |
} |
3133 |
> |
|
3134 |
> |
str_appendInt(MD, (int)pos_mismatch); |
3135 |
> |
MD.push_back('^'); |
3136 |
> |
for (size_t k = 0; k < length(ref_seq); ++k) |
3137 |
> |
MD.push_back((char)ref_seq[k]); |
3138 |
> |
|
3139 |
> |
pos_mismatch = 0; |
3140 |
> |
} |
3141 |
> |
break; |
3142 |
> |
|
3143 |
> |
case REF_SKIP: |
3144 |
> |
case rEF_SKIP: |
3145 |
> |
{ |
3146 |
> |
if (cigar.opcode == REF_SKIP) |
3147 |
> |
pos_ref += cigar.length; |
3148 |
> |
else |
3149 |
> |
pos_ref -= cigar.length; |
3150 |
> |
} |
3151 |
> |
break; |
3152 |
> |
|
3153 |
> |
case FUSION_FF: |
3154 |
> |
case FUSION_FR: |
3155 |
> |
case FUSION_RF: |
3156 |
> |
case FUSION_RR: |
3157 |
> |
{ |
3158 |
> |
// We don't allow a read spans more than two chromosomes. |
3159 |
> |
if (bSawFusion) |
3160 |
> |
return; |
3161 |
> |
|
3162 |
> |
ref_str = ref_str2; |
3163 |
> |
pos_ref = cigar.length; |
3164 |
> |
|
3165 |
> |
bSawFusion = true; |
3166 |
> |
} |
3167 |
> |
break; |
3168 |
> |
|
3169 |
> |
default: |
3170 |
> |
break; |
3171 |
> |
} |
3172 |
> |
} |
3173 |
> |
|
3174 |
> |
str_appendInt(AS, AS_score); |
3175 |
> |
fields.push_back(AS); |
3176 |
> |
|
3177 |
> |
string XM = "XM:i:"; |
3178 |
> |
str_appendInt(XM, (int)mismatch); |
3179 |
> |
fields.push_back(XM); |
3180 |
> |
|
3181 |
> |
string XO = "XO:i:"; |
3182 |
> |
str_appendInt(XO, (int)num_gap_opens); |
3183 |
> |
fields.push_back(XO); |
3184 |
> |
|
3185 |
> |
string XG = "XG:i:"; |
3186 |
> |
str_appendInt(XG, (int)num_gap_conts); |
3187 |
> |
fields.push_back(XG); |
3188 |
> |
|
3189 |
> |
str_appendInt(MD, (int)pos_mismatch); |
3190 |
> |
fields.push_back(MD); |
3191 |
|
} |