18 |
|
#include <vector> |
19 |
|
#include <cmath> |
20 |
|
|
21 |
+ |
#include <seqan/sequence.h> |
22 |
+ |
#include <seqan/find.h> |
23 |
+ |
#include <seqan/file.h> |
24 |
+ |
#include <seqan/modifier.h> |
25 |
+ |
|
26 |
|
#include "common.h" |
27 |
|
#include "bwt_map.h" |
28 |
|
#include "tokenize.h" |
114 |
|
else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file)); |
115 |
|
} |
116 |
|
|
117 |
+ |
void LineHitFactory::seek(HitStream& hs, int64_t offset) |
118 |
+ |
{ |
119 |
+ |
// daehwan - implement this later |
120 |
+ |
if (hs._fzpipe != NULL) { |
121 |
+ |
hs._fzpipe->seek(offset); |
122 |
+ |
hs._hit_file=hs._fzpipe->file; |
123 |
+ |
} |
124 |
+ |
// else if (hs._hit_file) ::seek((FILE*)(hs._hit_file)); |
125 |
+ |
} |
126 |
+ |
|
127 |
|
bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) { |
128 |
|
FILE* f=(FILE *)(hs._hit_file); |
129 |
|
bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL); |
197 |
|
this->openStream(hs); |
198 |
|
} |
199 |
|
|
200 |
+ |
void BAMHitFactory::seek(HitStream& hs, int64_t offset) |
201 |
+ |
{ |
202 |
+ |
if (hs._hit_file) { |
203 |
+ |
bgzf_seek(((samfile_t*)hs._hit_file)->x.bam, offset, SEEK_SET); |
204 |
+ |
} |
205 |
+ |
} |
206 |
+ |
|
207 |
|
string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) { |
208 |
|
const bam1_t* bamrec=(const bam1_t*)hit_buf; |
209 |
|
char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec); |
237 |
|
|
238 |
|
BowtieHit HitFactory::create_hit(const string& insert_name, |
239 |
|
const string& ref_name, |
240 |
+ |
const string& ref_name2, |
241 |
|
int left, |
242 |
|
const vector<CigarOp>& cigar, |
243 |
|
bool antisense_aln, |
247 |
|
bool end) |
248 |
|
{ |
249 |
|
uint64_t insert_id = _insert_table.get_id(insert_name); |
250 |
+ |
|
251 |
|
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
252 |
+ |
uint32_t reference_id2 = reference_id; |
253 |
+ |
|
254 |
+ |
if (ref_name2.length() > 0) |
255 |
+ |
reference_id2 = _ref_table.get_id(ref_name2, NULL, 0); |
256 |
|
|
257 |
|
return BowtieHit(reference_id, |
258 |
+ |
reference_id2, |
259 |
|
insert_id, |
260 |
|
left, |
261 |
|
cigar, |
278 |
|
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
279 |
|
|
280 |
|
return BowtieHit(reference_id, |
281 |
+ |
reference_id, |
282 |
|
insert_id, |
283 |
|
left, |
284 |
|
read_len, |
403 |
|
|
404 |
|
int anchor_mismatch = 0; |
405 |
|
|
406 |
+ |
|
407 |
+ |
void parseSegReadName(char* name, char*& name_tags, bool strip_slash, |
408 |
+ |
bool &end, unsigned int &seg_offset, unsigned int& seg_num, |
409 |
+ |
unsigned int & num_segs) { |
410 |
+ |
char* pipe = strrchr(name, '|'); |
411 |
+ |
if (pipe) |
412 |
+ |
{ |
413 |
+ |
if (name_tags) |
414 |
+ |
strcpy(name_tags, pipe); |
415 |
+ |
|
416 |
+ |
char* tag_buf = pipe + 1; |
417 |
+ |
if (strchr(tag_buf, ':')) |
418 |
+ |
{ |
419 |
+ |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
420 |
+ |
if (seg_num + 1 == num_segs) |
421 |
+ |
end = true; |
422 |
+ |
else |
423 |
+ |
end = false; |
424 |
+ |
} |
425 |
+ |
|
426 |
+ |
*pipe = 0; |
427 |
+ |
} |
428 |
+ |
|
429 |
+ |
// Stripping the slash and number following it gives the insert name |
430 |
+ |
char* slash = strrchr(name, '/'); |
431 |
+ |
if (strip_slash && slash) |
432 |
+ |
*slash = 0; |
433 |
+ |
} |
434 |
+ |
|
435 |
+ |
|
436 |
|
bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
437 |
|
BowtieHit& bh, |
438 |
|
bool strip_slash, |
486 |
|
unsigned int num_segs = 0; |
487 |
|
|
488 |
|
// Copy the tag out of the name field before we might wipe it out |
489 |
< |
char* pipe = strrchr(name, '|'); |
430 |
< |
if (pipe) |
431 |
< |
{ |
432 |
< |
if (name_tags) |
433 |
< |
strcpy(name_tags, pipe); |
434 |
< |
|
435 |
< |
char* tag_buf = pipe + 1; |
436 |
< |
if (strchr(tag_buf, ':')) |
437 |
< |
{ |
438 |
< |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
439 |
< |
if (seg_num + 1 == num_segs) |
440 |
< |
end = true; |
441 |
< |
else |
442 |
< |
end = false; |
443 |
< |
} |
489 |
> |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
490 |
|
|
445 |
– |
*pipe = 0; |
446 |
– |
} |
447 |
– |
// Stripping the slash and number following it gives the insert name |
448 |
– |
char* slash = strrchr(name, '/'); |
449 |
– |
if (strip_slash && slash) |
450 |
– |
*slash = 0; |
451 |
– |
|
452 |
– |
//int read_len = strlen(sequence); |
453 |
– |
|
491 |
|
// Add this alignment to the table of hits for this half of the |
492 |
|
// Bowtie map |
493 |
|
|
537 |
|
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
538 |
|
uint32_t right = left + spliced_read_len - 1; |
539 |
|
|
540 |
+ |
|
541 |
|
/* |
542 |
|
* The 0-based position of the last genomic sequence before the insertion |
543 |
|
*/ |
650 |
|
*/ |
651 |
|
bh = create_hit(name, |
652 |
|
contig, |
653 |
+ |
"", |
654 |
|
left, |
655 |
|
cigar, |
656 |
|
orientation == '-', |
663 |
|
|
664 |
|
else |
665 |
|
{ |
666 |
< |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
666 |
> |
const string& junction_type = toks[num_extra_toks + junction_type_field]; |
667 |
> |
string junction_strand = toks[num_extra_toks + strand_field]; |
668 |
> |
|
669 |
|
int spliced_read_len = strlen(seq_str); |
670 |
< |
int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1; |
670 |
> |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
671 |
> |
int8_t left_splice_pos = atoi(splice_toks[0].c_str()); |
672 |
> |
if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr")) |
673 |
> |
{ |
674 |
> |
left += text_offset; |
675 |
> |
left_splice_pos = left_splice_pos - left + 1; |
676 |
> |
} |
677 |
> |
else |
678 |
> |
{ |
679 |
> |
left -= text_offset; |
680 |
> |
left_splice_pos = left - left_splice_pos + 1; |
681 |
> |
} |
682 |
> |
|
683 |
|
if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len; |
684 |
|
int8_t right_splice_pos = spliced_read_len - left_splice_pos; |
685 |
< |
|
685 |
> |
|
686 |
> |
int gap_len = 0; |
687 |
> |
if (junction_type == "fus") |
688 |
> |
gap_len = atoi(splice_toks[1].c_str()); |
689 |
> |
else |
690 |
> |
gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1; |
691 |
> |
|
692 |
|
if (right_splice_pos <= 0 || left_splice_pos <= 0) |
693 |
|
return false; |
694 |
|
|
703 |
|
if (right_splice_pos + seg_offset < _anchor_length) |
704 |
|
return false; |
705 |
|
} |
706 |
< |
//uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos; |
707 |
< |
//atoi(toks[right_window_edge_field].c_str()); |
649 |
< |
int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1; |
650 |
< |
|
651 |
< |
string junction_strand = toks[num_extra_toks + strand_field]; |
652 |
< |
if (!(junction_strand == "rev" || junction_strand == "fwd")|| |
706 |
> |
|
707 |
> |
if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")|| |
708 |
|
!(orientation == '-' || orientation == '+')) |
709 |
|
{ |
710 |
|
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
712 |
|
// junction_strand.c_str(), orientation); |
713 |
|
return false; |
714 |
|
} |
715 |
< |
|
661 |
< |
//vector<string> mismatch_toks; |
715 |
> |
|
716 |
|
char* pch = strtok (mismatches,","); |
717 |
|
int mismatches_in_anchor = 0; |
718 |
|
unsigned char num_mismatches = 0; |
728 |
|
(orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len) |
729 |
|
mismatches_in_anchor++; |
730 |
|
} |
677 |
– |
//mismatch_toks.push_back(pch); |
731 |
|
pch = strtok (NULL, ","); |
732 |
|
} |
733 |
|
|
734 |
|
// FIXME: we probably should exclude these hits somewhere, but this |
735 |
|
// isn't the right place |
736 |
|
vector<CigarOp> cigar; |
737 |
< |
cigar.push_back(CigarOp(MATCH, left_splice_pos)); |
738 |
< |
if(toks[num_extra_toks + junction_type_field] == "del"){ |
737 |
> |
if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr")) |
738 |
> |
cigar.push_back(CigarOp(MATCH, left_splice_pos)); |
739 |
> |
else |
740 |
> |
cigar.push_back(CigarOp(mATCH, left_splice_pos)); |
741 |
> |
|
742 |
> |
if(junction_type == "del") |
743 |
|
cigar.push_back(CigarOp(DEL, gap_len)); |
744 |
< |
}else{ |
744 |
> |
else if(junction_type == "fus") |
745 |
> |
{ |
746 |
> |
if (junction_strand == "ff") |
747 |
> |
cigar.push_back(CigarOp(FUSION_FF, gap_len)); |
748 |
> |
else if (junction_strand == "fr") |
749 |
> |
cigar.push_back(CigarOp(FUSION_FR, gap_len)); |
750 |
> |
else if (junction_strand == "rf") |
751 |
> |
cigar.push_back(CigarOp(FUSION_RF, gap_len)); |
752 |
> |
else |
753 |
> |
cigar.push_back(CigarOp(FUSION_RR, gap_len)); |
754 |
> |
} |
755 |
> |
else |
756 |
|
cigar.push_back(CigarOp(REF_SKIP, gap_len)); |
757 |
< |
} |
758 |
< |
cigar.push_back(CigarOp(MATCH, right_splice_pos)); |
759 |
< |
|
757 |
> |
|
758 |
> |
if (junction_type != "fus" || (junction_strand != "fr" && junction_strand != "rr")) |
759 |
> |
cigar.push_back(CigarOp(MATCH, right_splice_pos)); |
760 |
> |
else |
761 |
> |
cigar.push_back(CigarOp(mATCH, right_splice_pos)); |
762 |
> |
|
763 |
> |
string contig2 = ""; |
764 |
> |
if (junction_type == "fus") |
765 |
> |
{ |
766 |
> |
vector<string> contigs; |
767 |
> |
tokenize(contig, "-", contigs); |
768 |
> |
if (contigs.size() != 2) |
769 |
> |
return false; |
770 |
> |
|
771 |
> |
contig = contigs[0]; |
772 |
> |
contig2 = contigs[1]; |
773 |
> |
|
774 |
> |
if (junction_strand == "rf" || junction_strand == "rr") |
775 |
> |
orientation = (orientation == '+' ? '-' : '+'); |
776 |
> |
} |
777 |
> |
|
778 |
|
bh = create_hit(name, |
779 |
|
contig, |
780 |
< |
left, |
780 |
> |
contig2, |
781 |
> |
left, |
782 |
|
cigar, |
783 |
|
orientation == '-', |
784 |
|
junction_strand == "rev", |
799 |
|
return false; |
800 |
|
} |
801 |
|
|
802 |
+ |
|
803 |
+ |
int parseCigar(vector<CigarOp>& cigar, const char* cigar_str, |
804 |
+ |
bool &spliced_alignment) { |
805 |
+ |
const char* p_cig = cigar_str; |
806 |
+ |
int refspan=0; //alignment span on reference sequence |
807 |
+ |
|
808 |
+ |
while (*p_cig) |
809 |
+ |
{ |
810 |
+ |
char* t; |
811 |
+ |
int op_len = (int)strtol(p_cig, &t, 10); |
812 |
+ |
if (op_len <= 0) |
813 |
+ |
{ |
814 |
+ |
fprintf (stderr, "Error: CIGAR op has zero length\n"); |
815 |
+ |
return 0; |
816 |
+ |
} |
817 |
+ |
char op_char = toupper(*t); |
818 |
+ |
CigarOpCode opcode; |
819 |
+ |
switch (op_char) { |
820 |
+ |
case '=': |
821 |
+ |
case 'X': |
822 |
+ |
case 'M': opcode = MATCH; |
823 |
+ |
refspan+=op_len; |
824 |
+ |
break; |
825 |
+ |
case 'I': opcode = INS; |
826 |
+ |
break; |
827 |
+ |
case 'D': opcode = DEL; |
828 |
+ |
refspan+=op_len; |
829 |
+ |
break; |
830 |
+ |
case 'N': if (op_len > max_report_intron_length) |
831 |
+ |
return 0; |
832 |
+ |
opcode = REF_SKIP; |
833 |
+ |
spliced_alignment = true; |
834 |
+ |
refspan+=op_len; |
835 |
+ |
break; |
836 |
+ |
case 'S': opcode = SOFT_CLIP; |
837 |
+ |
break; |
838 |
+ |
case 'H': opcode = HARD_CLIP; |
839 |
+ |
break; |
840 |
+ |
case 'P': opcode = PAD; |
841 |
+ |
break; |
842 |
+ |
default: fprintf (stderr, "Error: invalid CIGAR operation\n"); |
843 |
+ |
return 0; |
844 |
+ |
} |
845 |
+ |
p_cig = t + 1; |
846 |
+ |
cigar.push_back(CigarOp(opcode, op_len)); |
847 |
+ |
} //while cigar codes |
848 |
+ |
if (*p_cig) { |
849 |
+ |
fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig); |
850 |
+ |
return 0; |
851 |
+ |
} |
852 |
+ |
return refspan; |
853 |
+ |
} |
854 |
+ |
|
855 |
+ |
int getBAMmismatches(const bam1_t* buf, vector<CigarOp>& cigar, |
856 |
+ |
vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) { |
857 |
+ |
int gspan=0;//genomic span of the alignment |
858 |
+ |
sam_nm=0; |
859 |
+ |
int num_mismatches=0; |
860 |
+ |
|
861 |
+ |
uint8_t* ptr = bam_aux_get(buf, "XS"); |
862 |
+ |
if (ptr) { |
863 |
+ |
char src_strand_char = bam_aux2A(ptr); |
864 |
+ |
if (src_strand_char == '-') |
865 |
+ |
antisense_splice = true; |
866 |
+ |
} |
867 |
+ |
|
868 |
+ |
ptr = bam_aux_get(buf, "MD"); |
869 |
+ |
if (ptr) { |
870 |
+ |
const char* p = bam_aux2Z(ptr); |
871 |
+ |
int bi=0; //base offset position in the read |
872 |
+ |
while (*p != 0) { |
873 |
+ |
if (isdigit(*p)) { |
874 |
+ |
int v=atoi(p); |
875 |
+ |
do { p++; } while (isdigit(*p)); |
876 |
+ |
bi+=v; |
877 |
+ |
} |
878 |
+ |
while (isalpha(*p)) { |
879 |
+ |
p++; |
880 |
+ |
num_mismatches++; |
881 |
+ |
//mismatches.push_back(bi); |
882 |
+ |
mismatches[bi]=true; |
883 |
+ |
bi++; |
884 |
+ |
} |
885 |
+ |
if (*p=='^') { //reference deletion |
886 |
+ |
p++; |
887 |
+ |
while (isalpha(*p)) { //insert read bases |
888 |
+ |
p++; bi++; |
889 |
+ |
} |
890 |
+ |
} |
891 |
+ |
} |
892 |
+ |
} |
893 |
+ |
|
894 |
+ |
/* By convention,the NM field of the SAM record |
895 |
+ |
* counts an insertion or deletion. I dont' think |
896 |
+ |
* we want the mismatch count in the BowtieHit |
897 |
+ |
* record to reflect this. Therefore, subtract out |
898 |
+ |
* the mismatches due to in/dels |
899 |
+ |
*/ |
900 |
+ |
for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){ |
901 |
+ |
switch (itr->opcode) |
902 |
+ |
{ |
903 |
+ |
case MATCH: |
904 |
+ |
case REF_SKIP: |
905 |
+ |
case PAD: |
906 |
+ |
gspan += itr->length; |
907 |
+ |
break; |
908 |
+ |
case DEL: |
909 |
+ |
gspan += itr->length; |
910 |
+ |
sam_nm -= itr->length; |
911 |
+ |
break; |
912 |
+ |
case INS: |
913 |
+ |
sam_nm -= itr->length; |
914 |
+ |
break; |
915 |
+ |
default: |
916 |
+ |
break; |
917 |
+ |
} |
918 |
+ |
} |
919 |
+ |
return num_mismatches; |
920 |
+ |
} |
921 |
+ |
|
922 |
+ |
int getSAMmismatches(char* &buf, vector<CigarOp>& cigar, |
923 |
+ |
vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) { |
924 |
+ |
int gspan=0;//genomic span of the alignment |
925 |
+ |
const char* tag_buf = buf; |
926 |
+ |
sam_nm=0; |
927 |
+ |
int num_mismatches=0; |
928 |
+ |
while((tag_buf = get_token((char**)&buf,"\t"))) |
929 |
+ |
{ |
930 |
+ |
vector<string> tuple_fields; |
931 |
+ |
tokenize(tag_buf,":", tuple_fields); |
932 |
+ |
if (tuple_fields.size() == 3) |
933 |
+ |
{ |
934 |
+ |
if (tuple_fields[0] == "XS") |
935 |
+ |
{ |
936 |
+ |
if (tuple_fields[2] == "-") |
937 |
+ |
antisense_splice = true; |
938 |
+ |
} |
939 |
+ |
else if (tuple_fields[0] == "NM") |
940 |
+ |
{ |
941 |
+ |
sam_nm = atoi(tuple_fields[2].c_str()); |
942 |
+ |
} |
943 |
+ |
else if (tuple_fields[0] == "NS") |
944 |
+ |
{ |
945 |
+ |
//ignored for now |
946 |
+ |
} |
947 |
+ |
else if (tuple_fields[0] == "MD") |
948 |
+ |
{ |
949 |
+ |
const char* p=tuple_fields[2].c_str(); |
950 |
+ |
int bi=0; //base offset position in the read |
951 |
+ |
while (*p != 0) { |
952 |
+ |
if (isdigit(*p)) { |
953 |
+ |
int v=atoi(p); |
954 |
+ |
do { p++; } while (isdigit(*p)); |
955 |
+ |
bi+=v; |
956 |
+ |
} |
957 |
+ |
while (isalpha(*p)) { |
958 |
+ |
p++; |
959 |
+ |
num_mismatches++; |
960 |
+ |
//mismatches.push_back(bi); |
961 |
+ |
mismatches[bi]=true; |
962 |
+ |
bi++; |
963 |
+ |
} |
964 |
+ |
if (*p=='^') { //reference deletion |
965 |
+ |
p++; |
966 |
+ |
while (isalpha(*p)) { //insert read bases |
967 |
+ |
p++; bi++; |
968 |
+ |
} |
969 |
+ |
} |
970 |
+ |
} |
971 |
+ |
} |
972 |
+ |
//else |
973 |
+ |
//{ |
974 |
+ |
//fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str()); |
975 |
+ |
//return false; |
976 |
+ |
//} |
977 |
+ |
} |
978 |
+ |
} |
979 |
+ |
/* By convention,the NM field of the SAM record |
980 |
+ |
* counts an insertion or deletion. I dont' think |
981 |
+ |
* we want the mismatch count in the BowtieHit |
982 |
+ |
* record to reflect this. Therefore, subtract out |
983 |
+ |
* the mismatches due to in/dels |
984 |
+ |
*/ |
985 |
+ |
for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){ |
986 |
+ |
switch (itr->opcode) |
987 |
+ |
{ |
988 |
+ |
case MATCH: |
989 |
+ |
case REF_SKIP: |
990 |
+ |
case PAD: |
991 |
+ |
gspan += itr->length; |
992 |
+ |
break; |
993 |
+ |
case DEL: |
994 |
+ |
gspan += itr->length; |
995 |
+ |
sam_nm -= itr->length; |
996 |
+ |
break; |
997 |
+ |
case INS: |
998 |
+ |
sam_nm -= itr->length; |
999 |
+ |
break; |
1000 |
+ |
default: |
1001 |
+ |
break; |
1002 |
+ |
} |
1003 |
+ |
} |
1004 |
+ |
return num_mismatches; |
1005 |
+ |
} |
1006 |
+ |
|
1007 |
|
bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1008 |
|
BowtieHit& bh, |
1009 |
|
bool strip_slash, |
1019 |
|
// Are we still in the header region? |
1020 |
|
if (bwt_buf[0] == '@') |
1021 |
|
return false; |
1022 |
< |
|
1022 |
> |
|
1023 |
|
char* buf = bwt_buf; |
1024 |
|
char* name = get_token((char**)&buf,"\t"); |
1025 |
|
char* sam_flag_str = get_token((char**)&buf,"\t"); |
1034 |
|
const char* seq_str = get_token((char**)&buf,"\t"); |
1035 |
|
if (seq) |
1036 |
|
strcpy(seq, seq_str); |
745 |
– |
|
1037 |
|
const char* qual_str = get_token((char**)&buf,"\t"); |
1038 |
|
if (qual) |
1039 |
|
strcpy(qual, qual_str); |
1052 |
|
{ |
1053 |
|
// truncated or malformed SAM record |
1054 |
|
return false; |
1055 |
< |
} |
1056 |
< |
|
766 |
< |
|
1055 |
> |
} |
1056 |
> |
|
1057 |
|
int sam_flag = atoi(sam_flag_str); |
1058 |
+ |
string ref_name = text_name, ref_name2 = ""; |
1059 |
|
int text_offset = atoi(text_offset_str); |
1060 |
|
|
1061 |
|
bool end = true; |
1064 |
|
unsigned int num_segs = 0; |
1065 |
|
|
1066 |
|
// Copy the tag out of the name field before we might wipe it out |
1067 |
< |
char* pipe = strrchr(name, '|'); |
777 |
< |
if (pipe) |
778 |
< |
{ |
779 |
< |
if (name_tags) |
780 |
< |
strcpy(name_tags, pipe); |
781 |
< |
|
782 |
< |
char* tag_buf = pipe + 1; |
783 |
< |
if (strchr(tag_buf, ':')) |
784 |
< |
{ |
785 |
< |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
786 |
< |
if (seg_num + 1 == num_segs) |
787 |
< |
end = true; |
788 |
< |
else |
789 |
< |
end = false; |
790 |
< |
} |
791 |
< |
|
792 |
< |
*pipe = 0; |
793 |
< |
} |
1067 |
> |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
1068 |
|
|
795 |
– |
// Stripping the slash and number following it gives the insert name |
796 |
– |
char* slash = strrchr(name, '/'); |
797 |
– |
if (strip_slash && slash) |
798 |
– |
*slash = 0; |
799 |
– |
|
800 |
– |
char* p_cig = cigar_str; |
801 |
– |
//int len = strlen(sequence); |
1069 |
|
vector<CigarOp> cigar; |
1070 |
|
bool spliced_alignment = false; |
1071 |
< |
// Mostly pilfered direct from the SAM tools: |
1072 |
< |
while (*p_cig) |
1073 |
< |
{ |
1074 |
< |
char* t; |
808 |
< |
int length = (int)strtol(p_cig, &t, 10); |
809 |
< |
if (length <= 0) |
810 |
< |
{ |
811 |
< |
//fprintf (stderr, "CIGAR op has zero length\n"); |
812 |
< |
return false; |
813 |
< |
} |
814 |
< |
char op_char = toupper(*t); |
815 |
< |
CigarOpCode opcode; |
816 |
< |
if (op_char == 'M') opcode = MATCH; |
817 |
< |
else if (op_char == 'I') opcode = INS; |
818 |
< |
else if (op_char == 'D') opcode = DEL; |
819 |
< |
else if (op_char == 'N') |
820 |
< |
{ |
821 |
< |
if (length > max_report_intron_length) |
822 |
< |
return false; |
823 |
< |
opcode = REF_SKIP; |
824 |
< |
spliced_alignment = true; |
825 |
< |
} |
826 |
< |
else if (op_char == 'S') opcode = SOFT_CLIP; |
827 |
< |
else if (op_char == 'H') opcode = HARD_CLIP; |
828 |
< |
else if (op_char == 'P') opcode = PAD; |
829 |
< |
else |
830 |
< |
{ |
831 |
< |
fprintf (stderr, "invalid CIGAR operation\n"); |
832 |
< |
return false; |
833 |
< |
} |
834 |
< |
p_cig = t + 1; |
835 |
< |
//i += length; |
836 |
< |
cigar.push_back(CigarOp(opcode, length)); |
837 |
< |
} |
838 |
< |
if (*p_cig) |
839 |
< |
{ |
840 |
< |
fprintf (stderr, "unmatched CIGAR operation\n"); |
841 |
< |
return false; |
842 |
< |
} |
843 |
< |
|
1071 |
> |
|
1072 |
> |
int refspan=parseCigar(cigar, cigar_str, spliced_alignment); |
1073 |
> |
if (refspan==0) |
1074 |
> |
return false; |
1075 |
|
//vector<string> attributes; |
1076 |
|
//tokenize(tag_buf, " \t",attributes); |
1077 |
< |
|
1077 |
> |
|
1078 |
|
bool antisense_splice = false; |
1079 |
< |
unsigned char num_mismatches = 0; |
1080 |
< |
unsigned char num_splice_anchor_mismatches = 0; |
1081 |
< |
const char* tag_buf = buf; |
1082 |
< |
|
1083 |
< |
// while((tag_buf = get_token((char**)&buf,"\t"))) |
853 |
< |
// { |
854 |
< |
// vector<string> tuple_fields; |
855 |
< |
// tokenize(tag_buf,":", tuple_fields); |
856 |
< |
// if (tuple_fields.size() == 3) |
857 |
< |
// { |
858 |
< |
// if (tuple_fields[0] == "XS") |
859 |
< |
// { |
860 |
< |
// if (tuple_fields[2] == "-") |
861 |
< |
// antisense_splice = true; |
862 |
< |
// } |
863 |
< |
// else if (tuple_fields[0] == "NM") |
864 |
< |
// { |
865 |
< |
// num_mismatches = atoi(tuple_fields[2].c_str()); |
866 |
< |
// } |
867 |
< |
// else if (tuple_fields[0] == "NS") |
868 |
< |
// { |
869 |
< |
// num_splice_anchor_mismatches = atoi(tuple_fields[2].c_str()); |
870 |
< |
// } |
871 |
< |
// else |
872 |
< |
// { |
873 |
< |
// fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str()); |
874 |
< |
// return false; |
875 |
< |
// } |
876 |
< |
// } |
877 |
< |
// } |
878 |
< |
|
879 |
< |
while((tag_buf = get_token((char**)&buf,"\t"))) |
880 |
< |
{ |
881 |
< |
vector<string> tuple_fields; |
882 |
< |
tokenize(tag_buf,":", tuple_fields); |
883 |
< |
if (tuple_fields.size() == 3) |
884 |
< |
{ |
885 |
< |
if (tuple_fields[0] == "XS") |
886 |
< |
{ |
887 |
< |
if (tuple_fields[2] == "-") |
888 |
< |
antisense_splice = true; |
889 |
< |
} |
890 |
< |
else if (tuple_fields[0] == "NM") |
891 |
< |
{ |
892 |
< |
num_mismatches = atoi(tuple_fields[2].c_str()); |
893 |
< |
} |
894 |
< |
else if (tuple_fields[0] == "NS") |
895 |
< |
{ |
896 |
< |
//ignored for now |
897 |
< |
} |
898 |
< |
else |
899 |
< |
{ |
900 |
< |
//fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str()); |
901 |
< |
//return false; |
902 |
< |
} |
903 |
< |
} |
904 |
< |
} |
905 |
< |
|
1079 |
> |
int sam_nm = 0; //the value of the NM tag (edit distance) |
1080 |
> |
//int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read) |
1081 |
> |
vector<bool> mismatches; |
1082 |
> |
mismatches.resize(strlen(seq_str), false); |
1083 |
> |
int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice); |
1084 |
|
|
907 |
– |
/* |
908 |
– |
* By convention,the NM field of the SAM record |
909 |
– |
* counts an insertion or deletion. I dont' think |
910 |
– |
* we want the mismatch count in the BowtieHit |
911 |
– |
* record to reflect this. Therefore, subtract out |
912 |
– |
* the mismatches due to in/dels |
913 |
– |
*/ |
914 |
– |
for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){ |
915 |
– |
if(itr->opcode == INS){ |
916 |
– |
num_mismatches -= itr->length; |
917 |
– |
} |
918 |
– |
if(itr->opcode == DEL){ |
919 |
– |
num_mismatches -= itr->length; |
920 |
– |
} |
921 |
– |
} |
922 |
– |
// vector<string> toks; |
923 |
– |
// tokenize(tag_buf, ":", toks); |
1085 |
|
if (spliced_alignment) |
1086 |
|
{ |
1087 |
|
bh = create_hit(name, |
1088 |
< |
text_name, |
1088 |
> |
ref_name, |
1089 |
> |
ref_name2, |
1090 |
|
text_offset - 1, |
1091 |
|
cigar, |
1092 |
|
sam_flag & 0x0010, |
1093 |
|
antisense_splice, |
1094 |
|
num_mismatches, |
1095 |
< |
num_splice_anchor_mismatches, |
1095 |
> |
0, |
1096 |
|
end); |
935 |
– |
return true; |
936 |
– |
|
1097 |
|
} |
1098 |
|
else |
1099 |
|
{ |
1100 |
|
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
941 |
– |
|
1101 |
|
bh = create_hit(name, |
1102 |
< |
text_name, |
1102 |
> |
ref_name, |
1103 |
> |
ref_name2, |
1104 |
|
text_offset - 1, // SAM files are 1-indexed |
1105 |
|
cigar, |
1106 |
|
sam_flag & 0x0010, |
1108 |
|
num_mismatches, |
1109 |
|
0, |
1110 |
|
end); |
951 |
– |
return true; |
1111 |
|
} |
1112 |
< |
|
1113 |
< |
return false; |
1112 |
> |
return true; |
1113 |
> |
} |
1114 |
> |
|
1115 |
> |
void cigar_add(vector<CigarOp>& cigar, CigarOp& op) { |
1116 |
> |
if (op.length<=0) return; |
1117 |
> |
if (cigar.size()>0 && cigar.back().opcode==op.opcode) { |
1118 |
> |
cigar.back().length+=op.length; |
1119 |
> |
} |
1120 |
> |
cigar.push_back(op); |
1121 |
> |
} |
1122 |
> |
|
1123 |
> |
int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches, |
1124 |
> |
int &left, int spl_start, int spl_len, CigarOpCode spl_code) { |
1125 |
> |
//merge the original 'cigar' with the new insert/gap operation |
1126 |
> |
//at position spl_start and place the result into splcigar; |
1127 |
> |
//TODO: ideally this should also get and rebuild the MD string (alignment mismatches) |
1128 |
> |
int spl_mismatches=0; |
1129 |
> |
//return value: mismatches in the insert region for INS case, |
1130 |
> |
//or number of mismatches in the anchor region |
1131 |
> |
//return -1 if somehow the hit seems bad |
1132 |
> |
|
1133 |
> |
//these offsets are relative to the beginning of alignment on reference |
1134 |
> |
int spl_ofs=abs(spl_start-left); //relative position of splice op |
1135 |
> |
int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op |
1136 |
> |
CigarOp gapop(spl_code, spl_len); //for DEL, REF_SKIP, FUSIONS |
1137 |
> |
if (spl_code==INS) |
1138 |
> |
spl_ofs_end += spl_len; |
1139 |
> |
int ref_ofs=0; //working offset on reference |
1140 |
> |
int read_ofs=0; //working offset on the read, relative to the leftmost aligned base |
1141 |
> |
bool xfound=false; |
1142 |
> |
//if (left<=spl_start+spl_len) { |
1143 |
> |
if (spl_ofs_end>0) { |
1144 |
> |
int prev_opcode=0; |
1145 |
> |
int prev_oplen=0; |
1146 |
> |
for (size_t c = 0 ; c < cigar.size(); ++c) |
1147 |
> |
{ |
1148 |
> |
int prev_read_ofs=read_ofs; |
1149 |
> |
int cur_op_ofs=ref_ofs; |
1150 |
> |
int cur_opcode=cigar[c].opcode; |
1151 |
> |
int cur_oplen=cigar[c].length; |
1152 |
> |
|
1153 |
> |
switch (cur_opcode) { |
1154 |
> |
case MATCH: |
1155 |
> |
ref_ofs+=cur_oplen; |
1156 |
> |
read_ofs+=cur_oplen; |
1157 |
> |
if (spl_code==REF_SKIP || spl_code==DEL || |
1158 |
> |
spl_code==FUSION_FF || spl_code==FUSION_FR || |
1159 |
> |
spl_code==FUSION_RF || spl_code==FUSION_RR) { |
1160 |
> |
for (int o=cur_op_ofs;o<ref_ofs;o++) { |
1161 |
> |
int rofs=prev_read_ofs+(o-cur_op_ofs); |
1162 |
> |
if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs]) |
1163 |
> |
spl_mismatches++; |
1164 |
> |
} |
1165 |
> |
} |
1166 |
> |
else if (spl_code==INS) { |
1167 |
> |
for (int o=cur_op_ofs;o<ref_ofs;o++) { |
1168 |
> |
int rofs=prev_read_ofs+(o-cur_op_ofs); |
1169 |
> |
if (o>=spl_ofs && o<spl_ofs_end && mismatches[rofs]) |
1170 |
> |
spl_mismatches++; |
1171 |
> |
} |
1172 |
> |
} |
1173 |
> |
break; |
1174 |
> |
case DEL: |
1175 |
> |
case REF_SKIP: |
1176 |
> |
case PAD: |
1177 |
> |
ref_ofs+=cur_oplen; |
1178 |
> |
break; |
1179 |
> |
case SOFT_CLIP: |
1180 |
> |
case INS: |
1181 |
> |
read_ofs+=cur_oplen; |
1182 |
> |
break; |
1183 |
> |
//case HARD_CLIP: |
1184 |
> |
} |
1185 |
> |
|
1186 |
> |
if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) { |
1187 |
> |
if (cur_op_ofs==spl_ofs_end && spl_code!=INS) |
1188 |
> |
{ |
1189 |
> |
xfound=true; |
1190 |
> |
//we have to insert the gap here first |
1191 |
> |
cigar_add(splcigar, gapop); |
1192 |
> |
//also, check |
1193 |
> |
} |
1194 |
> |
cigar_add(splcigar, cigar[c]); |
1195 |
> |
} |
1196 |
> |
else //if (ref_ofs>spl_ofs) { |
1197 |
> |
{ //op intersection |
1198 |
> |
xfound=true; |
1199 |
> |
if (spl_code==INS) { |
1200 |
> |
//we have to shorten cur_opcode |
1201 |
> |
// find the overlap between current range |
1202 |
> |
int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs; |
1203 |
> |
int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs; |
1204 |
> |
|
1205 |
> |
CigarOp op(cigar[c]); |
1206 |
> |
op.length=spl_ofs-cur_op_ofs; |
1207 |
> |
if (spl_ofs>cur_op_ofs) |
1208 |
> |
cigar_add(splcigar, op); |
1209 |
> |
if (spl_ofs<0) |
1210 |
> |
{ |
1211 |
> |
CigarOp temp = gapop; |
1212 |
> |
temp.length += spl_ofs; |
1213 |
> |
if (temp.length>0) |
1214 |
> |
cigar_add(splcigar, temp); |
1215 |
> |
} |
1216 |
> |
else |
1217 |
> |
cigar_add(splcigar, gapop); |
1218 |
> |
op.length=ref_ofs-spl_ofs_end; |
1219 |
> |
if (ref_ofs>spl_ofs_end) |
1220 |
> |
cigar_add(splcigar,op); |
1221 |
> |
} |
1222 |
> |
else {//DEL or REF_SKIP or FUSION_[FR][FR] |
1223 |
> |
//spl_ofs == spl_ofs_end |
1224 |
> |
//we have to split cur_opcode |
1225 |
> |
//look for mismatches within min_anchor_len distance from splice point |
1226 |
> |
CigarOp op(cigar[c]); |
1227 |
> |
CigarOpCode opcode = op.opcode; |
1228 |
> |
op.length=spl_ofs-cur_op_ofs; |
1229 |
> |
if (gapop.opcode == FUSION_RF || gapop.opcode == FUSION_RR) |
1230 |
> |
{ |
1231 |
> |
if (opcode == MATCH) |
1232 |
> |
op.opcode = mATCH; |
1233 |
> |
else if (opcode == INS) |
1234 |
> |
op.opcode = iNS; |
1235 |
> |
else if (opcode == DEL) |
1236 |
> |
op.opcode = dEL; |
1237 |
> |
else if (opcode == REF_SKIP) |
1238 |
> |
op.opcode = rEF_SKIP; |
1239 |
> |
} |
1240 |
> |
cigar_add(splcigar, op); |
1241 |
> |
|
1242 |
> |
cigar_add(splcigar, gapop); |
1243 |
> |
|
1244 |
> |
op.opcode = opcode; |
1245 |
> |
if (gapop.opcode == FUSION_FR || gapop.opcode == FUSION_RR) |
1246 |
> |
{ |
1247 |
> |
if (opcode == MATCH) |
1248 |
> |
op.opcode = mATCH; |
1249 |
> |
else if (opcode == INS) |
1250 |
> |
op.opcode = iNS; |
1251 |
> |
else if (opcode == DEL) |
1252 |
> |
op.opcode = dEL; |
1253 |
> |
else if (opcode == REF_SKIP) |
1254 |
> |
op.opcode = rEF_SKIP; |
1255 |
> |
} |
1256 |
> |
op.length=ref_ofs-spl_ofs; |
1257 |
> |
cigar_add(splcigar,op); |
1258 |
> |
} |
1259 |
> |
} //op intersection |
1260 |
> |
prev_opcode=cur_opcode; |
1261 |
> |
prev_oplen=cur_oplen; |
1262 |
> |
} //for each cigar opcode |
1263 |
> |
} //intersection possible |
1264 |
> |
|
1265 |
> |
//if (!xfound) {//no intersection found between splice event and alignment |
1266 |
> |
if (spl_ofs_end<=0) { |
1267 |
> |
//alignment starts after the splice event |
1268 |
> |
if (spl_code==INS) left-=spl_len; |
1269 |
> |
else left+=spl_len; |
1270 |
> |
|
1271 |
> |
splcigar = cigar; |
1272 |
> |
} |
1273 |
> |
//else { |
1274 |
> |
//alignment ends before the splice event |
1275 |
> |
//nothing to do |
1276 |
> |
// } |
1277 |
> |
//return spl_mismatches; |
1278 |
> |
// } |
1279 |
> |
|
1280 |
> |
return spl_mismatches; |
1281 |
|
} |
1282 |
|
|
1283 |
|
bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1297 |
|
if (bwt_buf[0] == '@') |
1298 |
|
return false; |
1299 |
|
|
974 |
– |
//char orientation; |
975 |
– |
/* |
976 |
– |
char mismatches[buf_size]; |
977 |
– |
|
978 |
– |
//char* orientation_str = get_token((char**)&buf,"\t"); |
979 |
– |
|
980 |
– |
mismatches[0] = 0; |
981 |
– |
char* mismatches_str = get_token((char**)&buf, "\t"); |
982 |
– |
if (mismatches_str) |
983 |
– |
strcpy(mismatches, mismatches_str); |
984 |
– |
|
985 |
– |
orientation = orientation_str[0]; |
986 |
– |
text_offset = atoi(text_offset_str); |
987 |
– |
*/ |
988 |
– |
|
1300 |
|
char* buf = bwt_buf; |
1301 |
|
char* name = get_token((char**)&buf,"\t"); |
1302 |
|
char* sam_flag_str = get_token((char**)&buf,"\t"); |
1307 |
|
const char* mate_ref_str = get_token((char**)&buf,"\t"); |
1308 |
|
const char* mate_pos_str = get_token((char**)&buf,"\t"); |
1309 |
|
const char* inferred_insert_sz_str = get_token((char**)&buf,"\t"); |
1310 |
< |
|
1310 |
> |
//int num_mismatches=0; |
1311 |
> |
//int mismatches[1024]; //list of 0-based mismatch positions in this read |
1312 |
> |
//parsed from SAM's MD:Z: tag |
1313 |
|
const char* seq_str = get_token((char**)&buf,"\t"); |
1314 |
|
if (seq) |
1315 |
|
strcpy(seq, seq_str); |
1334 |
|
return false; |
1335 |
|
} |
1336 |
|
|
1024 |
– |
|
1337 |
|
int sam_flag = atoi(sam_flag_str); |
1338 |
|
int text_offset = atoi(text_offset_str); |
1339 |
< |
|
1028 |
< |
//############################################## |
1339 |
> |
text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based) |
1340 |
|
bool end = true; |
1341 |
|
unsigned int seg_offset = 0; |
1342 |
|
unsigned int seg_num = 0; |
1343 |
|
unsigned int num_segs = 0; |
1344 |
|
|
1345 |
|
// Copy the tag out of the name field before we might wipe it out |
1346 |
< |
char* pipe = strrchr(name, '|'); |
1036 |
< |
if (pipe) |
1037 |
< |
{ |
1038 |
< |
if (name_tags) |
1039 |
< |
strcpy(name_tags, pipe); |
1346 |
> |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
1347 |
|
|
1348 |
< |
char* tag_buf = pipe + 1; |
1349 |
< |
if (strchr(tag_buf, ':')) |
1350 |
< |
{ |
1044 |
< |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
1045 |
< |
if (seg_num + 1 == num_segs) |
1046 |
< |
end = true; |
1047 |
< |
else |
1048 |
< |
end = false; |
1049 |
< |
} |
1348 |
> |
vector<CigarOp> samcigar; |
1349 |
> |
bool spliced_alignment = false; |
1350 |
> |
int refspan=parseCigar(samcigar, cigar_str, spliced_alignment); |
1351 |
|
|
1352 |
< |
*pipe = 0; |
1353 |
< |
} |
1354 |
< |
// Stripping the slash and number following it gives the insert name |
1355 |
< |
char* slash = strrchr(name, '/'); |
1356 |
< |
if (strip_slash && slash) |
1357 |
< |
*slash = 0; |
1352 |
> |
if (refspan==0) |
1353 |
> |
return false; |
1354 |
> |
bool antisense_splice = false; |
1355 |
> |
int sam_nm = 0; |
1356 |
> |
vector<bool> mismatches; |
1357 |
> |
mismatches.resize(strlen(seq_str), false); |
1358 |
|
|
1359 |
< |
//int read_len = strlen(sequence); |
1359 |
> |
int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice); |
1360 |
> |
|
1361 |
> |
//############################################## |
1362 |
|
|
1363 |
|
// Add this alignment to the table of hits for this half of the |
1364 |
|
// Bowtie map |
1365 |
|
|
1366 |
|
// Parse the text_name field to recover the splice coords |
1367 |
|
vector<string> toks; |
1065 |
– |
|
1368 |
|
tokenize_strict(text_name, "|", toks); |
1369 |
|
|
1370 |
|
int num_extra_toks = (int)toks.size() - 6; |
1389 |
|
if (splice_toks.size() != 2) |
1390 |
|
{ |
1391 |
|
fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); |
1392 |
< |
//fprintf(stderr, "%s (token: %s)\n", text_name, |
1392 |
> |
//fprintf(stderr, "\t%s (token: %s)\n", text_name, |
1393 |
|
// toks[num_extra_toks + splice_field].c_str()); |
1394 |
|
return false; |
1395 |
|
} |
1396 |
|
|
1397 |
< |
// |
1398 |
< |
// check for an insertion hit |
1399 |
< |
// |
1400 |
< |
if(toks[num_extra_toks + junction_type_field] == "ins") |
1401 |
< |
{ |
1100 |
< |
int8_t spliced_read_len = strlen(seq_str); |
1101 |
< |
/* |
1102 |
< |
* The 0-based position of the left edge of the alignment. Note that this |
1103 |
< |
* value may need to be futher corrected to account for the presence of |
1104 |
< |
* of the insertion. |
1105 |
< |
*/ |
1106 |
< |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1107 |
< |
uint32_t right = left + spliced_read_len - 1; |
1108 |
< |
|
1109 |
< |
/* |
1110 |
< |
* The 0-based position of the last genomic sequence before the insertion |
1111 |
< |
*/ |
1112 |
< |
uint32_t left_splice_pos = atoi(splice_toks[0].c_str()); |
1113 |
< |
|
1114 |
< |
string insertedSequence = splice_toks[1]; |
1115 |
< |
/* |
1116 |
< |
* The 0-based position of the first genomic sequence after teh insertion |
1117 |
< |
*/ |
1118 |
< |
uint32_t right_splice_pos = left_splice_pos + 1; |
1119 |
< |
if(left > left_splice_pos){ |
1120 |
< |
/* |
1121 |
< |
* The genomic position of the left edge of the alignment needs to be corrected |
1122 |
< |
* If the alignment does not extend into the insertion, simply subtract the length |
1123 |
< |
* of the inserted sequence, otherwise, just set it equal to the right edge |
1124 |
< |
*/ |
1125 |
< |
left = left - insertedSequence.length(); |
1126 |
< |
if(left < right_splice_pos){ |
1127 |
< |
left = right_splice_pos; |
1128 |
< |
} |
1129 |
< |
} |
1130 |
< |
if(right > left_splice_pos){ |
1131 |
< |
right = right - insertedSequence.length(); |
1132 |
< |
if(right < left_splice_pos){ |
1133 |
< |
right = left_splice_pos; |
1134 |
< |
} |
1135 |
< |
} |
1136 |
< |
/* |
1137 |
< |
* Now, right and left should be properly transformed into genomic coordinates |
1138 |
< |
* We should be able to deduce how much the alignment matches the insertion |
1139 |
< |
* simply based on the length of the read |
1140 |
< |
*/ |
1141 |
< |
int left_match_length = 0; |
1142 |
< |
if(left <= left_splice_pos){ |
1143 |
< |
left_match_length = left_splice_pos - left + 1; |
1144 |
< |
} |
1145 |
< |
int right_match_length = 0; |
1146 |
< |
if(right >= right_splice_pos){ |
1147 |
< |
right_match_length = right - right_splice_pos + 1; |
1148 |
< |
} |
1149 |
< |
int insertion_match_length = spliced_read_len - left_match_length - right_match_length; |
1397 |
> |
string junction_strand = toks[num_extra_toks + strand_field]; |
1398 |
> |
if(junction_strand != "rev" && junction_strand != "fwd"){ |
1399 |
> |
fprintf(stderr, "Malformed insertion record\n"); |
1400 |
> |
return false; |
1401 |
> |
} |
1402 |
|
|
1403 |
< |
if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0) |
1404 |
< |
return false; |
1403 |
> |
// |
1404 |
> |
// check for an insertion hit |
1405 |
> |
// |
1406 |
> |
if(toks[num_extra_toks + junction_type_field] == "ins") |
1407 |
> |
{ |
1408 |
> |
//int8_t spliced_read_len = strlen(seq_str); |
1409 |
> |
//TODO FIXME: use the CIGAR instead of seq length! |
1410 |
> |
// The 0-based position of the left edge of the alignment. Note that this |
1411 |
> |
// value may need to be further corrected to account for the presence of |
1412 |
> |
// of the insertion. |
1413 |
> |
int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1414 |
> |
|
1415 |
> |
// The 0-based position of the last genomic sequence before the insertion |
1416 |
> |
int left_splice_pos = atoi(splice_toks[0].c_str()); |
1417 |
> |
|
1418 |
> |
string insertedSequence = splice_toks[1]; |
1419 |
> |
// The 0-based position of the first genomic sequence after the insertion |
1420 |
> |
|
1421 |
> |
vector<CigarOp> splcigar; |
1422 |
> |
//this also updates left to the adjusted genomic coordinates |
1423 |
> |
int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, |
1424 |
> |
left, left_splice_pos+1, insertedSequence.length(), INS); |
1425 |
|
|
1426 |
< |
string junction_strand = toks[num_extra_toks + strand_field]; |
1427 |
< |
if(junction_strand != "rev" && junction_strand != "fwd"){ |
1428 |
< |
fprintf(stderr, "Malformed insertion record\n"); |
1429 |
< |
return false; |
1430 |
< |
} |
1426 |
> |
if (spl_num_mismatches<0) return false; |
1427 |
> |
num_mismatches-=spl_num_mismatches; |
1428 |
> |
/* |
1429 |
> |
uint32_t right_splice_pos = left_splice_pos + 1; |
1430 |
> |
|
1431 |
> |
//uint32_t right = left + spliced_read_len - 1; |
1432 |
> |
int right = left + refspan - 1; |
1433 |
> |
|
1434 |
> |
if(left > left_splice_pos){ |
1435 |
> |
//The genomic position of the left edge of the alignment needs to be corrected |
1436 |
> |
//If the alignment does not extend into the insertion, simply subtract the length |
1437 |
> |
//of the inserted sequence, otherwise, just set it equal to the right edge |
1438 |
> |
left = left - insertedSequence.length(); |
1439 |
> |
if(left < right_splice_pos){ |
1440 |
> |
left = right_splice_pos; |
1441 |
> |
} |
1442 |
> |
} |
1443 |
> |
if(right > left_splice_pos){ |
1444 |
> |
right = right - insertedSequence.length(); |
1445 |
> |
if(right < left_splice_pos){ |
1446 |
> |
right = left_splice_pos; |
1447 |
> |
} |
1448 |
> |
} |
1449 |
> |
// Now, right and left should be properly transformed into genomic coordinates |
1450 |
> |
// We should be able to deduce how much the alignment matches the insertion |
1451 |
> |
// simply based on the length of the read |
1452 |
> |
int left_match_length = 0; |
1453 |
> |
if(left <= left_splice_pos){ |
1454 |
> |
left_match_length = left_splice_pos - left + 1; |
1455 |
> |
} |
1456 |
> |
int right_match_length = 0; |
1457 |
> |
if(right >= right_splice_pos){ |
1458 |
> |
right_match_length = right - right_splice_pos + 1; |
1459 |
> |
} |
1460 |
> |
int insertion_match_length = spliced_read_len - left_match_length - right_match_length; |
1461 |
> |
|
1462 |
> |
if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0) |
1463 |
> |
return false; |
1464 |
> |
|
1465 |
> |
//char* pch = strtok( mismatches, ","); |
1466 |
> |
//unsigned char num_mismatches = 0; |
1467 |
> |
//text_offset holds the left end of the alignment, |
1468 |
> |
//RELATIVE TO the start of the contig |
1469 |
> |
|
1470 |
> |
//The 0-based relative position of the left-most character |
1471 |
> |
//before the insertion in the contig |
1472 |
> |
int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
1473 |
> |
for (size_t i=0;i<mismatches.size();++i) { |
1474 |
> |
int mismatch_pos = mismatches[i]; |
1475 |
> |
// for reversely mapped reads, |
1476 |
> |
//find the correct mismatched position. |
1477 |
> |
if (sam_flag & 0x0010){ |
1478 |
> |
mismatch_pos = spliced_read_len - mismatch_pos - 1; |
1479 |
> |
} |
1480 |
> |
|
1481 |
> |
//Only count mismatches outside of the insertion region |
1482 |
> |
// If there is a mismatch within the insertion, |
1483 |
> |
// disallow this hit |
1484 |
> |
if(mismatch_pos + text_offset <= relative_splice_pos || |
1485 |
> |
mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){ |
1486 |
> |
spl_num_mismatches++; |
1487 |
> |
}else{ |
1488 |
> |
return false; |
1489 |
> |
} |
1490 |
> |
} |
1491 |
> |
*/ |
1492 |
> |
//vector<CigarOp> splcigar; |
1493 |
> |
//spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS); |
1494 |
> |
//splcigar.push_back(CigarOp(MATCH, left_match_length)); |
1495 |
> |
//splcigar.push_back(CigarOp(INS, insertion_match_length)); |
1496 |
> |
//splcigar.push_back(CigarOp(MATCH, right_match_length)); |
1497 |
> |
|
1498 |
> |
bh = create_hit(name, |
1499 |
> |
contig, |
1500 |
> |
"", |
1501 |
> |
left, |
1502 |
> |
//splcigar, |
1503 |
> |
splcigar, |
1504 |
> |
sam_flag & 0x0010, |
1505 |
> |
junction_strand == "rev", |
1506 |
> |
num_mismatches, |
1507 |
> |
0, |
1508 |
> |
end); |
1509 |
|
|
1510 |
< |
char* pch = strtok( mismatches, ","); |
1511 |
< |
unsigned char num_mismatches = 0; |
1510 |
> |
return true; |
1511 |
> |
} //"ins" |
1512 |
> |
else //"del" or intron |
1513 |
> |
{ |
1514 |
> |
// The 0-based position of the left edge of the alignment. |
1515 |
> |
int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1516 |
> |
|
1517 |
> |
// The 0-based position of the last genomic sequence before the deletion |
1518 |
> |
int left_splice_pos = atoi(splice_toks[0].c_str()); |
1519 |
> |
|
1520 |
> |
int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1; |
1521 |
> |
/* |
1522 |
> |
if ((sam_flag & 0x0010) == 0) //###### |
1523 |
> |
{ |
1524 |
> |
if (left_splice_ofs + seg_offset < _anchor_length) |
1525 |
> |
return false; |
1526 |
> |
} |
1527 |
> |
else |
1528 |
> |
{ |
1529 |
> |
if (right_splice_ofs + seg_offset < _anchor_length) |
1530 |
> |
return false; |
1531 |
> |
} |
1532 |
> |
*/ |
1533 |
> |
//uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos; |
1534 |
> |
//atoi(toks[right_window_edge_field].c_str()); |
1535 |
> |
|
1536 |
> |
/* |
1537 |
> |
//offset of deletion point, relative to the beginning of the alignment |
1538 |
> |
int left_splice_ofs = left_splice_pos-left+1; |
1539 |
> |
|
1540 |
> |
int mismatches_in_anchor = 0; |
1541 |
> |
for (size_t i=0;i<mismatches.size();++i) { |
1542 |
> |
spl_num_mismatches++; |
1543 |
> |
int mismatch_pos = mismatches[i]; |
1544 |
> |
if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) || |
1545 |
> |
((sam_flag & 0x0010) != 0 && |
1546 |
> |
abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len) |
1547 |
> |
mismatches_in_anchor++; |
1548 |
> |
} |
1549 |
> |
*/ |
1550 |
> |
vector<CigarOp> splcigar; |
1551 |
> |
|
1552 |
> |
CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP; |
1553 |
> |
|
1554 |
> |
int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left, |
1555 |
> |
left_splice_pos+1, gap_len, opcode); |
1556 |
> |
|
1557 |
> |
if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches) |
1558 |
> |
return false; |
1559 |
> |
/* |
1560 |
> |
splcigar.push_back(CigarOp(MATCH, left_splice_pos)); |
1561 |
> |
if(toks[num_extra_toks + junction_type_field] == "del"){ |
1562 |
> |
splcigar.push_back(CigarOp(DEL, gap_len)); |
1563 |
> |
}else{ |
1564 |
> |
splcigar.push_back(CigarOp(REF_SKIP, gap_len)); |
1565 |
> |
} |
1566 |
> |
splcigar.push_back(CigarOp(MATCH, right_splice_pos)); |
1567 |
> |
*/ |
1568 |
> |
bh = create_hit(name, |
1569 |
> |
contig, |
1570 |
> |
"", |
1571 |
> |
left, |
1572 |
> |
splcigar, |
1573 |
> |
(sam_flag & 0x0010), |
1574 |
> |
junction_strand == "rev", |
1575 |
> |
num_mismatches, |
1576 |
> |
spl_num_mismatches, |
1577 |
> |
end); |
1578 |
|
|
1579 |
< |
/* |
1580 |
< |
* remember that text_offset holds the left end of the |
1581 |
< |
* alignment, relative to the start of the contig |
1582 |
< |
*/ |
1579 |
> |
return true; |
1580 |
> |
} |
1581 |
> |
} //parse splice data |
1582 |
> |
else |
1583 |
> |
{ |
1584 |
> |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
1585 |
> |
//fprintf(stderr, "%s\n", orig_bwt_buf); |
1586 |
> |
// continue; |
1587 |
> |
return false; |
1588 |
> |
} |
1589 |
> |
|
1590 |
> |
return false; |
1591 |
> |
} |
1592 |
|
|
1168 |
– |
/* |
1169 |
– |
* The 0-based relative position of the left-most character |
1170 |
– |
* before the insertion in the contig |
1171 |
– |
*/ |
1172 |
– |
int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
1173 |
– |
while (pch != NULL) |
1174 |
– |
{ |
1175 |
– |
char* colon = strchr(pch, ':'); |
1176 |
– |
if (colon) |
1177 |
– |
{ |
1178 |
– |
*colon = 0; |
1179 |
– |
int mismatch_pos = atoi(pch); |
1593 |
|
|
1181 |
– |
/* |
1182 |
– |
* for reversely mapped reads, |
1183 |
– |
* find the correct mismatched position. |
1184 |
– |
*/ |
1185 |
– |
if (sam_flag & 0x0010){ |
1186 |
– |
mismatch_pos = spliced_read_len - mismatch_pos - 1; |
1187 |
– |
} |
1594 |
|
|
1595 |
< |
/* |
1596 |
< |
* Only count mismatches outside of the insertion region |
1597 |
< |
* If there is a mismatch within the insertion, |
1598 |
< |
* disallow this hit |
1599 |
< |
*/ |
1600 |
< |
if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){ |
1601 |
< |
num_mismatches++; |
1602 |
< |
}else{ |
1603 |
< |
return false; |
1604 |
< |
} |
1605 |
< |
} |
1606 |
< |
pch = strtok (NULL, ","); |
1607 |
< |
} |
1595 |
> |
bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1596 |
> |
BowtieHit& bh, bool strip_slash, |
1597 |
> |
char* name_out, char* name_tags, |
1598 |
> |
char* seq, char* qual) { |
1599 |
> |
if (_sam_header==NULL) |
1600 |
> |
err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!"); |
1601 |
> |
const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf; |
1602 |
> |
|
1603 |
> |
uint32_t sam_flag = hit_buf->core.flag; |
1604 |
> |
|
1605 |
> |
int text_offset = hit_buf->core.pos; |
1606 |
> |
int text_mate_pos = hit_buf->core.mpos; |
1607 |
> |
int target_id = hit_buf->core.tid; |
1608 |
> |
int mate_target_id = hit_buf->core.mtid; |
1609 |
> |
|
1610 |
> |
vector<CigarOp> cigar; |
1611 |
> |
bool spliced_alignment = false; |
1612 |
> |
int num_hits = 1; |
1613 |
> |
|
1614 |
> |
double mapQ = hit_buf->core.qual; |
1615 |
> |
long double error_prob; |
1616 |
> |
if (mapQ > 0) |
1617 |
> |
{ |
1618 |
> |
long double p = (-1.0 * mapQ) / 10.0; |
1619 |
> |
error_prob = pow(10.0L, p); |
1620 |
> |
} |
1621 |
> |
else |
1622 |
> |
{ |
1623 |
> |
error_prob = 1.0; |
1624 |
> |
} |
1625 |
> |
|
1626 |
> |
bool end = true; |
1627 |
> |
unsigned int seg_offset = 0; |
1628 |
> |
unsigned int seg_num = 0; |
1629 |
> |
unsigned int num_segs = 0; |
1630 |
> |
// Copy the tag out of the name field before we might wipe it out |
1631 |
> |
char* qname = bam1_qname(hit_buf); |
1632 |
> |
char* pipe = strrchr(qname, '|'); |
1633 |
> |
if (pipe) |
1634 |
> |
{ |
1635 |
> |
if (name_tags) |
1636 |
> |
strcpy(name_tags, pipe); |
1637 |
> |
|
1638 |
> |
char* tag_buf = pipe + 1; |
1639 |
> |
if (strchr(tag_buf, ':')) |
1640 |
> |
{ |
1641 |
> |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
1642 |
> |
if (seg_num + 1 == num_segs) |
1643 |
> |
end = true; |
1644 |
> |
else |
1645 |
> |
end = false; |
1646 |
> |
} |
1647 |
> |
|
1648 |
> |
*pipe = 0; |
1649 |
> |
} |
1650 |
|
|
1651 |
+ |
if (target_id < 0) { |
1652 |
+ |
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
1653 |
+ |
bh = create_hit(qname, |
1654 |
+ |
"*", //ref_name |
1655 |
+ |
0, //left coord |
1656 |
+ |
0, //read_len |
1657 |
+ |
false, //antisense_aln |
1658 |
+ |
0, //edit_dist |
1659 |
+ |
end); |
1660 |
+ |
return true; |
1661 |
+ |
} |
1662 |
|
|
1663 |
< |
vector<CigarOp> cigar; |
1664 |
< |
cigar.push_back(CigarOp(MATCH, left_match_length)); |
1665 |
< |
cigar.push_back(CigarOp(INS, insertion_match_length)); |
1666 |
< |
cigar.push_back(CigarOp(MATCH, right_match_length)); |
1663 |
> |
if (seq!=NULL) { |
1664 |
> |
char *bseq = (char*)bam1_seq(hit_buf); |
1665 |
> |
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
1666 |
> |
char v = bam1_seqi(bseq,i); |
1667 |
> |
seq[i]=bam_nt16_rev_table[v]; |
1668 |
> |
} |
1669 |
> |
seq[hit_buf->core.l_qseq]=0; |
1670 |
> |
} |
1671 |
> |
if (qual!=NULL) { |
1672 |
> |
char *bq = (char*)bam1_qual(hit_buf); |
1673 |
> |
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
1674 |
> |
qual[i]=bq[i]+33; |
1675 |
> |
} |
1676 |
> |
qual[hit_buf->core.l_qseq]=0; |
1677 |
> |
} |
1678 |
|
|
1679 |
< |
/* |
1680 |
< |
* For now, disallow hits that don't span |
1681 |
< |
* the insertion. If we allow these types of hits, |
1682 |
< |
* then long_spanning.cpp needs to be updated |
1683 |
< |
* in order to intelligently merge these kinds |
1684 |
< |
* of reads back together |
1685 |
< |
* |
1686 |
< |
* Following code has been changed to allow segment that end |
1687 |
< |
* in an insertion |
1688 |
< |
*/ |
1689 |
< |
bh = create_hit(name, |
1690 |
< |
contig, |
1691 |
< |
left, |
1692 |
< |
cigar, |
1693 |
< |
sam_flag & 0x0010, //####### |
1694 |
< |
junction_strand == "rev", |
1695 |
< |
num_mismatches, |
1696 |
< |
0, |
1697 |
< |
end); |
1698 |
< |
return true; |
1229 |
< |
} |
1679 |
> |
bool antisense_splice=false; |
1680 |
> |
unsigned char num_mismatches = 0; |
1681 |
> |
unsigned char num_splice_anchor_mismatches = 0; |
1682 |
> |
|
1683 |
> |
uint8_t* ptr = bam_aux_get(hit_buf, "XS"); |
1684 |
> |
if (ptr) { |
1685 |
> |
char src_strand_char = bam_aux2A(ptr); |
1686 |
> |
if (src_strand_char == '-') |
1687 |
> |
antisense_splice = true; |
1688 |
> |
} |
1689 |
> |
|
1690 |
> |
ptr = bam_aux_get(hit_buf, "NM"); |
1691 |
> |
if (ptr) { |
1692 |
> |
num_mismatches = bam_aux2i(ptr); |
1693 |
> |
} |
1694 |
> |
|
1695 |
> |
ptr = bam_aux_get(hit_buf, "NH"); |
1696 |
> |
if (ptr) { |
1697 |
> |
num_hits = bam_aux2i(ptr); |
1698 |
> |
} |
1699 |
|
|
1700 |
< |
else |
1701 |
< |
{ |
1702 |
< |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1703 |
< |
int spliced_read_len = strlen(seq_str); |
1704 |
< |
int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1; |
1705 |
< |
if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len; |
1706 |
< |
int8_t right_splice_pos = spliced_read_len - left_splice_pos; |
1700 |
> |
string text_name = _sam_header->target_name[target_id]; |
1701 |
> |
string text_name2 = ""; |
1702 |
> |
|
1703 |
> |
bool fusion_alignment = false; |
1704 |
> |
string fusion_cigar_str; |
1705 |
> |
ptr = bam_aux_get(hit_buf, "XF"); |
1706 |
> |
if (ptr) { |
1707 |
> |
fusion_alignment = true; |
1708 |
> |
char* xf = bam_aux2Z(ptr); |
1709 |
|
|
1710 |
< |
if (right_splice_pos <= 0 || left_splice_pos <= 0) |
1711 |
< |
return false; |
1710 |
> |
// ignore the second part of a fusion alignment |
1711 |
> |
if (xf[0] == '2') |
1712 |
> |
return false; |
1713 |
|
|
1714 |
< |
if ((sam_flag & 0x0010) == 0) //###### |
1715 |
< |
{ |
1244 |
< |
if (left_splice_pos + seg_offset < _anchor_length){ |
1245 |
< |
return false; |
1246 |
< |
} |
1247 |
< |
} |
1248 |
< |
else |
1249 |
< |
{ |
1250 |
< |
if (right_splice_pos + seg_offset < _anchor_length) |
1251 |
< |
return false; |
1252 |
< |
} |
1253 |
< |
//uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos; |
1254 |
< |
//atoi(toks[right_window_edge_field].c_str()); |
1255 |
< |
int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1; |
1714 |
> |
vector<string> fields; |
1715 |
> |
tokenize(xf, " ", fields); |
1716 |
|
|
1717 |
< |
string junction_strand = toks[num_extra_toks + strand_field]; |
1718 |
< |
if (!(junction_strand == "rev" || junction_strand == "fwd")|| |
1719 |
< |
!(orientation == '-' || orientation == '+')) |
1720 |
< |
{ |
1721 |
< |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
1722 |
< |
//fprintf(stderr, "junction_strand=%s, orientation='%c'\n", |
1723 |
< |
// junction_strand.c_str(), orientation); |
1264 |
< |
return false; |
1265 |
< |
} |
1717 |
> |
vector<string> contigs; |
1718 |
> |
tokenize(fields[1], "-", contigs); |
1719 |
> |
if (contigs.size() >= 2) |
1720 |
> |
{ |
1721 |
> |
text_name = contigs[0]; |
1722 |
> |
text_name2 = contigs[1]; |
1723 |
> |
} |
1724 |
|
|
1725 |
< |
//vector<string> mismatch_toks; |
1726 |
< |
char* pch = strtok (mismatches,","); |
1269 |
< |
int mismatches_in_anchor = 0; |
1270 |
< |
unsigned char num_mismatches = 0; |
1271 |
< |
while (pch != NULL) |
1272 |
< |
{ |
1273 |
< |
char* colon = strchr(pch, ':'); |
1274 |
< |
if (colon) |
1275 |
< |
{ |
1276 |
< |
*colon = 0; |
1277 |
< |
num_mismatches++; |
1278 |
< |
int mismatch_pos = atoi(pch); |
1279 |
< |
if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) || |
1280 |
< |
(orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len) |
1281 |
< |
mismatches_in_anchor++; |
1282 |
< |
} |
1283 |
< |
//mismatch_toks.push_back(pch); |
1284 |
< |
pch = strtok (NULL, ","); |
1285 |
< |
} |
1725 |
> |
text_offset = atoi(fields[2].c_str()) - 1; |
1726 |
> |
fusion_cigar_str = fields[3].c_str(); |
1727 |
|
|
1728 |
< |
// FIXME: we probably should exclude these hits somewhere, but this |
1729 |
< |
// isn't the right place |
1730 |
< |
vector<CigarOp> cigar; |
1731 |
< |
cigar.push_back(CigarOp(MATCH, left_splice_pos)); |
1732 |
< |
if(toks[num_extra_toks + junction_type_field] == "del"){ |
1292 |
< |
cigar.push_back(CigarOp(DEL, gap_len)); |
1293 |
< |
}else{ |
1294 |
< |
cigar.push_back(CigarOp(REF_SKIP, gap_len)); |
1295 |
< |
} |
1296 |
< |
cigar.push_back(CigarOp(MATCH, right_splice_pos)); |
1728 |
> |
if (seq) |
1729 |
> |
strcpy(seq, fields[4].c_str()); |
1730 |
> |
if (qual) |
1731 |
> |
strcpy(qual, fields[5].c_str()); |
1732 |
> |
} |
1733 |
|
|
1734 |
< |
bh = create_hit(name, |
1735 |
< |
contig, |
1736 |
< |
left, |
1737 |
< |
cigar, |
1738 |
< |
orientation == '-', |
1739 |
< |
junction_strand == "rev", |
1740 |
< |
num_mismatches, |
1741 |
< |
mismatches_in_anchor, |
1742 |
< |
end); |
1743 |
< |
return true; |
1744 |
< |
} |
1734 |
> |
if (fusion_alignment) { |
1735 |
> |
const char* p_cig = fusion_cigar_str.c_str(); |
1736 |
> |
while (*p_cig) { |
1737 |
> |
char* t; |
1738 |
> |
int length = (int)strtol(p_cig, &t, 10); |
1739 |
> |
if (length <= 0) |
1740 |
> |
{ |
1741 |
> |
//fprintf (stderr, "CIGAR op has zero length\n"); |
1742 |
> |
return false; |
1743 |
> |
} |
1744 |
> |
char op_char = *t; |
1745 |
> |
CigarOpCode opcode; |
1746 |
> |
if (op_char == 'M') opcode = MATCH; |
1747 |
> |
else if(op_char == 'm') opcode = mATCH; |
1748 |
> |
else if (op_char == 'I') opcode = INS; |
1749 |
> |
else if (op_char == 'i') opcode = iNS; |
1750 |
> |
else if (op_char == 'D') opcode = DEL; |
1751 |
> |
else if (op_char == 'd') opcode = dEL; |
1752 |
> |
else if (op_char == 'N' || op_char == 'n') |
1753 |
> |
{ |
1754 |
> |
if (length > max_report_intron_length) |
1755 |
> |
return false; |
1756 |
> |
|
1757 |
> |
if (op_char == 'N') |
1758 |
> |
opcode = REF_SKIP; |
1759 |
> |
else |
1760 |
> |
opcode = rEF_SKIP; |
1761 |
> |
spliced_alignment = true; |
1762 |
> |
} |
1763 |
> |
else if (op_char == 'F') |
1764 |
> |
{ |
1765 |
> |
opcode = FUSION_FF; |
1766 |
> |
length = length - 1; |
1767 |
> |
} |
1768 |
> |
else if (op_char == 'S') opcode = SOFT_CLIP; |
1769 |
> |
else if (op_char == 'H') opcode = HARD_CLIP; |
1770 |
> |
else if (op_char == 'P') opcode = PAD; |
1771 |
> |
else |
1772 |
> |
{ |
1773 |
> |
fprintf (stderr, "(%d-%d) invalid CIGAR operation\n", length, (int)op_char); |
1774 |
> |
return false; |
1775 |
|
} |
1776 |
< |
else |
1776 |
> |
p_cig = t + 1; |
1777 |
> |
cigar.push_back(CigarOp(opcode, length)); |
1778 |
> |
|
1779 |
> |
if(opcode == INS) { |
1780 |
> |
num_mismatches -= length; |
1781 |
> |
} |
1782 |
> |
else if(opcode == DEL) { |
1783 |
> |
num_mismatches -= length; |
1784 |
> |
} |
1785 |
> |
|
1786 |
> |
/* |
1787 |
> |
* update fusion direction. |
1788 |
> |
*/ |
1789 |
> |
size_t cigar_size = cigar.size(); |
1790 |
> |
if (cigar_size >= 3 && cigar[cigar_size - 2].opcode == FUSION_FF) |
1791 |
|
{ |
1792 |
< |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
1793 |
< |
//fprintf(stderr, "%s\n", orig_bwt_buf); |
1794 |
< |
// continue; |
1792 |
> |
CigarOpCode prev = cigar[cigar_size - 3].opcode; |
1793 |
> |
CigarOpCode next = cigar[cigar_size - 1].opcode; |
1794 |
> |
|
1795 |
> |
bool increase1 = false, increase2 = false; |
1796 |
> |
if (prev == MATCH || prev == DEL || prev == INS || prev == REF_SKIP) |
1797 |
> |
increase1 = true; |
1798 |
> |
if (next == MATCH || next == DEL || next == INS || next == REF_SKIP) |
1799 |
> |
increase2 = true; |
1800 |
> |
|
1801 |
> |
if (increase1 && !increase2) |
1802 |
> |
cigar[cigar_size - 2].opcode = FUSION_FR; |
1803 |
> |
else if (!increase1 && increase2) |
1804 |
> |
cigar[cigar_size - 2].opcode = FUSION_RF; |
1805 |
> |
else if (!increase1 && !increase2) |
1806 |
> |
cigar[cigar_size - 2].opcode = FUSION_RR; |
1807 |
> |
} |
1808 |
> |
} |
1809 |
> |
} |
1810 |
> |
else { |
1811 |
> |
for (int i = 0; i < hit_buf->core.n_cigar; ++i) |
1812 |
> |
{ |
1813 |
> |
int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT; |
1814 |
> |
if (length <= 0) |
1815 |
> |
{ |
1816 |
> |
fprintf (stderr, "BAM error: CIGAR op has zero length\n"); |
1817 |
> |
return false; |
1818 |
> |
} |
1819 |
> |
|
1820 |
> |
CigarOpCode opcode; |
1821 |
> |
switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK) |
1822 |
> |
{ |
1823 |
> |
case BAM_CMATCH: opcode = MATCH; break; |
1824 |
> |
case BAM_CINS: opcode = INS; break; |
1825 |
> |
case BAM_CDEL: opcode = DEL; break; |
1826 |
> |
case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break; |
1827 |
> |
case BAM_CHARD_CLIP: opcode = HARD_CLIP; break; |
1828 |
> |
case BAM_CPAD: opcode = PAD; break; |
1829 |
> |
case BAM_CREF_SKIP: |
1830 |
> |
opcode = REF_SKIP; |
1831 |
> |
spliced_alignment = true; |
1832 |
> |
if (length > (int)max_report_intron_length) |
1833 |
> |
{ |
1834 |
> |
//fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n"); |
1835 |
|
return false; |
1836 |
+ |
} |
1837 |
+ |
break; |
1838 |
+ |
default: |
1839 |
+ |
fprintf (stderr, "BAM read: invalid CIGAR operation\n"); |
1840 |
+ |
return false; |
1841 |
+ |
} |
1842 |
+ |
|
1843 |
+ |
if (opcode != HARD_CLIP) |
1844 |
+ |
cigar.push_back(CigarOp(opcode, length)); |
1845 |
+ |
|
1846 |
+ |
/* |
1847 |
+ |
* By convention,the NM field of the SAM record |
1848 |
+ |
* counts an insertion or deletion. I dont' think |
1849 |
+ |
* we want the mismatch count in the BowtieHit |
1850 |
+ |
* record to reflect this. Therefore, subtract out |
1851 |
+ |
* the mismatches due to in/dels |
1852 |
+ |
*/ |
1853 |
+ |
if(opcode == INS){ |
1854 |
+ |
num_mismatches -= length; |
1855 |
|
} |
1856 |
+ |
else if(opcode == DEL){ |
1857 |
+ |
num_mismatches -= length; |
1858 |
+ |
} |
1859 |
+ |
} |
1860 |
+ |
} |
1861 |
|
|
1862 |
< |
return false; |
1862 |
> |
|
1863 |
> |
string mrnm; |
1864 |
> |
if (mate_target_id >= 0) { |
1865 |
> |
if (mate_target_id == target_id) { |
1866 |
> |
//mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id]; |
1867 |
> |
mrnm = _sam_header->target_name[mate_target_id]; |
1868 |
> |
} |
1869 |
> |
else { |
1870 |
> |
//fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n"); |
1871 |
> |
return false; |
1872 |
> |
} |
1873 |
> |
} |
1874 |
> |
else { |
1875 |
> |
text_mate_pos = 0; |
1876 |
> |
} |
1877 |
> |
|
1878 |
> |
if (spliced_alignment) { |
1879 |
> |
bh = create_hit(qname, |
1880 |
> |
text_name, |
1881 |
> |
text_name2, |
1882 |
> |
text_offset, // BAM files are 0-indexed |
1883 |
> |
cigar, |
1884 |
> |
sam_flag & 0x0010, |
1885 |
> |
antisense_splice, |
1886 |
> |
num_mismatches, |
1887 |
> |
num_splice_anchor_mismatches, |
1888 |
> |
end); |
1889 |
> |
|
1890 |
> |
} |
1891 |
> |
else { |
1892 |
> |
bh = create_hit(qname, |
1893 |
> |
text_name, |
1894 |
> |
text_name2, |
1895 |
> |
text_offset, // BAM files are 0-indexed |
1896 |
> |
cigar, |
1897 |
> |
sam_flag & 0x0010, |
1898 |
> |
false, |
1899 |
> |
num_mismatches, |
1900 |
> |
0, |
1901 |
> |
end); |
1902 |
> |
} |
1903 |
> |
|
1904 |
> |
return true; |
1905 |
|
} |
1906 |
|
|
1907 |
+ |
bool BAMHitFactory::inspect_header(HitStream& hs) |
1908 |
+ |
{ |
1909 |
+ |
bam_header_t* header = ((samfile_t*)(hs._hit_file))->header; |
1910 |
+ |
|
1911 |
+ |
if (header == NULL) { |
1912 |
+ |
fprintf(stderr, "Warning: No BAM header\n"); |
1913 |
+ |
return false; |
1914 |
+ |
} |
1915 |
+ |
if (header->l_text == 0) { |
1916 |
+ |
fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n"); |
1917 |
+ |
return false; |
1918 |
+ |
} |
1919 |
+ |
return true; |
1920 |
+ |
} |
1921 |
|
|
1922 |
< |
bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1922 |
> |
bool SplicedBAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1923 |
|
BowtieHit& bh, bool strip_slash, |
1924 |
|
char* name_out, char* name_tags, |
1925 |
|
char* seq, char* qual) { |
1926 |
|
if (_sam_header==NULL) |
1927 |
|
err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!"); |
1928 |
< |
const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf; |
1929 |
< |
|
1930 |
< |
|
1931 |
< |
uint32_t sam_flag = hit_buf->core.flag; |
1932 |
< |
|
1933 |
< |
int text_offset = hit_buf->core.pos; |
1934 |
< |
int text_mate_pos = hit_buf->core.mpos; |
1935 |
< |
int target_id = hit_buf->core.tid; |
1936 |
< |
int mate_target_id = hit_buf->core.mtid; |
1937 |
< |
|
1938 |
< |
vector<CigarOp> cigar; |
1939 |
< |
bool spliced_alignment = false; |
1940 |
< |
int num_hits = 1; |
1941 |
< |
|
1942 |
< |
double mapQ = hit_buf->core.qual; |
1943 |
< |
long double error_prob; |
1944 |
< |
if (mapQ > 0) |
1945 |
< |
{ |
1946 |
< |
long double p = (-1.0 * mapQ) / 10.0; |
1947 |
< |
error_prob = pow(10.0L, p); |
1948 |
< |
} |
1949 |
< |
else |
1950 |
< |
{ |
1951 |
< |
error_prob = 1.0; |
1952 |
< |
} |
1928 |
> |
|
1929 |
> |
const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf; |
1930 |
> |
uint32_t sam_flag = hit_buf->core.flag; |
1931 |
> |
|
1932 |
> |
int text_offset = hit_buf->core.pos; |
1933 |
> |
int text_mate_pos = hit_buf->core.mpos; |
1934 |
> |
int target_id = hit_buf->core.tid; |
1935 |
> |
int mate_target_id = hit_buf->core.mtid; |
1936 |
> |
|
1937 |
> |
vector<CigarOp> samcigar; |
1938 |
> |
bool spliced_alignment = false; |
1939 |
> |
int num_hits = 1; |
1940 |
> |
|
1941 |
> |
double mapQ = hit_buf->core.qual; |
1942 |
> |
long double error_prob; |
1943 |
> |
if (mapQ > 0) |
1944 |
> |
{ |
1945 |
> |
long double p = (-1.0 * mapQ) / 10.0; |
1946 |
> |
error_prob = pow(10.0L, p); |
1947 |
> |
} |
1948 |
> |
else |
1949 |
> |
{ |
1950 |
> |
error_prob = 1.0; |
1951 |
> |
} |
1952 |
> |
|
1953 |
> |
if (seq!=NULL) { |
1954 |
> |
char *bseq = (char*)bam1_seq(hit_buf); |
1955 |
> |
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
1956 |
> |
char v = bam1_seqi(bseq,i); |
1957 |
> |
seq[i]=bam_nt16_rev_table[v]; |
1958 |
> |
} |
1959 |
> |
seq[hit_buf->core.l_qseq]=0; |
1960 |
> |
} |
1961 |
> |
if (qual!=NULL) { |
1962 |
> |
char *bq = (char*)bam1_qual(hit_buf); |
1963 |
> |
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
1964 |
> |
qual[i]=bq[i]+33; |
1965 |
> |
} |
1966 |
> |
qual[hit_buf->core.l_qseq]=0; |
1967 |
> |
} |
1968 |
|
|
1354 |
– |
//header->target_name[c->tid] |
1355 |
– |
|
1969 |
|
bool end = true; |
1970 |
|
unsigned int seg_offset = 0; |
1971 |
|
unsigned int seg_num = 0; |
1972 |
|
unsigned int num_segs = 0; |
1973 |
|
// Copy the tag out of the name field before we might wipe it out |
1974 |
< |
char* qname = bam1_qname(hit_buf); |
1975 |
< |
char* pipe = strrchr(qname, '|'); |
1363 |
< |
if (pipe) |
1364 |
< |
{ |
1365 |
< |
if (name_tags) |
1366 |
< |
strcpy(name_tags, pipe); |
1367 |
< |
|
1368 |
< |
char* tag_buf = pipe + 1; |
1369 |
< |
if (strchr(tag_buf, ':')) |
1370 |
< |
{ |
1371 |
< |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
1372 |
< |
if (seg_num + 1 == num_segs) |
1373 |
< |
end = true; |
1374 |
< |
else |
1375 |
< |
end = false; |
1376 |
< |
} |
1974 |
> |
char* name = bam1_qname(hit_buf); |
1975 |
> |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
1976 |
|
|
1977 |
< |
*pipe = 0; |
1977 |
> |
if (target_id < 0) { |
1978 |
> |
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
1979 |
> |
bh = create_hit(name, |
1980 |
> |
"*", //ref_name |
1981 |
> |
0, //left coord |
1982 |
> |
0, //read_len |
1983 |
> |
false, //antisense_aln |
1984 |
> |
0, //edit_dist |
1985 |
> |
end); |
1986 |
> |
return true; |
1987 |
|
} |
1988 |
|
|
1989 |
< |
|
1990 |
< |
if (target_id < 0) { |
1991 |
< |
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
1992 |
< |
bh = create_hit(qname, |
1385 |
< |
"*", //ref_name |
1386 |
< |
0, //left coord |
1387 |
< |
0, //read_len |
1388 |
< |
false, //antisense_aln |
1389 |
< |
0, //edit_dist |
1390 |
< |
end); |
1391 |
< |
return true; |
1392 |
< |
} |
1393 |
< |
|
1394 |
< |
//string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id]; |
1395 |
< |
string text_name = _sam_header->target_name[target_id]; |
1396 |
< |
for (int i = 0; i < hit_buf->core.n_cigar; ++i) |
1397 |
< |
{ |
1398 |
< |
//char* t; |
1399 |
< |
|
1400 |
< |
int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT; |
1401 |
< |
if (length <= 0) |
1402 |
< |
{ |
1403 |
< |
fprintf (stderr, "BAM error: CIGAR op has zero length\n"); |
1404 |
< |
return false; |
1405 |
< |
} |
1406 |
< |
|
1407 |
< |
CigarOpCode opcode; |
1408 |
< |
switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK) |
1409 |
< |
{ |
1410 |
< |
case BAM_CMATCH: opcode = MATCH; break; |
1411 |
< |
case BAM_CINS: opcode = INS; break; |
1412 |
< |
case BAM_CDEL: opcode = DEL; break; |
1413 |
< |
case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break; |
1414 |
< |
case BAM_CHARD_CLIP: opcode = HARD_CLIP; break; |
1415 |
< |
case BAM_CPAD: opcode = PAD; break; |
1416 |
< |
case BAM_CREF_SKIP: |
1417 |
< |
opcode = REF_SKIP; |
1418 |
< |
spliced_alignment = true; |
1419 |
< |
if (length > (int)max_report_intron_length) |
1420 |
< |
{ |
1421 |
< |
//fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n"); |
1422 |
< |
return false; |
1423 |
< |
} |
1424 |
< |
break; |
1425 |
< |
default: |
1426 |
< |
fprintf (stderr, "BAM read: invalid CIGAR operation\n"); |
1427 |
< |
return false; |
1428 |
< |
} |
1429 |
< |
if (opcode != HARD_CLIP) |
1430 |
< |
cigar.push_back(CigarOp(opcode, length)); |
1431 |
< |
} |
1989 |
> |
string text_name = _sam_header->target_name[target_id]; |
1990 |
> |
for (int i = 0; i < hit_buf->core.n_cigar; ++i) |
1991 |
> |
{ |
1992 |
> |
//char* t; |
1993 |
|
|
1994 |
< |
string mrnm; |
1995 |
< |
if (mate_target_id >= 0) { |
1996 |
< |
if (mate_target_id == target_id) { |
1997 |
< |
//mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id]; |
1998 |
< |
mrnm = _sam_header->target_name[mate_target_id]; |
1999 |
< |
} |
2000 |
< |
else { |
2001 |
< |
//fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n"); |
2002 |
< |
return false; |
2003 |
< |
} |
2004 |
< |
} |
2005 |
< |
else { |
2006 |
< |
text_mate_pos = 0; |
2007 |
< |
} |
2008 |
< |
//CuffStrand source_strand = CUFF_STRAND_UNKNOWN; |
2009 |
< |
bool antisense_splice=false; |
2010 |
< |
unsigned char num_mismatches = 0; |
1994 |
> |
int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT; |
1995 |
> |
if (length <= 0) |
1996 |
> |
{ |
1997 |
> |
fprintf (stderr, "BAM error: CIGAR op has zero length\n"); |
1998 |
> |
return false; |
1999 |
> |
} |
2000 |
> |
|
2001 |
> |
CigarOpCode opcode; |
2002 |
> |
switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK) |
2003 |
> |
{ |
2004 |
> |
case BAM_CMATCH: opcode = MATCH; break; |
2005 |
> |
case BAM_CINS: opcode = INS; break; |
2006 |
> |
case BAM_CDEL: opcode = DEL; break; |
2007 |
> |
case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break; |
2008 |
> |
case BAM_CHARD_CLIP: opcode = HARD_CLIP; break; |
2009 |
> |
case BAM_CPAD: opcode = PAD; break; |
2010 |
> |
case BAM_CREF_SKIP: |
2011 |
> |
opcode = REF_SKIP; |
2012 |
> |
spliced_alignment = true; |
2013 |
> |
if (length > (int)max_report_intron_length) |
2014 |
> |
{ |
2015 |
> |
//fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n"); |
2016 |
> |
return false; |
2017 |
> |
} |
2018 |
> |
break; |
2019 |
> |
default: |
2020 |
> |
fprintf (stderr, "BAM read: invalid CIGAR operation\n"); |
2021 |
> |
return false; |
2022 |
> |
} |
2023 |
> |
if (opcode != HARD_CLIP) |
2024 |
> |
samcigar.push_back(CigarOp(opcode, length)); |
2025 |
> |
} |
2026 |
> |
|
2027 |
> |
string mrnm; |
2028 |
> |
if (mate_target_id >= 0) { |
2029 |
> |
if (mate_target_id == target_id) { |
2030 |
> |
mrnm = _sam_header->target_name[mate_target_id]; |
2031 |
> |
} |
2032 |
> |
else { |
2033 |
> |
return false; |
2034 |
> |
} |
2035 |
> |
} |
2036 |
> |
else { |
2037 |
> |
text_mate_pos = 0; |
2038 |
> |
} |
2039 |
> |
|
2040 |
> |
bool antisense_splice = false; |
2041 |
> |
int sam_nm = 0; |
2042 |
> |
vector<bool> mismatches; |
2043 |
> |
mismatches.resize(strlen(seq), false); |
2044 |
> |
int num_mismatches=getBAMmismatches(hit_buf, samcigar, mismatches, sam_nm, antisense_splice); |
2045 |
|
unsigned char num_splice_anchor_mismatches = 0; |
2046 |
+ |
|
2047 |
+ |
//############################################## |
2048 |
+ |
|
2049 |
+ |
// Add this alignment to the table of hits for this half of the |
2050 |
+ |
// Bowtie map |
2051 |
+ |
|
2052 |
+ |
// Parse the text_name field to recover the splice coords |
2053 |
+ |
vector<string> toks; |
2054 |
+ |
tokenize_strict(text_name.c_str(), "|", toks); |
2055 |
+ |
int num_extra_toks = (int)toks.size() - 6; |
2056 |
+ |
if (num_extra_toks >= 0) |
2057 |
+ |
{ |
2058 |
+ |
static const uint8_t left_window_edge_field = 1; |
2059 |
+ |
static const uint8_t splice_field = 2; |
2060 |
+ |
//static const uint8_t right_window_edge_field = 3; |
2061 |
+ |
static const uint8_t junction_type_field = 4; |
2062 |
+ |
static const uint8_t strand_field = 5; |
2063 |
+ |
|
2064 |
+ |
string contig = toks[0]; |
2065 |
+ |
for (int t = 1; t <= num_extra_toks; ++t) |
2066 |
+ |
{ |
2067 |
+ |
contig += "|"; |
2068 |
+ |
contig += toks[t]; |
2069 |
+ |
} |
2070 |
+ |
|
2071 |
+ |
vector<string> splice_toks; |
2072 |
+ |
tokenize(toks[num_extra_toks + splice_field], "-", splice_toks); |
2073 |
+ |
if (splice_toks.size() != 2) |
2074 |
+ |
{ |
2075 |
+ |
fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); |
2076 |
+ |
//fprintf(stderr, "\t%s (token: %s)\n", text_name, |
2077 |
+ |
// toks[num_extra_toks + splice_field].c_str()); |
2078 |
+ |
return false; |
2079 |
+ |
} |
2080 |
|
|
2081 |
< |
uint8_t* ptr = bam_aux_get(hit_buf, "XS"); |
2082 |
< |
if (ptr) { |
2083 |
< |
char src_strand_char = bam_aux2A(ptr); |
2084 |
< |
if (src_strand_char == '-') |
2085 |
< |
antisense_splice = true; |
2086 |
< |
// else if (src_strand_char == '+') |
2087 |
< |
// source_strand = CUFF_FWD; |
2088 |
< |
} |
2089 |
< |
|
2090 |
< |
ptr = bam_aux_get(hit_buf, "NM"); |
2091 |
< |
if (ptr) { |
2092 |
< |
num_mismatches = bam_aux2i(ptr); |
2093 |
< |
} |
2081 |
> |
const string& junction_type = toks[num_extra_toks + junction_type_field]; |
2082 |
> |
const string junction_strand = toks[num_extra_toks + strand_field]; |
2083 |
> |
|
2084 |
> |
// |
2085 |
> |
// check for an insertion hit |
2086 |
> |
// |
2087 |
> |
if(junction_type == "ins") |
2088 |
> |
{ |
2089 |
> |
//int8_t spliced_read_len = strlen(seq_str); |
2090 |
> |
//TODO FIXME: use the CIGAR instead of seq length! |
2091 |
> |
// The 0-based position of the left edge of the alignment. Note that this |
2092 |
> |
// value may need to be further corrected to account for the presence of |
2093 |
> |
// of the insertion. |
2094 |
> |
int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
2095 |
> |
|
2096 |
> |
// The 0-based position of the last genomic sequence before the insertion |
2097 |
> |
int left_splice_pos = atoi(splice_toks[0].c_str()); |
2098 |
> |
|
2099 |
> |
string insertedSequence = splice_toks[1]; |
2100 |
> |
// The 0-based position of the first genomic sequence after the insertion |
2101 |
> |
|
2102 |
> |
vector<CigarOp> splcigar; |
2103 |
> |
//this also updates left to the adjusted genomic coordinates |
2104 |
> |
int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, |
2105 |
> |
left, left_splice_pos+1, insertedSequence.length(), INS); |
2106 |
> |
|
2107 |
> |
if (spl_num_mismatches<0) return false; |
2108 |
> |
num_mismatches-=spl_num_mismatches; |
2109 |
> |
bh = create_hit(name, |
2110 |
> |
contig, |
2111 |
> |
"", |
2112 |
> |
left, |
2113 |
> |
splcigar, |
2114 |
> |
sam_flag & 0x0010, |
2115 |
> |
junction_strand == "rev", |
2116 |
> |
num_mismatches, |
2117 |
> |
0, |
2118 |
> |
end); |
2119 |
> |
|
2120 |
> |
return true; |
2121 |
> |
} //"ins" |
2122 |
> |
else //"del", "intron", or "fusion" |
2123 |
> |
{ |
2124 |
> |
char orientation = (sam_flag & 0x0010 ? '-' : '+'); |
2125 |
> |
if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")|| |
2126 |
> |
!(orientation == '-' || orientation == '+')) |
2127 |
> |
{ |
2128 |
> |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
2129 |
> |
return false; |
2130 |
> |
} |
2131 |
> |
|
2132 |
> |
// The 0-based position of the left edge of the alignment. |
2133 |
> |
int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
2134 |
> |
if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr")) |
2135 |
> |
left += text_offset; |
2136 |
> |
else |
2137 |
> |
left -= text_offset; |
2138 |
> |
|
2139 |
> |
vector<CigarOp> splcigar; |
2140 |
> |
CigarOpCode opcode; |
2141 |
> |
if(junction_type == "del") |
2142 |
> |
opcode = DEL; |
2143 |
> |
else if(junction_type == "fus") |
2144 |
> |
{ |
2145 |
> |
if (junction_strand == "ff") |
2146 |
> |
opcode = FUSION_FF; |
2147 |
> |
else if (junction_strand == "fr") |
2148 |
> |
opcode = FUSION_FR; |
2149 |
> |
else if (junction_strand == "rf") |
2150 |
> |
opcode = FUSION_RF; |
2151 |
> |
else |
2152 |
> |
opcode = FUSION_RR; |
2153 |
> |
} |
2154 |
> |
else |
2155 |
> |
opcode = REF_SKIP; |
2156 |
|
|
2157 |
< |
ptr = bam_aux_get(hit_buf, "NH"); |
1467 |
< |
if (ptr) { |
1468 |
< |
num_hits = bam_aux2i(ptr); |
1469 |
< |
} |
1470 |
< |
|
1471 |
< |
//bool antisense_aln = bam1_strand(hit_buf); |
1472 |
< |
|
1473 |
< |
//if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN) |
1474 |
< |
// source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping()); |
1475 |
< |
if (spliced_alignment) { |
1476 |
< |
//if (source_strand == CUFF_STRAND_UNKNOWN) { |
1477 |
< |
// fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n"); |
1478 |
< |
// } |
1479 |
< |
bh = create_hit(qname, |
1480 |
< |
text_name, |
1481 |
< |
text_offset, // BAM files are 0-indexed |
1482 |
< |
cigar, |
1483 |
< |
sam_flag & 0x0010, |
1484 |
< |
antisense_splice, |
1485 |
< |
num_mismatches, |
1486 |
< |
num_splice_anchor_mismatches, |
1487 |
< |
end); |
2157 |
> |
int left_splice_pos = atoi(splice_toks[0].c_str()); |
2158 |
|
|
2159 |
< |
} |
2160 |
< |
else { |
2161 |
< |
//assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN); |
2162 |
< |
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
2163 |
< |
bh = create_hit(qname, |
2164 |
< |
text_name, |
2165 |
< |
text_offset, // BAM files are 0-indexed |
2166 |
< |
cigar, |
2167 |
< |
sam_flag & 0x0010, |
2168 |
< |
false, |
2169 |
< |
num_mismatches, |
2170 |
< |
0, |
2171 |
< |
end); |
2172 |
< |
} |
2173 |
< |
if (seq!=NULL) { |
2174 |
< |
char *bseq = (char*)bam1_seq(hit_buf); |
2175 |
< |
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
2176 |
< |
char v = bam1_seqi(bseq,i); |
2177 |
< |
seq[i]=bam_nt16_rev_table[v]; |
2178 |
< |
} |
2179 |
< |
seq[hit_buf->core.l_qseq]=0; |
2180 |
< |
} |
2181 |
< |
if (qual!=NULL) { |
2182 |
< |
char *bq = (char*)bam1_qual(hit_buf); |
2183 |
< |
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
2184 |
< |
qual[i]=bq[i]+33; |
2185 |
< |
} |
2186 |
< |
qual[hit_buf->core.l_qseq]=0; |
2187 |
< |
} |
2188 |
< |
return true; |
2189 |
< |
} |
2159 |
> |
// The 0-based position of the last genomic sequence before the deletion |
2160 |
> |
int gap_len = 0; |
2161 |
> |
if (junction_type == "fus") |
2162 |
> |
gap_len = atoi(splice_toks[1].c_str()); |
2163 |
> |
else |
2164 |
> |
gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1; |
2165 |
> |
|
2166 |
> |
if (opcode == FUSION_RF || opcode == FUSION_RR) |
2167 |
> |
left_splice_pos -= 1; |
2168 |
> |
else |
2169 |
> |
left_splice_pos += 1; |
2170 |
> |
|
2171 |
> |
int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left, |
2172 |
> |
left_splice_pos, gap_len, opcode); |
2173 |
> |
|
2174 |
> |
if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches) |
2175 |
> |
return false; |
2176 |
> |
|
2177 |
> |
string contig2 = ""; |
2178 |
> |
if (junction_type == "fus") |
2179 |
> |
{ |
2180 |
> |
vector<string> contigs; |
2181 |
> |
tokenize(contig, "-", contigs); |
2182 |
> |
if (contigs.size() != 2) |
2183 |
> |
return false; |
2184 |
> |
|
2185 |
> |
contig = contigs[0]; |
2186 |
> |
contig2 = contigs[1]; |
2187 |
> |
|
2188 |
> |
if (junction_strand == "rf" || junction_strand == "rr") |
2189 |
> |
orientation = (orientation == '+' ? '-' : '+'); |
2190 |
> |
} |
2191 |
> |
|
2192 |
> |
bh = create_hit(name, |
2193 |
> |
contig, |
2194 |
> |
contig2, |
2195 |
> |
left, |
2196 |
> |
splcigar, |
2197 |
> |
orientation == '-', |
2198 |
> |
junction_strand == "rev", |
2199 |
> |
num_mismatches, |
2200 |
> |
spl_num_mismatches, |
2201 |
> |
end); |
2202 |
|
|
2203 |
< |
bool BAMHitFactory::inspect_header(HitStream& hs) |
2204 |
< |
{ |
2205 |
< |
bam_header_t* header = ((samfile_t*)(hs._hit_file))->header; |
2206 |
< |
|
2207 |
< |
if (header == NULL) { |
2208 |
< |
fprintf(stderr, "Warning: No BAM header\n"); |
2209 |
< |
return false; |
2210 |
< |
} |
2211 |
< |
if (header->l_text == 0) { |
1530 |
< |
fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n"); |
1531 |
< |
return false; |
2203 |
> |
return true; |
2204 |
> |
} |
2205 |
> |
} //parse splice data |
2206 |
> |
else |
2207 |
> |
{ |
2208 |
> |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
2209 |
> |
//fprintf(stderr, "%s\n", orig_bwt_buf); |
2210 |
> |
// continue; |
2211 |
> |
return false; |
2212 |
|
} |
2213 |
< |
return true; |
2213 |
> |
|
2214 |
> |
return false; |
2215 |
|
} |
2216 |
|
|
1536 |
– |
|
2217 |
|
void get_mapped_reads(FILE* bwtf, |
2218 |
|
HitTable& hits, |
2219 |
|
HitFactory& hit_factory, |
2316 |
|
case MATCH: |
2317 |
|
for (size_t m = 0; m < cigar[c].length; ++m) |
2318 |
|
{ |
2319 |
< |
if (DoC[j + m] < 0xFFFFFFFF) |
2319 |
> |
if (DoC[j + m] < VMAXINT32) |
2320 |
|
DoC[j + m]++; |
2321 |
|
} |
2322 |
|
//fall through this case to REF_SKIP is intentional |
2334 |
|
const char* read_name, |
2335 |
|
const BowtieHit& bh, |
2336 |
|
const char* ref_name, |
2337 |
+ |
const char* ref_name2, |
2338 |
|
const char* sequence, |
2339 |
< |
const char* qualities, |
1659 |
< |
bool from_bowtie) |
2339 |
> |
const char* qualities) |
2340 |
|
{ |
2341 |
< |
string seq; |
2342 |
< |
string quals; |
2343 |
< |
if (sequence) |
2344 |
< |
{ |
2345 |
< |
seq = sequence; |
2346 |
< |
quals = qualities; |
2347 |
< |
seq.resize(bh.read_len()); |
2348 |
< |
quals.resize(bh.read_len()); |
2349 |
< |
} |
2350 |
< |
else |
2351 |
< |
{ |
2352 |
< |
seq = "*"; |
2353 |
< |
} |
2354 |
< |
|
2355 |
< |
if (qualities) |
2356 |
< |
{ |
2357 |
< |
quals = qualities; |
2358 |
< |
quals.resize(bh.read_len()); |
2359 |
< |
} |
2360 |
< |
else |
2361 |
< |
{ |
2362 |
< |
quals = "*"; |
2363 |
< |
} |
2341 |
> |
string seq; |
2342 |
> |
string quals; |
2343 |
> |
if (sequence) |
2344 |
> |
{ |
2345 |
> |
seq = sequence; |
2346 |
> |
quals = qualities; |
2347 |
> |
seq.resize(bh.read_len()); |
2348 |
> |
quals.resize(bh.read_len()); |
2349 |
> |
} |
2350 |
> |
else |
2351 |
> |
{ |
2352 |
> |
seq = "*"; |
2353 |
> |
} |
2354 |
> |
|
2355 |
> |
if (qualities) |
2356 |
> |
{ |
2357 |
> |
quals = qualities; |
2358 |
> |
quals.resize(bh.read_len()); |
2359 |
> |
} |
2360 |
> |
else |
2361 |
> |
{ |
2362 |
> |
quals = "*"; |
2363 |
> |
} |
2364 |
> |
|
2365 |
> |
uint32_t sam_flag = 0; |
2366 |
> |
if (bh.antisense_align()) |
2367 |
> |
{ |
2368 |
> |
sam_flag |= 0x0010; // BAM_FREVERSE |
2369 |
> |
} |
2370 |
> |
|
2371 |
> |
int left = bh.left(); |
2372 |
> |
|
2373 |
> |
uint32_t map_quality = 255; |
2374 |
> |
char cigar[256] = {0}; |
2375 |
> |
string mate_ref_name = "*"; |
2376 |
> |
uint32_t mate_pos = 0; |
2377 |
> |
uint32_t insert_size = 0; |
2378 |
> |
|
2379 |
> |
const vector<CigarOp>& bh_cigar = bh.cigar(); |
2380 |
> |
|
2381 |
> |
/* |
2382 |
> |
* In addition to calculating the cigar string, |
2383 |
> |
* we need to figure out how many in/dels are in the |
2384 |
> |
* sequence, so that we can give the correct |
2385 |
> |
* value for the NM tag |
2386 |
> |
*/ |
2387 |
> |
int indel_distance = 0; |
2388 |
> |
CigarOpCode fusion_dir = FUSION_NOTHING; |
2389 |
> |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
2390 |
> |
{ |
2391 |
> |
const CigarOp& op = bh_cigar[c]; |
2392 |
|
|
2393 |
< |
uint32_t sam_flag = 0; |
2394 |
< |
if (bh.antisense_align()) |
2393 |
> |
char ibuf[64]; |
2394 |
> |
sprintf(ibuf, "%d", op.length); |
2395 |
> |
switch(op.opcode) |
2396 |
|
{ |
2397 |
< |
sam_flag |= 0x0010; // BAM_FREVERSE |
2398 |
< |
if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed. |
2399 |
< |
{ |
2400 |
< |
reverse_complement(seq); |
2401 |
< |
reverse(quals.begin(), quals.end()); |
2402 |
< |
} |
2397 |
> |
case MATCH: |
2398 |
> |
case mATCH: |
2399 |
> |
strcat(cigar, ibuf); |
2400 |
> |
if (bh_cigar[c].opcode == MATCH) |
2401 |
> |
strcat(cigar, "M"); |
2402 |
> |
else |
2403 |
> |
strcat(cigar, "m"); |
2404 |
> |
break; |
2405 |
> |
case INS: |
2406 |
> |
case iNS: |
2407 |
> |
strcat(cigar, ibuf); |
2408 |
> |
if (bh_cigar[c].opcode == INS) |
2409 |
> |
strcat(cigar, "I"); |
2410 |
> |
else |
2411 |
> |
strcat(cigar, "i"); |
2412 |
> |
indel_distance += bh_cigar[c].length; |
2413 |
> |
break; |
2414 |
> |
case DEL: |
2415 |
> |
case dEL: |
2416 |
> |
strcat(cigar, ibuf); |
2417 |
> |
if (bh_cigar[c].opcode == DEL) |
2418 |
> |
strcat(cigar, "D"); |
2419 |
> |
else |
2420 |
> |
strcat(cigar, "d"); |
2421 |
> |
indel_distance += bh_cigar[c].length; |
2422 |
> |
break; |
2423 |
> |
case REF_SKIP: |
2424 |
> |
case rEF_SKIP: |
2425 |
> |
strcat(cigar, ibuf); |
2426 |
> |
if (bh_cigar[c].opcode == REF_SKIP) |
2427 |
> |
strcat(cigar, "N"); |
2428 |
> |
else |
2429 |
> |
strcat(cigar, "n"); |
2430 |
> |
break; |
2431 |
> |
case FUSION_FF: |
2432 |
> |
case FUSION_FR: |
2433 |
> |
case FUSION_RF: |
2434 |
> |
case FUSION_RR: |
2435 |
> |
fusion_dir = op.opcode; |
2436 |
> |
sprintf(ibuf, "%d", bh_cigar[c].length + 1); |
2437 |
> |
strcat(cigar, ibuf); |
2438 |
> |
strcat(cigar, "F"); |
2439 |
> |
break; |
2440 |
> |
default: |
2441 |
> |
break; |
2442 |
|
} |
2443 |
< |
|
1696 |
< |
uint32_t sam_pos = bh.left() + 1; |
1697 |
< |
uint32_t map_quality = 255; |
1698 |
< |
char cigar[256]; |
1699 |
< |
cigar[0] = 0; |
1700 |
< |
string mate_ref_name = "*"; |
1701 |
< |
uint32_t mate_pos = 0; |
1702 |
< |
uint32_t insert_size = 0; |
1703 |
< |
//string qualities = "*"; |
1704 |
< |
|
1705 |
< |
const vector<CigarOp>& bh_cigar = bh.cigar(); |
2443 |
> |
} |
2444 |
|
|
2445 |
< |
/* |
2446 |
< |
* In addition to calculating the cigar string, |
2447 |
< |
* we need to figure out how many in/dels are in the |
2448 |
< |
* sequence, so that we can give the correct |
2449 |
< |
* value for the NM tag |
2450 |
< |
*/ |
2451 |
< |
int indel_distance = 0; |
2452 |
< |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
2445 |
> |
char cigar1[256] = {0}, cigar2[256] = {0}; |
2446 |
> |
string left_seq, right_seq, left_qual, right_qual; |
2447 |
> |
int left1 = -1, left2 = -1; |
2448 |
> |
extract_partial_hits(bh, seq, quals, |
2449 |
> |
cigar1, cigar2, left_seq, right_seq, |
2450 |
> |
left_qual, right_qual, left1, left2); |
2451 |
> |
|
2452 |
> |
|
2453 |
> |
bool containsSplice = false; |
2454 |
> |
for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr) |
2455 |
> |
{ |
2456 |
> |
if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP) |
2457 |
|
{ |
2458 |
< |
char ibuf[64]; |
2459 |
< |
sprintf(ibuf, "%d", bh_cigar[c].length); |
1718 |
< |
switch(bh_cigar[c].opcode) |
1719 |
< |
{ |
1720 |
< |
case MATCH: |
1721 |
< |
strcat(cigar, ibuf); |
1722 |
< |
strcat(cigar, "M"); |
1723 |
< |
break; |
1724 |
< |
case INS: |
1725 |
< |
strcat(cigar, ibuf); |
1726 |
< |
strcat(cigar, "I"); |
1727 |
< |
indel_distance += bh_cigar[c].length; |
1728 |
< |
break; |
1729 |
< |
case DEL: |
1730 |
< |
strcat(cigar, ibuf); |
1731 |
< |
strcat(cigar, "D"); |
1732 |
< |
indel_distance += bh_cigar[c].length; |
1733 |
< |
break; |
1734 |
< |
case REF_SKIP: |
1735 |
< |
strcat(cigar, ibuf); |
1736 |
< |
strcat(cigar, "N"); |
1737 |
< |
break; |
1738 |
< |
default: |
1739 |
< |
break; |
1740 |
< |
} |
2458 |
> |
containsSplice = true; |
2459 |
> |
break; |
2460 |
|
} |
2461 |
< |
|
2462 |
< |
//string q = string(bh.read_len(), '!'); |
2463 |
< |
//string s = string(bh.read_len(), 'N'); |
2464 |
< |
|
2465 |
< |
fprintf(fout, |
2466 |
< |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", |
2467 |
< |
read_name, |
2468 |
< |
sam_flag, |
2469 |
< |
ref_name, |
2470 |
< |
sam_pos, |
2471 |
< |
map_quality, |
2472 |
< |
cigar, |
2473 |
< |
mate_ref_name.c_str(), |
2474 |
< |
mate_pos, |
2475 |
< |
insert_size, |
2476 |
< |
seq.c_str(), |
2477 |
< |
quals.c_str()); |
2478 |
< |
|
2479 |
< |
if (!sam_readgroup_id.empty()) |
2480 |
< |
fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str()); |
2481 |
< |
|
2482 |
< |
fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance); |
2483 |
< |
|
2484 |
< |
bool containsSplice = false; |
2485 |
< |
for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){ |
2486 |
< |
if(itr->opcode == REF_SKIP){ |
2487 |
< |
containsSplice = true; |
2488 |
< |
break; |
2489 |
< |
} |
2461 |
> |
} |
2462 |
> |
|
2463 |
> |
if (fusion_dir != FUSION_NOTHING) |
2464 |
> |
{ |
2465 |
> |
fprintf(fout, |
2466 |
> |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t", |
2467 |
> |
read_name, |
2468 |
> |
sam_flag, |
2469 |
> |
ref_name, |
2470 |
> |
left1 + 1, |
2471 |
> |
map_quality, |
2472 |
> |
cigar1, |
2473 |
> |
mate_ref_name.c_str(), |
2474 |
> |
mate_pos, |
2475 |
> |
insert_size, |
2476 |
> |
left_seq.c_str(), |
2477 |
> |
left_qual.c_str(), |
2478 |
> |
bh.edit_dist() + indel_distance); |
2479 |
> |
|
2480 |
> |
if (containsSplice) |
2481 |
> |
{ |
2482 |
> |
fprintf(fout, |
2483 |
> |
"XS:A:%c\t", |
2484 |
> |
bh.antisense_splice() ? '-' : '+'); |
2485 |
> |
} |
2486 |
> |
|
2487 |
> |
fprintf(fout, |
2488 |
> |
"FR:Z:1 %s-%s %d %s %s %s\n", |
2489 |
> |
ref_name, |
2490 |
> |
ref_name2, |
2491 |
> |
left + 1, |
2492 |
> |
cigar, |
2493 |
> |
seq.c_str(), |
2494 |
> |
quals.c_str()); |
2495 |
> |
|
2496 |
> |
fprintf(fout, |
2497 |
> |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t", |
2498 |
> |
read_name, |
2499 |
> |
sam_flag, |
2500 |
> |
ref_name2, |
2501 |
> |
left2 + 1, |
2502 |
> |
map_quality, |
2503 |
> |
cigar2, |
2504 |
> |
mate_ref_name.c_str(), |
2505 |
> |
mate_pos, |
2506 |
> |
insert_size, |
2507 |
> |
right_seq.c_str(), |
2508 |
> |
right_qual.c_str(), |
2509 |
> |
bh.edit_dist() + indel_distance); |
2510 |
> |
|
2511 |
> |
if (containsSplice) |
2512 |
> |
{ |
2513 |
> |
fprintf(fout, |
2514 |
> |
"XS:A:%c\t", |
2515 |
> |
bh.antisense_splice() ? '-' : '+'); |
2516 |
> |
} |
2517 |
> |
|
2518 |
> |
fprintf(fout, |
2519 |
> |
"FR:Z:2 %s-%s %d %s %s %s\n", |
2520 |
> |
ref_name, |
2521 |
> |
ref_name2, |
2522 |
> |
left + 1, |
2523 |
> |
cigar, |
2524 |
> |
seq.c_str(), |
2525 |
> |
quals.c_str()); |
2526 |
> |
} |
2527 |
> |
else |
2528 |
> |
{ |
2529 |
> |
fprintf(fout, |
2530 |
> |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\tNM:i:%d\t", |
2531 |
> |
read_name, |
2532 |
> |
sam_flag, |
2533 |
> |
ref_name, |
2534 |
> |
left + 1, |
2535 |
> |
map_quality, |
2536 |
> |
cigar, |
2537 |
> |
mate_ref_name.c_str(), |
2538 |
> |
mate_pos, |
2539 |
> |
insert_size, |
2540 |
> |
seq.c_str(), |
2541 |
> |
quals.c_str(), |
2542 |
> |
bh.edit_dist() + indel_distance); |
2543 |
> |
|
2544 |
> |
if (containsSplice) |
2545 |
> |
{ |
2546 |
> |
fprintf(fout, |
2547 |
> |
"XS:A:%c\t", |
2548 |
> |
bh.antisense_splice() ? '-' : '+'); |
2549 |
|
} |
2550 |
|
|
2551 |
< |
if (containsSplice) |
2552 |
< |
fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+'); |
1775 |
< |
|
1776 |
< |
fprintf(fout, "\n"); |
2551 |
> |
fprintf(fout, "\n"); |
2552 |
> |
} |
2553 |
|
} |
2554 |
|
|
2555 |
|
void print_bamhit(GBamWriter& wbam, |
2556 |
< |
const char* read_name, |
2557 |
< |
const BowtieHit& bh, |
2558 |
< |
const char* ref_name, |
2559 |
< |
const char* sequence, |
2560 |
< |
const char* qualities, |
2561 |
< |
bool from_bowtie) |
2562 |
< |
{ |
2563 |
< |
string seq; |
2564 |
< |
string quals; |
2565 |
< |
if (sequence) { |
2566 |
< |
seq = sequence; |
2567 |
< |
quals = qualities; |
2568 |
< |
seq.resize(bh.read_len()); |
2569 |
< |
quals.resize(bh.read_len()); |
2570 |
< |
} |
2571 |
< |
else { |
2572 |
< |
seq = "*"; |
2573 |
< |
} |
2574 |
< |
if (qualities) { |
2575 |
< |
quals = qualities; |
2576 |
< |
quals.resize(bh.read_len()); |
2577 |
< |
} |
2578 |
< |
else |
2556 |
> |
const char* read_name, |
2557 |
> |
const BowtieHit& bh, |
2558 |
> |
const char* ref_name, |
2559 |
> |
const char* ref_name2, |
2560 |
> |
const char* sequence, |
2561 |
> |
const char* qualities, |
2562 |
> |
bool from_bowtie) |
2563 |
> |
{ |
2564 |
> |
string seq; |
2565 |
> |
string quals; |
2566 |
> |
if (sequence) { |
2567 |
> |
seq = sequence; |
2568 |
> |
quals = qualities; |
2569 |
> |
seq.resize(bh.read_len()); |
2570 |
> |
quals.resize(bh.read_len()); |
2571 |
> |
} |
2572 |
> |
else { |
2573 |
> |
seq = "*"; |
2574 |
> |
} |
2575 |
> |
if (qualities) { |
2576 |
> |
quals = qualities; |
2577 |
> |
quals.resize(bh.read_len()); |
2578 |
> |
} |
2579 |
> |
else { |
2580 |
> |
quals = "*"; |
2581 |
> |
} |
2582 |
> |
|
2583 |
> |
uint32_t sam_flag = 0; |
2584 |
> |
if (bh.antisense_align()) |
2585 |
|
{ |
2586 |
< |
quals = "*"; |
2586 |
> |
sam_flag |= 0x0010; // BAM_FREVERSE |
2587 |
> |
if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed. |
2588 |
> |
{ |
2589 |
> |
reverse_complement(seq); |
2590 |
> |
reverse(quals.begin(), quals.end()); |
2591 |
> |
} |
2592 |
|
} |
2593 |
< |
|
2594 |
< |
uint32_t sam_flag = 0; |
2595 |
< |
if (bh.antisense_align()) |
2593 |
> |
|
2594 |
> |
uint32_t sam_pos = bh.left() + 1; |
2595 |
> |
uint32_t map_quality = 255; |
2596 |
> |
char cigar[256]; |
2597 |
> |
cigar[0] = 0; |
2598 |
> |
string mate_ref_name = "*"; |
2599 |
> |
uint32_t mate_pos = 0; |
2600 |
> |
uint32_t insert_size = 0; |
2601 |
> |
//string qualities = "*"; |
2602 |
> |
|
2603 |
> |
const vector<CigarOp>& bh_cigar = bh.cigar(); |
2604 |
> |
/* |
2605 |
> |
* In addition to calculating the cigar string, |
2606 |
> |
* we need to figure out how many in/dels are in the |
2607 |
> |
* sequence, so that we can give the correct |
2608 |
> |
* value for the NM tag |
2609 |
> |
*/ |
2610 |
> |
int indel_distance = 0; |
2611 |
> |
CigarOpCode fusion_dir = FUSION_NOTHING; |
2612 |
> |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
2613 |
|
{ |
2614 |
< |
sam_flag |= 0x0010; // BAM_FREVERSE |
1811 |
< |
if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed. |
1812 |
< |
{ |
1813 |
< |
reverse_complement(seq); |
1814 |
< |
reverse(quals.begin(), quals.end()); |
1815 |
< |
} |
1816 |
< |
} |
2614 |
> |
const CigarOp& op = bh_cigar[c]; |
2615 |
|
|
2616 |
< |
uint32_t sam_pos = bh.left() + 1; |
2617 |
< |
uint32_t map_quality = 255; |
2618 |
< |
char cigar[256]; |
2619 |
< |
cigar[0] = 0; |
2620 |
< |
string mate_ref_name = "*"; |
2621 |
< |
uint32_t mate_pos = 0; |
2622 |
< |
uint32_t insert_size = 0; |
2623 |
< |
//string qualities = "*"; |
2624 |
< |
|
2625 |
< |
const vector<CigarOp>& bh_cigar = bh.cigar(); |
2626 |
< |
/* |
2627 |
< |
* In addition to calculating the cigar string, |
2628 |
< |
* we need to figure out how many in/dels are in the |
2629 |
< |
* sequence, so that we can give the correct |
2630 |
< |
* value for the NM tag |
2631 |
< |
*/ |
2632 |
< |
int indel_distance = 0; |
2633 |
< |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
2634 |
< |
{ |
2635 |
< |
char ibuf[64]; |
2636 |
< |
sprintf(ibuf, "%d", bh_cigar[c].length); |
2637 |
< |
switch(bh_cigar[c].opcode) |
2638 |
< |
{ |
2639 |
< |
case MATCH: |
2640 |
< |
strcat(cigar, ibuf); |
2641 |
< |
strcat(cigar, "M"); |
2642 |
< |
break; |
2643 |
< |
case INS: |
2644 |
< |
strcat(cigar, ibuf); |
2645 |
< |
strcat(cigar, "I"); |
2646 |
< |
indel_distance += bh_cigar[c].length; |
2647 |
< |
break; |
2648 |
< |
case DEL: |
2649 |
< |
strcat(cigar, ibuf); |
2650 |
< |
strcat(cigar, "D"); |
2651 |
< |
indel_distance += bh_cigar[c].length; |
2652 |
< |
break; |
2653 |
< |
case REF_SKIP: |
2654 |
< |
strcat(cigar, ibuf); |
2655 |
< |
strcat(cigar, "N"); |
2656 |
< |
break; |
2657 |
< |
default: |
2658 |
< |
break; |
2659 |
< |
} |
2616 |
> |
char ibuf[64]; |
2617 |
> |
sprintf(ibuf, "%d", op.length); |
2618 |
> |
switch(op.opcode) |
2619 |
> |
{ |
2620 |
> |
case MATCH: |
2621 |
> |
case mATCH: |
2622 |
> |
strcat(cigar, ibuf); |
2623 |
> |
if (bh_cigar[c].opcode == MATCH) |
2624 |
> |
strcat(cigar, "M"); |
2625 |
> |
else |
2626 |
> |
strcat(cigar, "m"); |
2627 |
> |
break; |
2628 |
> |
case INS: |
2629 |
> |
case iNS: |
2630 |
> |
strcat(cigar, ibuf); |
2631 |
> |
if (bh_cigar[c].opcode == INS) |
2632 |
> |
strcat(cigar, "I"); |
2633 |
> |
else |
2634 |
> |
strcat(cigar, "i"); |
2635 |
> |
indel_distance += bh_cigar[c].length; |
2636 |
> |
break; |
2637 |
> |
case DEL: |
2638 |
> |
case dEL: |
2639 |
> |
strcat(cigar, ibuf); |
2640 |
> |
if (bh_cigar[c].opcode == DEL) |
2641 |
> |
strcat(cigar, "D"); |
2642 |
> |
else |
2643 |
> |
strcat(cigar, "d"); |
2644 |
> |
indel_distance += bh_cigar[c].length; |
2645 |
> |
break; |
2646 |
> |
case REF_SKIP: |
2647 |
> |
case rEF_SKIP: |
2648 |
> |
strcat(cigar, ibuf); |
2649 |
> |
if (bh_cigar[c].opcode == REF_SKIP) |
2650 |
> |
strcat(cigar, "N"); |
2651 |
> |
else |
2652 |
> |
strcat(cigar, "n"); |
2653 |
> |
break; |
2654 |
> |
case FUSION_FF: |
2655 |
> |
case FUSION_FR: |
2656 |
> |
case FUSION_RF: |
2657 |
> |
case FUSION_RR: |
2658 |
> |
fusion_dir = op.opcode; |
2659 |
> |
sprintf(ibuf, "%d", bh_cigar[c].length + 1); |
2660 |
> |
strcat(cigar, ibuf); |
2661 |
> |
strcat(cigar, "F"); |
2662 |
> |
break; |
2663 |
> |
default: |
2664 |
> |
break; |
2665 |
> |
} |
2666 |
|
} |
2667 |
|
|
2668 |
< |
//string q = string(bh.read_len(), '!'); |
2669 |
< |
//string s = string(bh.read_len(), 'N'); |
2670 |
< |
/* |
2671 |
< |
fprintf(fout, |
2672 |
< |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", |
2673 |
< |
read_name, |
2674 |
< |
sam_flag, |
2675 |
< |
ref_name, |
2676 |
< |
sam_pos, |
2677 |
< |
map_quality, |
2678 |
< |
cigar, |
2679 |
< |
mate_ref_name.c_str(), |
2680 |
< |
mate_pos, |
2681 |
< |
insert_size, |
2682 |
< |
seq.c_str(), |
2683 |
< |
quals.c_str()); |
2668 |
> |
char cigar1[256] = {0}, cigar2[256] = {0}; |
2669 |
> |
string left_seq, right_seq, left_qual, right_qual; |
2670 |
> |
int left1 = -1, left2 = -1; |
2671 |
> |
extract_partial_hits(bh, seq, quals, |
2672 |
> |
cigar1, cigar2, left_seq, right_seq, |
2673 |
> |
left_qual, right_qual, left1, left2); |
2674 |
> |
|
2675 |
> |
bool containsSplice = false; |
2676 |
> |
for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr) |
2677 |
> |
{ |
2678 |
> |
if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP) |
2679 |
> |
{ |
2680 |
> |
containsSplice = true; |
2681 |
> |
break; |
2682 |
> |
} |
2683 |
> |
} |
2684 |
|
|
2685 |
< |
fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance); |
2686 |
< |
*/ |
1883 |
< |
vector<string> auxdata; |
1884 |
< |
|
1885 |
< |
if (!sam_readgroup_id.empty()) |
2685 |
> |
vector<string> auxdata; |
2686 |
> |
if (!sam_readgroup_id.empty()) |
2687 |
|
{ |
2688 |
< |
string nm("RG:Z:"); |
2689 |
< |
nm += sam_readgroup_id; |
2690 |
< |
auxdata.push_back(nm); |
2688 |
> |
string nm("RG:Z:"); |
2689 |
> |
nm += sam_readgroup_id; |
2690 |
> |
auxdata.push_back(nm); |
2691 |
|
} |
2692 |
< |
|
2693 |
< |
string nm("NM:i:"); |
2694 |
< |
str_appendInt(nm, bh.edit_dist() + indel_distance); |
2692 |
> |
|
2693 |
> |
string nm("NM:i:"); |
2694 |
> |
str_appendInt(nm, bh.edit_dist() + indel_distance); |
2695 |
> |
auxdata.push_back(nm); |
2696 |
> |
|
2697 |
> |
if (containsSplice) { |
2698 |
> |
nm="XS:A:"; |
2699 |
> |
nm+=(char)(bh.antisense_splice() ? '-' : '+'); |
2700 |
|
auxdata.push_back(nm); |
2701 |
< |
bool containsSplice = false; |
2702 |
< |
for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr) |
2703 |
< |
if(itr->opcode == REF_SKIP){ |
2704 |
< |
containsSplice = true; |
2705 |
< |
break; |
2706 |
< |
} |
2707 |
< |
|
2708 |
< |
if (containsSplice) { |
2709 |
< |
//fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+'); |
2710 |
< |
nm="XS:A:"; |
2711 |
< |
nm+=(char)(bh.antisense_splice() ? '-' : '+'); |
2712 |
< |
auxdata.push_back(nm); |
2713 |
< |
} |
2714 |
< |
|
2715 |
< |
GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality, |
2716 |
< |
cigar, mate_ref_name.c_str(), mate_pos, |
2717 |
< |
insert_size, seq.c_str(), quals.c_str(), &auxdata); |
2718 |
< |
|
2719 |
< |
|
2720 |
< |
|
2721 |
< |
wbam.write(brec); |
2722 |
< |
delete brec; |
2701 |
> |
} |
2702 |
> |
|
2703 |
> |
if (fusion_dir != FUSION_NOTHING) |
2704 |
> |
{ |
2705 |
> |
char XF[2048] = {0}; |
2706 |
> |
sprintf(XF, |
2707 |
> |
"XF:Z:1 %s-%s %u %s %s %s", |
2708 |
> |
ref_name, |
2709 |
> |
ref_name2, |
2710 |
> |
sam_pos, |
2711 |
> |
cigar, |
2712 |
> |
seq.c_str(), |
2713 |
> |
quals.c_str()); |
2714 |
> |
auxdata.push_back(XF); |
2715 |
> |
|
2716 |
> |
GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, left1 + 1, map_quality, |
2717 |
> |
cigar1, mate_ref_name.c_str(), mate_pos, |
2718 |
> |
insert_size, left_seq.c_str(), left_qual.c_str(), &auxdata); |
2719 |
> |
|
2720 |
> |
wbam.write(brec); |
2721 |
> |
delete brec; |
2722 |
> |
|
2723 |
> |
sprintf(XF, |
2724 |
> |
"XF:Z:2 %s-%s %u %s %s %s", |
2725 |
> |
ref_name, |
2726 |
> |
ref_name2, |
2727 |
> |
sam_pos, |
2728 |
> |
cigar, |
2729 |
> |
seq.c_str(), |
2730 |
> |
quals.c_str()); |
2731 |
> |
auxdata.back() = XF; |
2732 |
> |
|
2733 |
> |
brec = wbam.new_record(read_name, sam_flag, ref_name, left2 + 1, map_quality, |
2734 |
> |
cigar2, mate_ref_name.c_str(), mate_pos, |
2735 |
> |
insert_size, right_seq.c_str(), right_qual.c_str(), &auxdata); |
2736 |
> |
|
2737 |
> |
wbam.write(brec); |
2738 |
> |
delete brec; |
2739 |
> |
} |
2740 |
> |
else |
2741 |
> |
{ |
2742 |
> |
GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality, |
2743 |
> |
cigar, mate_ref_name.c_str(), mate_pos, |
2744 |
> |
insert_size, seq.c_str(), quals.c_str(), &auxdata); |
2745 |
> |
|
2746 |
> |
wbam.write(brec); |
2747 |
> |
delete brec; |
2748 |
> |
} |
2749 |
|
} |
2750 |
|
|
2751 |
|
/** |
2753 |
|
* @param bh_cigar A vector of CigarOps |
2754 |
|
* @return a string representation of the cigar string |
2755 |
|
*/ |
2756 |
< |
std::string print_cigar(vector<CigarOp>& bh_cigar){ |
2756 |
> |
std::string print_cigar(const vector<CigarOp>& bh_cigar){ |
2757 |
|
char cigar[256]; |
2758 |
|
cigar[0] = 0; |
2759 |
|
for (size_t c = 0; c < bh_cigar.size(); ++c) |
2760 |
|
{ |
2761 |
|
char ibuf[64]; |
2762 |
|
sprintf(ibuf, "%d", bh_cigar[c].length); |
2763 |
+ |
strcat(cigar, ibuf); |
2764 |
|
switch(bh_cigar[c].opcode) |
2765 |
|
{ |
2766 |
< |
case MATCH: |
2767 |
< |
strcat(cigar, ibuf); |
2768 |
< |
strcat(cigar, "M"); |
2769 |
< |
break; |
2770 |
< |
case INS: |
2771 |
< |
strcat(cigar, ibuf); |
2772 |
< |
strcat(cigar, "I"); |
2773 |
< |
break; |
2774 |
< |
case DEL: |
2775 |
< |
strcat(cigar, ibuf); |
2776 |
< |
strcat(cigar, "D"); |
2777 |
< |
break; |
2778 |
< |
case REF_SKIP: |
2779 |
< |
strcat(cigar, ibuf); |
2780 |
< |
strcat(cigar, "N"); |
2781 |
< |
break; |
2782 |
< |
default: |
2783 |
< |
break; |
2766 |
> |
case MATCH: |
2767 |
> |
strcat(cigar, "M"); |
2768 |
> |
break; |
2769 |
> |
case mATCH: |
2770 |
> |
strcat(cigar, "m"); |
2771 |
> |
break; |
2772 |
> |
case INS: |
2773 |
> |
strcat(cigar, "I"); |
2774 |
> |
break; |
2775 |
> |
case iNS: |
2776 |
> |
strcat(cigar, "i"); |
2777 |
> |
break; |
2778 |
> |
case DEL: |
2779 |
> |
strcat(cigar, "D"); |
2780 |
> |
break; |
2781 |
> |
case dEL: |
2782 |
> |
strcat(cigar, "d"); |
2783 |
> |
break; |
2784 |
> |
case REF_SKIP: |
2785 |
> |
strcat(cigar, "N"); |
2786 |
> |
break; |
2787 |
> |
case rEF_SKIP: |
2788 |
> |
strcat(cigar, "n"); |
2789 |
> |
break; |
2790 |
> |
case FUSION_FF: |
2791 |
> |
case FUSION_FR: |
2792 |
> |
case FUSION_RF: |
2793 |
> |
case FUSION_RR: |
2794 |
> |
strcat(cigar, "F"); |
2795 |
> |
break; |
2796 |
> |
default: |
2797 |
> |
break; |
2798 |
|
} |
2799 |
|
} |
2800 |
|
string result(cigar); |
2801 |
|
return result; |
2802 |
|
} |
2803 |
|
|
2804 |
+ |
void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual, |
2805 |
+ |
char* cigar1, char* cigar2, string& seq1, string& seq2, |
2806 |
+ |
string& qual1, string& qual2, int& left1, int& left2) |
2807 |
+ |
{ |
2808 |
+ |
const int left = bh.left(); |
2809 |
+ |
int right = left; |
2810 |
+ |
int fusion_left = -1, fusion_right = -1; |
2811 |
+ |
|
2812 |
+ |
const vector<CigarOp>& bh_cigar = bh.cigar(); |
2813 |
+ |
|
2814 |
+ |
CigarOpCode fusion_dir = FUSION_NOTHING; |
2815 |
+ |
size_t fusion_idx = 0; |
2816 |
+ |
size_t left_part_len = 0; |
2817 |
+ |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
2818 |
+ |
{ |
2819 |
+ |
const CigarOp& op = bh_cigar[c]; |
2820 |
+ |
switch(op.opcode) |
2821 |
+ |
{ |
2822 |
+ |
case MATCH: |
2823 |
+ |
case REF_SKIP: |
2824 |
+ |
case DEL: |
2825 |
+ |
right += op.length; |
2826 |
+ |
break; |
2827 |
+ |
case mATCH: |
2828 |
+ |
case rEF_SKIP: |
2829 |
+ |
case dEL: |
2830 |
+ |
right -= op.length; |
2831 |
+ |
break; |
2832 |
+ |
case FUSION_FF: |
2833 |
+ |
case FUSION_FR: |
2834 |
+ |
case FUSION_RF: |
2835 |
+ |
case FUSION_RR: |
2836 |
+ |
{ |
2837 |
+ |
fusion_dir = op.opcode; |
2838 |
+ |
fusion_idx = c; |
2839 |
+ |
if (op.opcode == FUSION_FF || op.opcode == FUSION_FR) |
2840 |
+ |
fusion_left = right - 1; |
2841 |
+ |
else |
2842 |
+ |
fusion_left = right + 1; |
2843 |
+ |
fusion_right = right = op.length; |
2844 |
+ |
} |
2845 |
+ |
break; |
2846 |
+ |
default: |
2847 |
+ |
break; |
2848 |
+ |
} |
2849 |
+ |
|
2850 |
+ |
if (fusion_dir == FUSION_NOTHING) |
2851 |
+ |
{ |
2852 |
+ |
if (op.opcode == MATCH || op.opcode == mATCH || op.opcode == INS || op.opcode == iNS) |
2853 |
+ |
{ |
2854 |
+ |
left_part_len += op.length; |
2855 |
+ |
} |
2856 |
+ |
} |
2857 |
+ |
} |
2858 |
+ |
|
2859 |
+ |
if (fusion_dir == FUSION_FF || fusion_dir == FUSION_FR) |
2860 |
+ |
{ |
2861 |
+ |
for (size_t c = 0; c < fusion_idx; ++c) |
2862 |
+ |
{ |
2863 |
+ |
const CigarOp& op = bh_cigar[c]; |
2864 |
+ |
char ibuf[64]; |
2865 |
+ |
sprintf(ibuf, "%d", op.length); |
2866 |
+ |
strcat(cigar1, ibuf); |
2867 |
+ |
|
2868 |
+ |
switch (op.opcode) |
2869 |
+ |
{ |
2870 |
+ |
case MATCH: |
2871 |
+ |
strcat(cigar1, "M"); |
2872 |
+ |
break; |
2873 |
+ |
case INS: |
2874 |
+ |
strcat(cigar1, "I"); |
2875 |
+ |
break; |
2876 |
+ |
case DEL: |
2877 |
+ |
strcat(cigar1, "D"); |
2878 |
+ |
break; |
2879 |
+ |
case REF_SKIP: |
2880 |
+ |
strcat(cigar1, "N"); |
2881 |
+ |
break; |
2882 |
+ |
default: |
2883 |
+ |
assert (0); |
2884 |
+ |
break; |
2885 |
+ |
} |
2886 |
+ |
} |
2887 |
+ |
} |
2888 |
+ |
else if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR) |
2889 |
+ |
{ |
2890 |
+ |
assert (fusion_idx > 0); |
2891 |
+ |
for (int c = fusion_idx - 1; c >=0; --c) |
2892 |
+ |
{ |
2893 |
+ |
const CigarOp& op = bh_cigar[c]; |
2894 |
+ |
char ibuf[64]; |
2895 |
+ |
sprintf(ibuf, "%d", op.length); |
2896 |
+ |
strcat(cigar1, ibuf); |
2897 |
+ |
|
2898 |
+ |
switch (op.opcode) |
2899 |
+ |
{ |
2900 |
+ |
case mATCH: |
2901 |
+ |
strcat(cigar1, "M"); |
2902 |
+ |
break; |
2903 |
+ |
case iNS: |
2904 |
+ |
strcat(cigar1, "I"); |
2905 |
+ |
break; |
2906 |
+ |
case dEL: |
2907 |
+ |
strcat(cigar1, "D"); |
2908 |
+ |
break; |
2909 |
+ |
case rEF_SKIP: |
2910 |
+ |
strcat(cigar1, "N"); |
2911 |
+ |
break; |
2912 |
+ |
default: |
2913 |
+ |
assert (0); |
2914 |
+ |
break; |
2915 |
+ |
} |
2916 |
+ |
} |
2917 |
+ |
} |
2918 |
+ |
|
2919 |
+ |
if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RF) |
2920 |
+ |
{ |
2921 |
+ |
for (size_t c = fusion_idx + 1; c < bh_cigar.size(); ++c) |
2922 |
+ |
{ |
2923 |
+ |
const CigarOp& op = bh_cigar[c]; |
2924 |
+ |
char ibuf[64]; |
2925 |
+ |
sprintf(ibuf, "%d", op.length); |
2926 |
+ |
strcat(cigar2, ibuf); |
2927 |
+ |
|
2928 |
+ |
switch (op.opcode) |
2929 |
+ |
{ |
2930 |
+ |
case MATCH: |
2931 |
+ |
strcat(cigar2, "M"); |
2932 |
+ |
break; |
2933 |
+ |
case INS: |
2934 |
+ |
strcat(cigar2, "I"); |
2935 |
+ |
break; |
2936 |
+ |
case DEL: |
2937 |
+ |
strcat(cigar2, "D"); |
2938 |
+ |
break; |
2939 |
+ |
case REF_SKIP: |
2940 |
+ |
strcat(cigar2, "N"); |
2941 |
+ |
break; |
2942 |
+ |
default: |
2943 |
+ |
assert (0); |
2944 |
+ |
break; |
2945 |
+ |
} |
2946 |
+ |
} |
2947 |
+ |
} |
2948 |
+ |
else if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR) |
2949 |
+ |
{ |
2950 |
+ |
assert (bh_cigar.size() > 0); |
2951 |
+ |
for (size_t c = bh_cigar.size() - 1; c > fusion_idx; --c) |
2952 |
+ |
{ |
2953 |
+ |
const CigarOp& op = bh_cigar[c]; |
2954 |
+ |
char ibuf[64]; |
2955 |
+ |
sprintf(ibuf, "%d", op.length); |
2956 |
+ |
strcat(cigar2, ibuf); |
2957 |
+ |
|
2958 |
+ |
switch (op.opcode) |
2959 |
+ |
{ |
2960 |
+ |
case mATCH: |
2961 |
+ |
strcat(cigar2, "M"); |
2962 |
+ |
break; |
2963 |
+ |
case iNS: |
2964 |
+ |
strcat(cigar2, "I"); |
2965 |
+ |
break; |
2966 |
+ |
case dEL: |
2967 |
+ |
strcat(cigar2, "D"); |
2968 |
+ |
break; |
2969 |
+ |
case rEF_SKIP: |
2970 |
+ |
strcat(cigar2, "N"); |
2971 |
+ |
break; |
2972 |
+ |
default: |
2973 |
+ |
assert (0); |
2974 |
+ |
break; |
2975 |
+ |
} |
2976 |
+ |
} |
2977 |
+ |
} |
2978 |
+ |
|
2979 |
+ |
if (fusion_dir != FUSION_NOTHING) |
2980 |
+ |
{ |
2981 |
+ |
seq1 = seq.substr(0, left_part_len); |
2982 |
+ |
qual1 = qual.substr(0, left_part_len); |
2983 |
+ |
|
2984 |
+ |
if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR) |
2985 |
+ |
{ |
2986 |
+ |
reverse_complement(seq1); |
2987 |
+ |
reverse(qual1.begin(), qual1.end()); |
2988 |
+ |
} |
2989 |
+ |
|
2990 |
+ |
seq2 = seq.substr(left_part_len); |
2991 |
+ |
qual2 = qual.substr(left_part_len); |
2992 |
+ |
|
2993 |
+ |
if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR) |
2994 |
+ |
{ |
2995 |
+ |
reverse_complement(seq2); |
2996 |
+ |
reverse(qual2.begin(), qual2.end()); |
2997 |
+ |
} |
2998 |
+ |
|
2999 |
+ |
left1 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_FR) ? left : fusion_left); |
3000 |
+ |
left2 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_RF) ? fusion_right : right + 1); |
3001 |
+ |
} |
3002 |
+ |
} |
3003 |
+ |
|
3004 |
+ |
|
3005 |
|
bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt) |
3006 |
|
{ |
3007 |
< |
RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id); |
3008 |
< |
if (!ref_str) |
3007 |
> |
RefSequenceTable::Sequence* ref_str1 = rt.get_seq(_ref_id); |
3008 |
> |
RefSequenceTable::Sequence* ref_str2 = rt.get_seq(_ref_id2); |
3009 |
> |
|
3010 |
> |
if (!ref_str1 || !ref_str2) |
3011 |
|
return false; |
3012 |
|
|
3013 |
< |
const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right()); |
3013 |
> |
RefSequenceTable::Sequence* ref_str = ref_str1; |
3014 |
|
|
3015 |
|
size_t pos_seq = 0; |
3016 |
< |
size_t pos_ref = 0; |
3016 |
> |
size_t pos_ref = _left; |
3017 |
|
size_t mismatch = 0; |
3018 |
+ |
bool bSawFusion = false; |
3019 |
|
for (size_t i = 0; i < _cigar.size(); ++i) |
3020 |
|
{ |
3021 |
|
CigarOp cigar = _cigar[i]; |
3023 |
|
{ |
3024 |
|
case MATCH: |
3025 |
|
{ |
3026 |
+ |
seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length); |
3027 |
+ |
for (size_t j = 0; j < cigar.length; ++j) |
3028 |
+ |
{ |
3029 |
+ |
if (_seq[pos_seq] != ref_seq[j]) |
3030 |
+ |
++mismatch; |
3031 |
+ |
++pos_seq; |
3032 |
+ |
} |
3033 |
+ |
|
3034 |
+ |
pos_ref += cigar.length; |
3035 |
+ |
} |
3036 |
+ |
break; |
3037 |
+ |
case mATCH: |
3038 |
+ |
{ |
3039 |
+ |
seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1); |
3040 |
+ |
seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>()); |
3041 |
+ |
seqan::reverseInPlace(ref_seq); |
3042 |
+ |
|
3043 |
|
for (size_t j = 0; j < cigar.length; ++j) |
3044 |
|
{ |
3045 |
< |
if (_seq[pos_seq++] != ref_seq[pos_ref++]) |
3045 |
> |
if (_seq[pos_seq] != ref_seq[j]) |
3046 |
|
++mismatch; |
3047 |
+ |
++pos_seq; |
3048 |
|
} |
3049 |
+ |
|
3050 |
+ |
pos_ref -= cigar.length; |
3051 |
|
} |
3052 |
|
break; |
3053 |
|
case INS: |
3054 |
+ |
case iNS: |
3055 |
|
{ |
3056 |
|
pos_seq += cigar.length; |
3057 |
|
} |
3064 |
|
} |
3065 |
|
break; |
3066 |
|
|
3067 |
+ |
case dEL: |
3068 |
+ |
case rEF_SKIP: |
3069 |
+ |
{ |
3070 |
+ |
pos_ref -= cigar.length; |
3071 |
+ |
} |
3072 |
+ |
break; |
3073 |
+ |
|
3074 |
+ |
case FUSION_FF: |
3075 |
+ |
case FUSION_FR: |
3076 |
+ |
case FUSION_RF: |
3077 |
+ |
case FUSION_RR: |
3078 |
+ |
{ |
3079 |
+ |
// We don't allow a read spans more than two chromosomes. |
3080 |
+ |
if (bSawFusion) |
3081 |
+ |
return false; |
3082 |
+ |
|
3083 |
+ |
ref_str = ref_str2; |
3084 |
+ |
pos_ref = cigar.length; |
3085 |
+ |
|
3086 |
+ |
bSawFusion = true; |
3087 |
+ |
} |
3088 |
+ |
break; |
3089 |
+ |
|
3090 |
|
default: |
3091 |
|
break; |
3092 |
|
} |
3093 |
|
} |
3094 |
< |
|
3094 |
> |
|
3095 |
|
return mismatch == _edit_dist; |
3096 |
|
} |