1 |
gpertea |
121 |
#include "GBam.h" |
2 |
|
|
#include <ctype.h> |
3 |
gpertea |
124 |
#include "kstring.h" |
4 |
gpertea |
121 |
|
5 |
gpertea |
133 |
//auxiliary functions for low level BAM record creation |
6 |
gpertea |
121 |
uint8_t* realloc_bdata(bam1_t *b, int size) { |
7 |
|
|
if (b->m_data < size) { |
8 |
|
|
b->m_data = size; |
9 |
|
|
kroundup32(b->m_data); |
10 |
|
|
b->data = (uint8_t*)realloc(b->data, b->m_data); |
11 |
|
|
} |
12 |
|
|
if (b->data_len<size) b->data_len=size; |
13 |
|
|
return b->data; |
14 |
|
|
} |
15 |
|
|
|
16 |
|
|
uint8_t* dupalloc_bdata(bam1_t *b, int size) { |
17 |
|
|
//same as realloc_bdata, but does not free previous data |
18 |
|
|
//but returns it instead |
19 |
|
|
//it ALWAYS duplicates data |
20 |
|
|
b->m_data = size; |
21 |
|
|
kroundup32(b->m_data); |
22 |
|
|
uint8_t* odata=b->data; |
23 |
|
|
b->data = (uint8_t*)malloc(b->m_data); |
24 |
|
|
memcpy((void*)b->data, (void*)odata, b->data_len); |
25 |
|
|
b->data_len=size; |
26 |
|
|
return odata; //user must FREE this after |
27 |
|
|
} |
28 |
|
|
|
29 |
|
|
//from bam_import.c |
30 |
|
|
extern unsigned short bam_char2flag_table[]; |
31 |
|
|
|
32 |
|
|
GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid, |
33 |
gpertea |
124 |
int pos, bool reverse, const char* qseq, |
34 |
|
|
const char* cigar, const char* quals):exons(1) { |
35 |
gpertea |
126 |
//novel=true; |
36 |
gpertea |
121 |
bam_header=NULL; |
37 |
|
|
b=bam_init1(); |
38 |
|
|
b->core.tid=gseq_tid; |
39 |
|
|
if (pos<=0) { |
40 |
|
|
b->core.pos=-1; //unmapped |
41 |
|
|
//if (gseq_tid<0) |
42 |
|
|
b->core.flag |= BAM_FUNMAP; |
43 |
|
|
} |
44 |
|
|
else b->core.pos=pos-1; //BAM is 0-based |
45 |
|
|
b->core.qual=255; |
46 |
|
|
int l_qseq=strlen(qseq); |
47 |
|
|
//this may not be accurate, setting CIGAR is the correct way |
48 |
|
|
//b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1); |
49 |
|
|
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
50 |
|
|
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
51 |
|
|
set_cigar(cigar); //this will also set core.bin |
52 |
|
|
add_sequence(qseq, l_qseq); |
53 |
|
|
add_quals(quals); //quals must be given as Phred33 |
54 |
|
|
if (reverse) { b->core.flag |= BAM_FREVERSE ; } |
55 |
|
|
} |
56 |
|
|
|
57 |
|
|
GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid, |
58 |
|
|
int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, |
59 |
|
|
int insert_size, const char* qseq, const char* quals, |
60 |
gpertea |
124 |
GVec<char*>* aux_strings):exons(1) { |
61 |
gpertea |
126 |
//novel=true; |
62 |
gpertea |
121 |
bam_header=NULL; |
63 |
|
|
b=bam_init1(); |
64 |
|
|
b->core.tid=g_tid; |
65 |
|
|
b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based |
66 |
|
|
b->core.qual=map_qual; |
67 |
|
|
int l_qseq=strlen(qseq); |
68 |
|
|
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
69 |
|
|
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
70 |
|
|
set_cigar(cigar); //this will also set core.bin |
71 |
|
|
add_sequence(qseq, l_qseq); |
72 |
|
|
add_quals(quals); //quals must be given as Phred33 |
73 |
|
|
set_flags(flags); |
74 |
|
|
set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size); |
75 |
|
|
if (aux_strings!=NULL) { |
76 |
|
|
for (int i=0;i<aux_strings->Count();i++) { |
77 |
|
|
add_aux(aux_strings->Get(i)); |
78 |
|
|
} |
79 |
|
|
} |
80 |
|
|
} |
81 |
gpertea |
124 |
|
82 |
gpertea |
121 |
void GBamRecord::set_cigar(const char* cigar) { |
83 |
|
|
//requires b->core.pos and b->core.flag to have been set properly PRIOR to this call |
84 |
|
|
int doff=b->core.l_qname; |
85 |
|
|
uint8_t* after_cigar=NULL; |
86 |
|
|
int after_cigar_len=0; |
87 |
|
|
uint8_t* prev_bdata=NULL; |
88 |
|
|
if (b->data_len>doff) { |
89 |
|
|
//cigar string already allocated, replace it |
90 |
|
|
int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data |
91 |
|
|
after_cigar=b->data+d; |
92 |
|
|
after_cigar_len=b->data_len-d; |
93 |
|
|
} |
94 |
|
|
const char *s; |
95 |
|
|
char *t; |
96 |
|
|
int i, op; |
97 |
|
|
long x; |
98 |
|
|
b->core.n_cigar = 0; |
99 |
|
|
if (cigar != NULL && strcmp(cigar, "*") != 0) { |
100 |
|
|
for (s = cigar; *s; ++s) { |
101 |
|
|
if (isalpha(*s)) b->core.n_cigar++; |
102 |
|
|
else if (!isdigit(*s)) { |
103 |
|
|
GError("Error: invalid CIGAR character (%s)\n",cigar); |
104 |
|
|
} |
105 |
|
|
} |
106 |
|
|
if (after_cigar_len>0) { //replace/insert into existing full data |
107 |
|
|
prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); |
108 |
|
|
memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); |
109 |
|
|
free(prev_bdata); |
110 |
|
|
} |
111 |
|
|
else { |
112 |
|
|
realloc_bdata(b, doff + b->core.n_cigar * 4); |
113 |
|
|
} |
114 |
|
|
for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { |
115 |
|
|
x = strtol(s, &t, 10); |
116 |
|
|
op = toupper(*t); |
117 |
|
|
if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; |
118 |
|
|
else if (op == 'I') op = BAM_CINS; |
119 |
|
|
else if (op == 'D') op = BAM_CDEL; |
120 |
|
|
else if (op == 'N') op = BAM_CREF_SKIP; |
121 |
|
|
else if (op == 'S') op = BAM_CSOFT_CLIP; |
122 |
|
|
else if (op == 'H') op = BAM_CHARD_CLIP; |
123 |
|
|
else if (op == 'P') op = BAM_CPAD; |
124 |
|
|
else GError("Error: invalid CIGAR operation (%s)\n",cigar); |
125 |
|
|
s = t + 1; |
126 |
|
|
bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; |
127 |
|
|
} |
128 |
|
|
if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar); |
129 |
|
|
b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); |
130 |
|
|
} else {//no CIGAR string given |
131 |
|
|
if (!(b->core.flag&BAM_FUNMAP)) { |
132 |
|
|
GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); |
133 |
|
|
b->core.flag |= BAM_FUNMAP; |
134 |
|
|
} |
135 |
|
|
b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); |
136 |
|
|
} |
137 |
gpertea |
124 |
setupCoordinates(); |
138 |
gpertea |
121 |
} //set_cigar() |
139 |
|
|
|
140 |
|
|
void GBamRecord::add_sequence(const char* qseq, int slen) { |
141 |
|
|
//must be called AFTER set_cigar (cannot replace existing sequence for now) |
142 |
|
|
if (qseq==NULL) return; //should we ever care about this? |
143 |
|
|
if (slen<0) slen=strlen(qseq); |
144 |
|
|
int doff = b->core.l_qname + b->core.n_cigar * 4; |
145 |
|
|
if (strcmp(qseq, "*")!=0) { |
146 |
|
|
b->core.l_qseq=slen; |
147 |
|
|
if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) |
148 |
|
|
GError("Error: CIGAR and sequence length are inconsistent!(%s)\n", |
149 |
|
|
qseq); |
150 |
|
|
uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; |
151 |
|
|
//also allocated quals memory |
152 |
|
|
memset(p, 0, (b->core.l_qseq+1)/2); |
153 |
|
|
for (int i = 0; i < b->core.l_qseq; ++i) |
154 |
|
|
p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); |
155 |
|
|
} else b->core.l_qseq = 0; |
156 |
|
|
} |
157 |
|
|
|
158 |
|
|
void GBamRecord::add_quals(const char* quals) { |
159 |
|
|
//requires core.l_qseq already set |
160 |
|
|
//and must be called AFTER add_sequence(), which also allocates the memory for quals |
161 |
|
|
uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2); |
162 |
|
|
if (quals==NULL || strcmp(quals, "*") == 0) { |
163 |
|
|
for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff; |
164 |
|
|
return; |
165 |
|
|
} |
166 |
|
|
for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33; |
167 |
|
|
} |
168 |
|
|
|
169 |
|
|
void GBamRecord::add_aux(const char* str) { |
170 |
|
|
//requires: being called AFTER add_quals() |
171 |
|
|
static char tag[2]; |
172 |
|
|
static uint8_t abuf[512]; |
173 |
|
|
//requires: being called AFTER add_quals() |
174 |
|
|
int strl=strlen(str); |
175 |
|
|
//int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux; |
176 |
|
|
//int doff0=doff; |
177 |
|
|
if (strl < 6 || str[2] != ':' || str[4] != ':') |
178 |
|
|
parse_error("missing colon in auxiliary data"); |
179 |
|
|
tag[0] = str[0]; tag[1] = str[1]; |
180 |
|
|
uint8_t atype = str[3]; |
181 |
|
|
uint8_t* adata=abuf; |
182 |
|
|
int alen=0; |
183 |
|
|
if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility |
184 |
|
|
atype='A'; |
185 |
|
|
alen=1; |
186 |
|
|
adata=(uint8_t*)&str[5]; |
187 |
|
|
} |
188 |
|
|
else if (atype == 'I' || atype == 'i') { |
189 |
|
|
long long x=(long long)atoll(str + 5); |
190 |
|
|
if (x < 0) { |
191 |
|
|
if (x >= -127) { |
192 |
|
|
atype='c'; |
193 |
|
|
abuf[0] = (int8_t)x; |
194 |
|
|
alen=1; |
195 |
|
|
} |
196 |
|
|
else if (x >= -32767) { |
197 |
|
|
atype = 's'; |
198 |
|
|
*(int16_t*)abuf = (int16_t)x; |
199 |
|
|
alen=2; |
200 |
|
|
} |
201 |
|
|
else { |
202 |
|
|
atype='i'; |
203 |
|
|
*(int32_t*)abuf = (int32_t)x; |
204 |
|
|
alen=4; |
205 |
|
|
if (x < -2147483648ll) |
206 |
|
|
GMessage("Parse warning: integer %lld is out of range.", x); |
207 |
|
|
} |
208 |
|
|
} else { //x >=0 |
209 |
|
|
if (x <= 255) { |
210 |
|
|
atype = 'C'; |
211 |
|
|
abuf[0] = (uint8_t)x; |
212 |
|
|
alen=1; |
213 |
|
|
} |
214 |
|
|
else if (x <= 65535) { |
215 |
|
|
atype='S'; |
216 |
|
|
*(uint16_t*)abuf = (uint16_t)x; |
217 |
|
|
alen=2; |
218 |
|
|
} |
219 |
|
|
else { |
220 |
|
|
atype='I'; |
221 |
|
|
*(uint32_t*)abuf = (uint32_t)x; |
222 |
|
|
alen=4; |
223 |
|
|
if (x > 4294967295ll) |
224 |
|
|
GMessage("Parse warning: integer %lld is out of range.", x); |
225 |
|
|
} |
226 |
|
|
} |
227 |
|
|
} //integer type |
228 |
|
|
else if (atype == 'f') { |
229 |
|
|
*(float*)abuf = (float)atof(str + 5); |
230 |
|
|
alen = sizeof(float); |
231 |
|
|
} |
232 |
|
|
else if (atype == 'd') { //? |
233 |
|
|
*(float*)abuf = (float)atof(str + 9); |
234 |
|
|
alen=8; |
235 |
|
|
} |
236 |
|
|
else if (atype == 'Z' || atype == 'H') { |
237 |
|
|
if (atype == 'H') { // check whether the hex string is valid |
238 |
|
|
if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even"); |
239 |
|
|
for (int i = 0; i < strl - 5; ++i) { |
240 |
|
|
int c = toupper(str[5 + i]); |
241 |
|
|
if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) |
242 |
|
|
parse_error("invalid hex character"); |
243 |
|
|
} |
244 |
|
|
} |
245 |
|
|
memcpy(abuf, str + 5, strl - 5); |
246 |
|
|
abuf[strl-5] = 0; |
247 |
|
|
alen=strl-4; |
248 |
|
|
} else parse_error("unrecognized aux type"); |
249 |
|
|
this->add_aux(tag, atype, alen, adata); |
250 |
|
|
}//add_aux() |
251 |
|
|
|
252 |
|
|
|
253 |
|
|
void GBamRecord::setupCoordinates() { |
254 |
|
|
uint32_t *cigar = bam1_cigar(b); |
255 |
|
|
const bam1_core_t *c = &b->core; |
256 |
gpertea |
124 |
if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */ |
257 |
gpertea |
121 |
int l=0; |
258 |
gpertea |
124 |
start=c->pos+1; |
259 |
|
|
int exstart=c->pos; |
260 |
|
|
for (int i = 0; i < c->n_cigar; ++i) { |
261 |
|
|
int op = cigar[i]&0xf; |
262 |
|
|
if (op == BAM_CMATCH || op==BAM_CEQUAL || |
263 |
|
|
op == BAM_CDIFF || op == BAM_CDEL) { |
264 |
gpertea |
121 |
l += cigar[i]>>4; |
265 |
gpertea |
124 |
} |
266 |
|
|
else if (op == BAM_CREF_SKIP) { //N |
267 |
|
|
//intron starts |
268 |
|
|
//exon ends here |
269 |
gpertea |
130 |
GSeg exon(exstart+1,c->pos+l); |
270 |
gpertea |
126 |
exons.Add(exon); |
271 |
gpertea |
124 |
l += cigar[i]>>4; |
272 |
|
|
exstart=c->pos+l; |
273 |
|
|
} |
274 |
|
|
} |
275 |
gpertea |
130 |
GSeg exon(exstart+1,c->pos+l); |
276 |
gpertea |
126 |
exons.Add(exon); |
277 |
gpertea |
124 |
end=c->pos+l; |
278 |
gpertea |
121 |
} |
279 |
gpertea |
124 |
|
280 |
|
|
int GBamRecord::find_tag(const char tag[2], uint8_t* & s, char& tag_type) { |
281 |
|
|
//position s at the beginning of tag "data" (after the type char) |
282 |
|
|
//returns the length of tag data, and tag type in tag_type |
283 |
|
|
s=bam1_aux(b); |
284 |
|
|
while (s < b->data + b->data_len) { |
285 |
|
|
char key[2]; |
286 |
|
|
key[0] = (char)s[0]; key[1] = (char)s[1]; |
287 |
|
|
s += 2; tag_type = (char)*s; ++s; |
288 |
|
|
int inc=0; |
289 |
|
|
int m_inc=0; //for 'B' case |
290 |
|
|
uint8_t sub_type=0; // --,,-- |
291 |
|
|
switch (tag_type) { |
292 |
|
|
case 'A': |
293 |
|
|
case 'C': |
294 |
|
|
case 'c': |
295 |
|
|
inc=1; |
296 |
|
|
break; |
297 |
|
|
case 'S': |
298 |
|
|
case 's': |
299 |
|
|
inc=2; |
300 |
|
|
break; |
301 |
|
|
case 'I': |
302 |
|
|
case 'i': |
303 |
|
|
case 'f': |
304 |
|
|
inc=4; |
305 |
|
|
break; |
306 |
|
|
case 'd': |
307 |
|
|
inc=8; |
308 |
|
|
break; |
309 |
|
|
case 'B': |
310 |
|
|
sub_type = *(s+1); |
311 |
|
|
int32_t n; |
312 |
|
|
memcpy(&n, s+1, 4); |
313 |
|
|
inc += 5; |
314 |
|
|
//kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); |
315 |
|
|
m_inc=0; |
316 |
|
|
if (sub_type == 'c' || sub_type == 'C') |
317 |
|
|
{ m_inc=1; } |
318 |
|
|
else if (sub_type == 's' || sub_type == 'S') |
319 |
|
|
{ m_inc = 2; } |
320 |
|
|
else if ('i' == sub_type || 'I' == sub_type || 'f' == sub_type) |
321 |
|
|
{ m_inc = 4; } |
322 |
|
|
if (m_inc==0) GError("Error: invalid 'B' array subtype (%c)!\n",sub_type); |
323 |
|
|
inc += m_inc*n; |
324 |
|
|
break; |
325 |
|
|
case 'H': |
326 |
|
|
case 'Z': |
327 |
|
|
while (*(s+inc)) ++inc; |
328 |
|
|
break; |
329 |
|
|
} //switch (tag_type) |
330 |
|
|
if (tag[0]==key[0] && tag[1]==key[1]) |
331 |
|
|
return inc; |
332 |
gpertea |
127 |
s+=inc; |
333 |
gpertea |
124 |
}//while aux data |
334 |
|
|
return 0; |
335 |
|
|
} |
336 |
|
|
|
337 |
|
|
char GBamRecord::tag_char(const char tag[2]) { //retrieve tag data as single char |
338 |
|
|
uint8_t *s; |
339 |
|
|
char tag_type=0; |
340 |
|
|
int vlen=find_tag(tag, s, tag_type); |
341 |
|
|
if (vlen==0) return 0; |
342 |
|
|
//if (vlen>1) GWarning("Warning: tag %c%c value has length %d, but char was expected.\n", |
343 |
|
|
// tag[0],tag[1],vlen); |
344 |
|
|
return (char)s[0]; |
345 |
|
|
} |
346 |
|
|
|
347 |
|
|
int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag |
348 |
|
|
uint8_t *s; |
349 |
|
|
char tag_type=0; |
350 |
|
|
int vlen=find_tag(tag, s, tag_type); |
351 |
|
|
if (vlen==0) return 0; |
352 |
|
|
if (vlen==1) return (int)(*(int8_t*)s); |
353 |
|
|
else if (vlen==2) return (int)(*(int16_t*)s); |
354 |
|
|
else if (vlen==4) return (int)(*(int32_t*)s); |
355 |
|
|
else GMessage("Warning: tag %c%c value has length %d, but int type was expected.\n", |
356 |
|
|
tag[0],tag[1], vlen); |
357 |
|
|
return 0; |
358 |
|
|
} |
359 |
|
|
|
360 |
|
|
char* GBamRecord::tag_str(const char tag[2]) { //return string value for a tag |
361 |
|
|
uint8_t *s; |
362 |
|
|
char tag_type=0; |
363 |
|
|
int vlen=find_tag(tag, s, tag_type); |
364 |
|
|
if (vlen==0) return NULL; //not found |
365 |
|
|
//if (vlen>1) GWarning("Warning: tag %c%c value has length %d, but char was expected.\n", |
366 |
|
|
// tag[0],tag[1],vlen); |
367 |
|
|
return (char *)s; |
368 |
|
|
} |
369 |
|
|
|
370 |
|
|
char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag |
371 |
|
|
uint8_t *s; |
372 |
|
|
char tag_type=0; |
373 |
|
|
int vlen=find_tag("XS", s, tag_type); |
374 |
|
|
if (vlen==0) return '.'; |
375 |
|
|
return (char)s[0]; |
376 |
|
|
} |
377 |
|
|
|
378 |
|
|
char* GBamRecord::sequence() { //user must free this after use |
379 |
|
|
char *s = (char*)bam1_seq(b); |
380 |
|
|
char* qseq=NULL; |
381 |
|
|
GMALLOC(qseq, (b->core.l_qseq+1)); |
382 |
|
|
int i; |
383 |
|
|
for (i=0;i<(b->core.l_qseq);i++) { |
384 |
|
|
int8_t v = bam1_seqi(s,i); |
385 |
|
|
qseq[i] = bam_nt16_rev_table[v]; |
386 |
|
|
} |
387 |
|
|
qseq[i] = 0; |
388 |
|
|
return qseq; |
389 |
|
|
} |
390 |
|
|
|
391 |
|
|
char* GBamRecord::qualities() {//user must free this after use |
392 |
|
|
char *qual = (char*)bam1_qual(b); |
393 |
|
|
char* qv=NULL; |
394 |
|
|
GMALLOC(qv, (b->core.l_qseq+1) ); |
395 |
|
|
int i; |
396 |
|
|
for(i=0;i<(b->core.l_qseq);i++) { |
397 |
|
|
qv[i]=qual[i]+33; |
398 |
|
|
} |
399 |
|
|
qv[i]=0; |
400 |
|
|
return qv; |
401 |
|
|
} |
402 |
|
|
|
403 |
|
|
char* GBamRecord::cigar() { //returns text version of the CIGAR string; must be freed by user |
404 |
|
|
kstring_t str; |
405 |
|
|
str.l = str.m = 0; str.s = 0; |
406 |
|
|
if (b->core.n_cigar == 0) kputc('*', &str); |
407 |
|
|
else { |
408 |
|
|
for (int i = 0; i < b->core.n_cigar; ++i) { |
409 |
|
|
kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); |
410 |
|
|
kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); |
411 |
|
|
} |
412 |
|
|
} |
413 |
|
|
return str.s; |
414 |
|
|
} |