1 |
#include "GBam.h" |
2 |
#include <ctype.h> |
3 |
#include "kstring.h" |
4 |
|
5 |
//auxiliary functions for low level BAM record creation |
6 |
uint8_t* realloc_bdata(bam1_t *b, int size) { |
7 |
if (b->m_data < size) { |
8 |
b->m_data = size; |
9 |
kroundup32(b->m_data); |
10 |
b->data = (uint8_t*)realloc(b->data, b->m_data); |
11 |
} |
12 |
if (b->data_len<size) b->data_len=size; |
13 |
return b->data; |
14 |
} |
15 |
|
16 |
uint8_t* dupalloc_bdata(bam1_t *b, int size) { |
17 |
//same as realloc_bdata, but does not free previous data |
18 |
//but returns it instead |
19 |
//it ALWAYS duplicates data |
20 |
b->m_data = size; |
21 |
kroundup32(b->m_data); |
22 |
uint8_t* odata=b->data; |
23 |
b->data = (uint8_t*)malloc(b->m_data); |
24 |
memcpy((void*)b->data, (void*)odata, b->data_len); |
25 |
b->data_len=size; |
26 |
return odata; //user must FREE this after |
27 |
} |
28 |
|
29 |
//from bam_import.c |
30 |
extern unsigned short bam_char2flag_table[]; |
31 |
|
32 |
GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid, |
33 |
int pos, bool reverse, const char* qseq, |
34 |
const char* cigar, const char* quals):exons(1) { |
35 |
//novel=true; |
36 |
bam_header=NULL; |
37 |
b=bam_init1(); |
38 |
b->core.tid=gseq_tid; |
39 |
if (pos<=0) { |
40 |
b->core.pos=-1; //unmapped |
41 |
//if (gseq_tid<0) |
42 |
b->core.flag |= BAM_FUNMAP; |
43 |
} |
44 |
else b->core.pos=pos-1; //BAM is 0-based |
45 |
b->core.qual=255; |
46 |
int l_qseq=strlen(qseq); |
47 |
//this may not be accurate, setting CIGAR is the correct way |
48 |
//b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1); |
49 |
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
50 |
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
51 |
set_cigar(cigar); //this will also set core.bin |
52 |
add_sequence(qseq, l_qseq); |
53 |
add_quals(quals); //quals must be given as Phred33 |
54 |
if (reverse) { b->core.flag |= BAM_FREVERSE ; } |
55 |
} |
56 |
|
57 |
GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid, |
58 |
int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, |
59 |
int insert_size, const char* qseq, const char* quals, |
60 |
GVec<char*>* aux_strings):exons(1) { |
61 |
//novel=true; |
62 |
bam_header=NULL; |
63 |
b=bam_init1(); |
64 |
b->core.tid=g_tid; |
65 |
b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based |
66 |
b->core.qual=map_qual; |
67 |
int l_qseq=strlen(qseq); |
68 |
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
69 |
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
70 |
set_cigar(cigar); //this will also set core.bin |
71 |
add_sequence(qseq, l_qseq); |
72 |
add_quals(quals); //quals must be given as Phred33 |
73 |
set_flags(flags); |
74 |
set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size); |
75 |
if (aux_strings!=NULL) { |
76 |
for (int i=0;i<aux_strings->Count();i++) { |
77 |
add_aux(aux_strings->Get(i)); |
78 |
} |
79 |
} |
80 |
} |
81 |
|
82 |
void GBamRecord::set_cigar(const char* cigar) { |
83 |
//requires b->core.pos and b->core.flag to have been set properly PRIOR to this call |
84 |
int doff=b->core.l_qname; |
85 |
uint8_t* after_cigar=NULL; |
86 |
int after_cigar_len=0; |
87 |
uint8_t* prev_bdata=NULL; |
88 |
if (b->data_len>doff) { |
89 |
//cigar string already allocated, replace it |
90 |
int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data |
91 |
after_cigar=b->data+d; |
92 |
after_cigar_len=b->data_len-d; |
93 |
} |
94 |
const char *s; |
95 |
char *t; |
96 |
int i, op; |
97 |
long x; |
98 |
b->core.n_cigar = 0; |
99 |
if (cigar != NULL && strcmp(cigar, "*") != 0) { |
100 |
for (s = cigar; *s; ++s) { |
101 |
if (isalpha(*s)) b->core.n_cigar++; |
102 |
else if (!isdigit(*s)) { |
103 |
GError("Error: invalid CIGAR character (%s)\n",cigar); |
104 |
} |
105 |
} |
106 |
if (after_cigar_len>0) { //replace/insert into existing full data |
107 |
prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); |
108 |
memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); |
109 |
free(prev_bdata); |
110 |
} |
111 |
else { |
112 |
realloc_bdata(b, doff + b->core.n_cigar * 4); |
113 |
} |
114 |
for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { |
115 |
x = strtol(s, &t, 10); |
116 |
op = toupper(*t); |
117 |
if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; |
118 |
else if (op == 'I') op = BAM_CINS; |
119 |
else if (op == 'D') op = BAM_CDEL; |
120 |
else if (op == 'N') op = BAM_CREF_SKIP; |
121 |
else if (op == 'S') op = BAM_CSOFT_CLIP; |
122 |
else if (op == 'H') op = BAM_CHARD_CLIP; |
123 |
else if (op == 'P') op = BAM_CPAD; |
124 |
else GError("Error: invalid CIGAR operation (%s)\n",cigar); |
125 |
s = t + 1; |
126 |
bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; |
127 |
} |
128 |
if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar); |
129 |
b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); |
130 |
} else {//no CIGAR string given |
131 |
if (!(b->core.flag&BAM_FUNMAP)) { |
132 |
GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); |
133 |
b->core.flag |= BAM_FUNMAP; |
134 |
} |
135 |
b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); |
136 |
} |
137 |
setupCoordinates(); |
138 |
} //set_cigar() |
139 |
|
140 |
void GBamRecord::add_sequence(const char* qseq, int slen) { |
141 |
//must be called AFTER set_cigar (cannot replace existing sequence for now) |
142 |
if (qseq==NULL) return; //should we ever care about this? |
143 |
if (slen<0) slen=strlen(qseq); |
144 |
int doff = b->core.l_qname + b->core.n_cigar * 4; |
145 |
if (strcmp(qseq, "*")!=0) { |
146 |
b->core.l_qseq=slen; |
147 |
if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) |
148 |
GError("Error: CIGAR and sequence length are inconsistent!(%s)\n", |
149 |
qseq); |
150 |
uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; |
151 |
//also allocated quals memory |
152 |
memset(p, 0, (b->core.l_qseq+1)/2); |
153 |
for (int i = 0; i < b->core.l_qseq; ++i) |
154 |
p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); |
155 |
} else b->core.l_qseq = 0; |
156 |
} |
157 |
|
158 |
void GBamRecord::add_quals(const char* quals) { |
159 |
//requires core.l_qseq already set |
160 |
//and must be called AFTER add_sequence(), which also allocates the memory for quals |
161 |
uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2); |
162 |
if (quals==NULL || strcmp(quals, "*") == 0) { |
163 |
for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff; |
164 |
return; |
165 |
} |
166 |
for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33; |
167 |
} |
168 |
|
169 |
void GBamRecord::add_aux(const char* str) { |
170 |
//requires: being called AFTER add_quals() |
171 |
static char tag[2]; |
172 |
static uint8_t abuf[512]; |
173 |
//requires: being called AFTER add_quals() |
174 |
int strl=strlen(str); |
175 |
//int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux; |
176 |
//int doff0=doff; |
177 |
if (strl < 6 || str[2] != ':' || str[4] != ':') |
178 |
parse_error("missing colon in auxiliary data"); |
179 |
tag[0] = str[0]; tag[1] = str[1]; |
180 |
uint8_t atype = str[3]; |
181 |
uint8_t* adata=abuf; |
182 |
int alen=0; |
183 |
if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility |
184 |
atype='A'; |
185 |
alen=1; |
186 |
adata=(uint8_t*)&str[5]; |
187 |
} |
188 |
else if (atype == 'I' || atype == 'i') { |
189 |
long long x=(long long)atoll(str + 5); |
190 |
if (x < 0) { |
191 |
if (x >= -127) { |
192 |
atype='c'; |
193 |
abuf[0] = (int8_t)x; |
194 |
alen=1; |
195 |
} |
196 |
else if (x >= -32767) { |
197 |
atype = 's'; |
198 |
*(int16_t*)abuf = (int16_t)x; |
199 |
alen=2; |
200 |
} |
201 |
else { |
202 |
atype='i'; |
203 |
*(int32_t*)abuf = (int32_t)x; |
204 |
alen=4; |
205 |
if (x < -2147483648ll) |
206 |
GMessage("Parse warning: integer %lld is out of range.", x); |
207 |
} |
208 |
} else { //x >=0 |
209 |
if (x <= 255) { |
210 |
atype = 'C'; |
211 |
abuf[0] = (uint8_t)x; |
212 |
alen=1; |
213 |
} |
214 |
else if (x <= 65535) { |
215 |
atype='S'; |
216 |
*(uint16_t*)abuf = (uint16_t)x; |
217 |
alen=2; |
218 |
} |
219 |
else { |
220 |
atype='I'; |
221 |
*(uint32_t*)abuf = (uint32_t)x; |
222 |
alen=4; |
223 |
if (x > 4294967295ll) |
224 |
GMessage("Parse warning: integer %lld is out of range.", x); |
225 |
} |
226 |
} |
227 |
} //integer type |
228 |
else if (atype == 'f') { |
229 |
*(float*)abuf = (float)atof(str + 5); |
230 |
alen = sizeof(float); |
231 |
} |
232 |
else if (atype == 'd') { //? |
233 |
*(float*)abuf = (float)atof(str + 9); |
234 |
alen=8; |
235 |
} |
236 |
else if (atype == 'Z' || atype == 'H') { |
237 |
if (atype == 'H') { // check whether the hex string is valid |
238 |
if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even"); |
239 |
for (int i = 0; i < strl - 5; ++i) { |
240 |
int c = toupper(str[5 + i]); |
241 |
if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) |
242 |
parse_error("invalid hex character"); |
243 |
} |
244 |
} |
245 |
memcpy(abuf, str + 5, strl - 5); |
246 |
abuf[strl-5] = 0; |
247 |
alen=strl-4; |
248 |
} else parse_error("unrecognized aux type"); |
249 |
this->add_aux(tag, atype, alen, adata); |
250 |
}//add_aux() |
251 |
|
252 |
|
253 |
void GBamRecord::setupCoordinates() { |
254 |
uint32_t *cigar = bam1_cigar(b); |
255 |
const bam1_core_t *c = &b->core; |
256 |
if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */ |
257 |
int l=0; |
258 |
start=c->pos+1; |
259 |
int exstart=c->pos; |
260 |
for (int i = 0; i < c->n_cigar; ++i) { |
261 |
int op = cigar[i]&0xf; |
262 |
if (op == BAM_CMATCH || op==BAM_CEQUAL || |
263 |
op == BAM_CDIFF || op == BAM_CDEL) { |
264 |
l += cigar[i]>>4; |
265 |
} |
266 |
else if (op == BAM_CREF_SKIP) { //N |
267 |
//intron starts |
268 |
//exon ends here |
269 |
GSeg exon(exstart+1,c->pos+l); |
270 |
exons.Add(exon); |
271 |
l += cigar[i]>>4; |
272 |
exstart=c->pos+l; |
273 |
} |
274 |
} |
275 |
GSeg exon(exstart+1,c->pos+l); |
276 |
exons.Add(exon); |
277 |
end=c->pos+l; |
278 |
} |
279 |
|
280 |
int GBamRecord::find_tag(const char tag[2], uint8_t* & s, char& tag_type) { |
281 |
//position s at the beginning of tag "data" (after the type char) |
282 |
//returns the length of tag data, and tag type in tag_type |
283 |
s=bam1_aux(b); |
284 |
while (s < b->data + b->data_len) { |
285 |
char key[2]; |
286 |
key[0] = (char)s[0]; key[1] = (char)s[1]; |
287 |
s += 2; tag_type = (char)*s; ++s; |
288 |
int inc=0; |
289 |
int m_inc=0; //for 'B' case |
290 |
uint8_t sub_type=0; // --,,-- |
291 |
switch (tag_type) { |
292 |
case 'A': |
293 |
case 'C': |
294 |
case 'c': |
295 |
inc=1; |
296 |
break; |
297 |
case 'S': |
298 |
case 's': |
299 |
inc=2; |
300 |
break; |
301 |
case 'I': |
302 |
case 'i': |
303 |
case 'f': |
304 |
inc=4; |
305 |
break; |
306 |
case 'd': |
307 |
inc=8; |
308 |
break; |
309 |
case 'B': |
310 |
sub_type = *(s+1); |
311 |
int32_t n; |
312 |
memcpy(&n, s+1, 4); |
313 |
inc += 5; |
314 |
//kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); |
315 |
m_inc=0; |
316 |
if (sub_type == 'c' || sub_type == 'C') |
317 |
{ m_inc=1; } |
318 |
else if (sub_type == 's' || sub_type == 'S') |
319 |
{ m_inc = 2; } |
320 |
else if ('i' == sub_type || 'I' == sub_type || 'f' == sub_type) |
321 |
{ m_inc = 4; } |
322 |
if (m_inc==0) GError("Error: invalid 'B' array subtype (%c)!\n",sub_type); |
323 |
inc += m_inc*n; |
324 |
break; |
325 |
case 'H': |
326 |
case 'Z': |
327 |
while (*(s+inc)) ++inc; |
328 |
break; |
329 |
} //switch (tag_type) |
330 |
if (tag[0]==key[0] && tag[1]==key[1]) |
331 |
return inc; |
332 |
s+=inc; |
333 |
}//while aux data |
334 |
return 0; |
335 |
} |
336 |
|
337 |
char GBamRecord::tag_char(const char tag[2]) { //retrieve tag data as single char |
338 |
uint8_t *s; |
339 |
char tag_type=0; |
340 |
int vlen=find_tag(tag, s, tag_type); |
341 |
if (vlen==0) return 0; |
342 |
//if (vlen>1) GWarning("Warning: tag %c%c value has length %d, but char was expected.\n", |
343 |
// tag[0],tag[1],vlen); |
344 |
return (char)s[0]; |
345 |
} |
346 |
|
347 |
int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag |
348 |
uint8_t *s; |
349 |
char tag_type=0; |
350 |
int vlen=find_tag(tag, s, tag_type); |
351 |
if (vlen==0) return 0; |
352 |
if (vlen==1) return (int)(*(int8_t*)s); |
353 |
else if (vlen==2) return (int)(*(int16_t*)s); |
354 |
else if (vlen==4) return (int)(*(int32_t*)s); |
355 |
else GMessage("Warning: tag %c%c value has length %d, but int type was expected.\n", |
356 |
tag[0],tag[1], vlen); |
357 |
return 0; |
358 |
} |
359 |
|
360 |
char* GBamRecord::tag_str(const char tag[2]) { //return string value for a tag |
361 |
uint8_t *s; |
362 |
char tag_type=0; |
363 |
int vlen=find_tag(tag, s, tag_type); |
364 |
if (vlen==0) return NULL; //not found |
365 |
//if (vlen>1) GWarning("Warning: tag %c%c value has length %d, but char was expected.\n", |
366 |
// tag[0],tag[1],vlen); |
367 |
return (char *)s; |
368 |
} |
369 |
|
370 |
char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag |
371 |
uint8_t *s; |
372 |
char tag_type=0; |
373 |
int vlen=find_tag("XS", s, tag_type); |
374 |
if (vlen==0) return '.'; |
375 |
return (char)s[0]; |
376 |
} |
377 |
|
378 |
char* GBamRecord::sequence() { //user must free this after use |
379 |
char *s = (char*)bam1_seq(b); |
380 |
char* qseq=NULL; |
381 |
GMALLOC(qseq, (b->core.l_qseq+1)); |
382 |
int i; |
383 |
for (i=0;i<(b->core.l_qseq);i++) { |
384 |
int8_t v = bam1_seqi(s,i); |
385 |
qseq[i] = bam_nt16_rev_table[v]; |
386 |
} |
387 |
qseq[i] = 0; |
388 |
return qseq; |
389 |
} |
390 |
|
391 |
char* GBamRecord::qualities() {//user must free this after use |
392 |
char *qual = (char*)bam1_qual(b); |
393 |
char* qv=NULL; |
394 |
GMALLOC(qv, (b->core.l_qseq+1) ); |
395 |
int i; |
396 |
for(i=0;i<(b->core.l_qseq);i++) { |
397 |
qv[i]=qual[i]+33; |
398 |
} |
399 |
qv[i]=0; |
400 |
return qv; |
401 |
} |
402 |
|
403 |
char* GBamRecord::cigar() { //returns text version of the CIGAR string; must be freed by user |
404 |
kstring_t str; |
405 |
str.l = str.m = 0; str.s = 0; |
406 |
if (b->core.n_cigar == 0) kputc('*', &str); |
407 |
else { |
408 |
for (int i = 0; i < b->core.n_cigar; ++i) { |
409 |
kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); |
410 |
kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); |
411 |
} |
412 |
} |
413 |
return str.s; |
414 |
} |