ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.cpp
Revision: 133
Committed: Fri Dec 9 15:59:14 2011 UTC (12 years, 10 months ago) by gpertea
File size: 14340 byte(s)
Log Message:
TODO: detect sam/bam input for samopen

Line User Rev File contents
1 gpertea 121 #include "GBam.h"
2     #include <ctype.h>
3 gpertea 124 #include "kstring.h"
4 gpertea 121
5 gpertea 133 //auxiliary functions for low level BAM record creation
6 gpertea 121 uint8_t* realloc_bdata(bam1_t *b, int size) {
7     if (b->m_data < size) {
8     b->m_data = size;
9     kroundup32(b->m_data);
10     b->data = (uint8_t*)realloc(b->data, b->m_data);
11     }
12     if (b->data_len<size) b->data_len=size;
13     return b->data;
14     }
15    
16     uint8_t* dupalloc_bdata(bam1_t *b, int size) {
17     //same as realloc_bdata, but does not free previous data
18     //but returns it instead
19     //it ALWAYS duplicates data
20     b->m_data = size;
21     kroundup32(b->m_data);
22     uint8_t* odata=b->data;
23     b->data = (uint8_t*)malloc(b->m_data);
24     memcpy((void*)b->data, (void*)odata, b->data_len);
25     b->data_len=size;
26     return odata; //user must FREE this after
27     }
28    
29     //from bam_import.c
30     extern unsigned short bam_char2flag_table[];
31    
32     GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
33 gpertea 124 int pos, bool reverse, const char* qseq,
34     const char* cigar, const char* quals):exons(1) {
35 gpertea 126 //novel=true;
36 gpertea 121 bam_header=NULL;
37     b=bam_init1();
38     b->core.tid=gseq_tid;
39     if (pos<=0) {
40     b->core.pos=-1; //unmapped
41     //if (gseq_tid<0)
42     b->core.flag |= BAM_FUNMAP;
43     }
44     else b->core.pos=pos-1; //BAM is 0-based
45     b->core.qual=255;
46     int l_qseq=strlen(qseq);
47     //this may not be accurate, setting CIGAR is the correct way
48     //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
49     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
50     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
51     set_cigar(cigar); //this will also set core.bin
52     add_sequence(qseq, l_qseq);
53     add_quals(quals); //quals must be given as Phred33
54     if (reverse) { b->core.flag |= BAM_FREVERSE ; }
55     }
56    
57     GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
58     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
59     int insert_size, const char* qseq, const char* quals,
60 gpertea 124 GVec<char*>* aux_strings):exons(1) {
61 gpertea 126 //novel=true;
62 gpertea 121 bam_header=NULL;
63     b=bam_init1();
64     b->core.tid=g_tid;
65     b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
66     b->core.qual=map_qual;
67     int l_qseq=strlen(qseq);
68     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
69     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
70     set_cigar(cigar); //this will also set core.bin
71     add_sequence(qseq, l_qseq);
72     add_quals(quals); //quals must be given as Phred33
73     set_flags(flags);
74     set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
75     if (aux_strings!=NULL) {
76     for (int i=0;i<aux_strings->Count();i++) {
77     add_aux(aux_strings->Get(i));
78     }
79     }
80     }
81 gpertea 124
82 gpertea 121 void GBamRecord::set_cigar(const char* cigar) {
83     //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
84     int doff=b->core.l_qname;
85     uint8_t* after_cigar=NULL;
86     int after_cigar_len=0;
87     uint8_t* prev_bdata=NULL;
88     if (b->data_len>doff) {
89     //cigar string already allocated, replace it
90     int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
91     after_cigar=b->data+d;
92     after_cigar_len=b->data_len-d;
93     }
94     const char *s;
95     char *t;
96     int i, op;
97     long x;
98     b->core.n_cigar = 0;
99     if (cigar != NULL && strcmp(cigar, "*") != 0) {
100     for (s = cigar; *s; ++s) {
101     if (isalpha(*s)) b->core.n_cigar++;
102     else if (!isdigit(*s)) {
103     GError("Error: invalid CIGAR character (%s)\n",cigar);
104     }
105     }
106     if (after_cigar_len>0) { //replace/insert into existing full data
107     prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
108     memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
109     free(prev_bdata);
110     }
111     else {
112     realloc_bdata(b, doff + b->core.n_cigar * 4);
113     }
114     for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
115     x = strtol(s, &t, 10);
116     op = toupper(*t);
117     if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
118     else if (op == 'I') op = BAM_CINS;
119     else if (op == 'D') op = BAM_CDEL;
120     else if (op == 'N') op = BAM_CREF_SKIP;
121     else if (op == 'S') op = BAM_CSOFT_CLIP;
122     else if (op == 'H') op = BAM_CHARD_CLIP;
123     else if (op == 'P') op = BAM_CPAD;
124     else GError("Error: invalid CIGAR operation (%s)\n",cigar);
125     s = t + 1;
126     bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
127     }
128     if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar);
129     b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
130     } else {//no CIGAR string given
131     if (!(b->core.flag&BAM_FUNMAP)) {
132     GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
133     b->core.flag |= BAM_FUNMAP;
134     }
135     b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
136     }
137 gpertea 124 setupCoordinates();
138 gpertea 121 } //set_cigar()
139    
140     void GBamRecord::add_sequence(const char* qseq, int slen) {
141     //must be called AFTER set_cigar (cannot replace existing sequence for now)
142     if (qseq==NULL) return; //should we ever care about this?
143     if (slen<0) slen=strlen(qseq);
144     int doff = b->core.l_qname + b->core.n_cigar * 4;
145     if (strcmp(qseq, "*")!=0) {
146     b->core.l_qseq=slen;
147     if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
148     GError("Error: CIGAR and sequence length are inconsistent!(%s)\n",
149     qseq);
150     uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
151     //also allocated quals memory
152     memset(p, 0, (b->core.l_qseq+1)/2);
153     for (int i = 0; i < b->core.l_qseq; ++i)
154     p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
155     } else b->core.l_qseq = 0;
156     }
157    
158     void GBamRecord::add_quals(const char* quals) {
159     //requires core.l_qseq already set
160     //and must be called AFTER add_sequence(), which also allocates the memory for quals
161     uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
162     if (quals==NULL || strcmp(quals, "*") == 0) {
163     for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
164     return;
165     }
166     for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
167     }
168    
169     void GBamRecord::add_aux(const char* str) {
170     //requires: being called AFTER add_quals()
171     static char tag[2];
172     static uint8_t abuf[512];
173     //requires: being called AFTER add_quals()
174     int strl=strlen(str);
175     //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
176     //int doff0=doff;
177     if (strl < 6 || str[2] != ':' || str[4] != ':')
178     parse_error("missing colon in auxiliary data");
179     tag[0] = str[0]; tag[1] = str[1];
180     uint8_t atype = str[3];
181     uint8_t* adata=abuf;
182     int alen=0;
183     if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
184     atype='A';
185     alen=1;
186     adata=(uint8_t*)&str[5];
187     }
188     else if (atype == 'I' || atype == 'i') {
189     long long x=(long long)atoll(str + 5);
190     if (x < 0) {
191     if (x >= -127) {
192     atype='c';
193     abuf[0] = (int8_t)x;
194     alen=1;
195     }
196     else if (x >= -32767) {
197     atype = 's';
198     *(int16_t*)abuf = (int16_t)x;
199     alen=2;
200     }
201     else {
202     atype='i';
203     *(int32_t*)abuf = (int32_t)x;
204     alen=4;
205     if (x < -2147483648ll)
206     GMessage("Parse warning: integer %lld is out of range.", x);
207     }
208     } else { //x >=0
209     if (x <= 255) {
210     atype = 'C';
211     abuf[0] = (uint8_t)x;
212     alen=1;
213     }
214     else if (x <= 65535) {
215     atype='S';
216     *(uint16_t*)abuf = (uint16_t)x;
217     alen=2;
218     }
219     else {
220     atype='I';
221     *(uint32_t*)abuf = (uint32_t)x;
222     alen=4;
223     if (x > 4294967295ll)
224     GMessage("Parse warning: integer %lld is out of range.", x);
225     }
226     }
227     } //integer type
228     else if (atype == 'f') {
229     *(float*)abuf = (float)atof(str + 5);
230     alen = sizeof(float);
231     }
232     else if (atype == 'd') { //?
233     *(float*)abuf = (float)atof(str + 9);
234     alen=8;
235     }
236     else if (atype == 'Z' || atype == 'H') {
237     if (atype == 'H') { // check whether the hex string is valid
238     if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
239     for (int i = 0; i < strl - 5; ++i) {
240     int c = toupper(str[5 + i]);
241     if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
242     parse_error("invalid hex character");
243     }
244     }
245     memcpy(abuf, str + 5, strl - 5);
246     abuf[strl-5] = 0;
247     alen=strl-4;
248     } else parse_error("unrecognized aux type");
249     this->add_aux(tag, atype, alen, adata);
250     }//add_aux()
251    
252    
253     void GBamRecord::setupCoordinates() {
254     uint32_t *cigar = bam1_cigar(b);
255     const bam1_core_t *c = &b->core;
256 gpertea 124 if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */
257 gpertea 121 int l=0;
258 gpertea 124 start=c->pos+1;
259     int exstart=c->pos;
260     for (int i = 0; i < c->n_cigar; ++i) {
261     int op = cigar[i]&0xf;
262     if (op == BAM_CMATCH || op==BAM_CEQUAL ||
263     op == BAM_CDIFF || op == BAM_CDEL) {
264 gpertea 121 l += cigar[i]>>4;
265 gpertea 124 }
266     else if (op == BAM_CREF_SKIP) { //N
267     //intron starts
268     //exon ends here
269 gpertea 130 GSeg exon(exstart+1,c->pos+l);
270 gpertea 126 exons.Add(exon);
271 gpertea 124 l += cigar[i]>>4;
272     exstart=c->pos+l;
273     }
274     }
275 gpertea 130 GSeg exon(exstart+1,c->pos+l);
276 gpertea 126 exons.Add(exon);
277 gpertea 124 end=c->pos+l;
278 gpertea 121 }
279 gpertea 124
280     int GBamRecord::find_tag(const char tag[2], uint8_t* & s, char& tag_type) {
281     //position s at the beginning of tag "data" (after the type char)
282     //returns the length of tag data, and tag type in tag_type
283     s=bam1_aux(b);
284     while (s < b->data + b->data_len) {
285     char key[2];
286     key[0] = (char)s[0]; key[1] = (char)s[1];
287     s += 2; tag_type = (char)*s; ++s;
288     int inc=0;
289     int m_inc=0; //for 'B' case
290     uint8_t sub_type=0; // --,,--
291     switch (tag_type) {
292     case 'A':
293     case 'C':
294     case 'c':
295     inc=1;
296     break;
297     case 'S':
298     case 's':
299     inc=2;
300     break;
301     case 'I':
302     case 'i':
303     case 'f':
304     inc=4;
305     break;
306     case 'd':
307     inc=8;
308     break;
309     case 'B':
310     sub_type = *(s+1);
311     int32_t n;
312     memcpy(&n, s+1, 4);
313     inc += 5;
314     //kputc(type, &str); kputc(':', &str); kputc(sub_type, &str);
315     m_inc=0;
316     if (sub_type == 'c' || sub_type == 'C')
317     { m_inc=1; }
318     else if (sub_type == 's' || sub_type == 'S')
319     { m_inc = 2; }
320     else if ('i' == sub_type || 'I' == sub_type || 'f' == sub_type)
321     { m_inc = 4; }
322     if (m_inc==0) GError("Error: invalid 'B' array subtype (%c)!\n",sub_type);
323     inc += m_inc*n;
324     break;
325     case 'H':
326     case 'Z':
327     while (*(s+inc)) ++inc;
328     break;
329     } //switch (tag_type)
330     if (tag[0]==key[0] && tag[1]==key[1])
331     return inc;
332 gpertea 127 s+=inc;
333 gpertea 124 }//while aux data
334     return 0;
335     }
336    
337     char GBamRecord::tag_char(const char tag[2]) { //retrieve tag data as single char
338     uint8_t *s;
339     char tag_type=0;
340     int vlen=find_tag(tag, s, tag_type);
341     if (vlen==0) return 0;
342     //if (vlen>1) GWarning("Warning: tag %c%c value has length %d, but char was expected.\n",
343     // tag[0],tag[1],vlen);
344     return (char)s[0];
345     }
346    
347     int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag
348     uint8_t *s;
349     char tag_type=0;
350     int vlen=find_tag(tag, s, tag_type);
351     if (vlen==0) return 0;
352     if (vlen==1) return (int)(*(int8_t*)s);
353     else if (vlen==2) return (int)(*(int16_t*)s);
354     else if (vlen==4) return (int)(*(int32_t*)s);
355     else GMessage("Warning: tag %c%c value has length %d, but int type was expected.\n",
356     tag[0],tag[1], vlen);
357     return 0;
358     }
359    
360     char* GBamRecord::tag_str(const char tag[2]) { //return string value for a tag
361     uint8_t *s;
362     char tag_type=0;
363     int vlen=find_tag(tag, s, tag_type);
364     if (vlen==0) return NULL; //not found
365     //if (vlen>1) GWarning("Warning: tag %c%c value has length %d, but char was expected.\n",
366     // tag[0],tag[1],vlen);
367     return (char *)s;
368     }
369    
370     char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag
371     uint8_t *s;
372     char tag_type=0;
373     int vlen=find_tag("XS", s, tag_type);
374     if (vlen==0) return '.';
375     return (char)s[0];
376     }
377    
378     char* GBamRecord::sequence() { //user must free this after use
379     char *s = (char*)bam1_seq(b);
380     char* qseq=NULL;
381     GMALLOC(qseq, (b->core.l_qseq+1));
382     int i;
383     for (i=0;i<(b->core.l_qseq);i++) {
384     int8_t v = bam1_seqi(s,i);
385     qseq[i] = bam_nt16_rev_table[v];
386     }
387     qseq[i] = 0;
388     return qseq;
389     }
390    
391     char* GBamRecord::qualities() {//user must free this after use
392     char *qual = (char*)bam1_qual(b);
393     char* qv=NULL;
394     GMALLOC(qv, (b->core.l_qseq+1) );
395     int i;
396     for(i=0;i<(b->core.l_qseq);i++) {
397     qv[i]=qual[i]+33;
398     }
399     qv[i]=0;
400     return qv;
401     }
402    
403     char* GBamRecord::cigar() { //returns text version of the CIGAR string; must be freed by user
404     kstring_t str;
405     str.l = str.m = 0; str.s = 0;
406     if (b->core.n_cigar == 0) kputc('*', &str);
407     else {
408     for (int i = 0; i < b->core.n_cigar; ++i) {
409     kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
410     kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);
411     }
412     }
413     return str.s;
414     }