ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/AceParser.cpp
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (13 years, 1 month ago) by gpertea
File size: 11256 byte(s)
Log Message:
sync with local source

Line User Rev File contents
1 gpertea 16 #include "AceParser.h"
2     #include <ctype.h>
3    
4     bool AceParser::open() { //also checks if it looks like a valid ACE file
5     if (f==stdin) return true;
6     if ((f=fopen(fname,"rb"))==NULL) return false;
7     char first3[4];
8     first3[3]='\0';
9     if (fread(first3, 1,3, f)==3) {
10     f_pos=3;
11     return (strcmp(first3, "AS ")==0 || strcmp(first3, "CO ")==0);
12     }
13     return false;
14     }
15    
16    
17     LytSeqInfo* AceParser::addSeq(char* s, LytCtgData* ctg) {
18     LytSeqInfo* seq;
19     if (!startsWith(s, "AF ", 3))
20     return NULL;
21     s+=3;
22     char* p=strchrs(s," \t");
23     if (p==NULL) return NULL;
24     p++;
25     char c;
26     int offs;
27     if (sscanf(p,"%c %d", &c, &offs)!=2) return NULL;
28     p--;
29     *p='\0';
30     if ((seq=seqinfo.Find(s))!=NULL) {
31     GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
32     " so it cannot be added for contig '%s (%d)'\n",
33     s, seq->contig->name, seq->contig->len,
34     ctg->name, ctg->len);
35     return NULL;
36     }
37     seq = new LytSeqInfo(s, ctg, offs, (c=='C') ? 1 : 0);
38     seqinfo.shkAdd(seq->name, seq);
39     ctg->seqs.Add(seq);
40     return seq;
41     }
42    
43    
44     bool AceParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
45    
46     bool forgetCtg = false;
47     if (ctgidx>=contigs.Count())
48     GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
49     LytCtgData* ctgdata=contigs[ctgidx];
50     if (re_pos && currentContig!=NULL) { //free previously loaded contig data
51     currentContig->seqs.Clear(); // unless it was a parse() call
52     seqinfo.Clear();
53     }
54     currentContig=ctgdata;
55     int ctg_numSeqs=ctgdata->numseqs;
56    
57     if (re_pos) {
58     seek(ctgdata->fpos); //position right where the contig definition starts
59     char *r = linebuf->getLine(f,f_pos);
60     if (r==NULL) return false;
61     }
62    
63     if (seqfn!=NULL) { //process the contig sequence!
64     char* ctgseq=readSeq();
65     forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, ctgseq);
66     GFREE(ctgseq); //obviously the caller should have made a copy
67     }
68     //now look for all the component sequences
69     if (fskipTo("AF ")<0) {
70     GMessage("AceParser: error finding sequence offsets (AF)"
71     " for contig '%s' (%d)\n", ctgdata->name, ctgdata->len);
72     return false;
73     }
74     int numseqs=0;
75     while (startsWith(linebuf->chars(), "AF ",3)) {
76     if (addSeq(linebuf->chars(), ctgdata)==NULL) {
77     GMessage("AceParser: error parsing AF entry:\n%s\n",linebuf->chars());
78     return false;
79     }
80     numseqs++;
81     //read next line:
82     linebuf->getLine(f,f_pos);
83     }
84     if (numseqs!=ctg_numSeqs) {
85     GMessage("Invalid number of AF entries found (%d) for contig '%s' "
86     "(length %d, numseqs %d)\n", numseqs,
87     ctgdata->name, ctgdata->len, ctg_numSeqs);
88     return false;
89     }
90     //now read each sequence entry
91     off_t seqpos=fskipTo("RD ");
92     numseqs=0; //count again, now the RD entries
93     if (seqpos<0) {
94     GMessage("AceParser: error locating first RD entry for contig '%s'\n",
95     ctgdata->name);
96     return false;
97     }
98     //int numseqs=0;
99     //reading the actual component sequence details
100     while (startsWith(linebuf->chars(), "RD ",3)) {
101     char* s=linebuf->chars()+3;
102     char* p=strchrs(s, " \t");
103     LytSeqInfo* seq;
104     if (p==NULL) {
105     GMessage("AceParser: Error parsing RD header line:\n%s\n", linebuf->chars());
106     return false;
107     }
108     *p='\0';
109     if ((seq=seqinfo.Find(s))==NULL) {
110     GMessage("AceParser: unknown RD encountered: '%s'\n", s);
111     return false;
112     }
113     p++; //now p is in linebuf after the RD name
114     seq->fpos=seqpos;
115     int len;
116     if (sscanf(p, "%d", &len)!=1) {
117     GMessage("AceParser: cannot parse RD length for '%s'\n", s);
118     return false;
119     }
120     seq->setLength(len);
121     //read the sequence data here if a callback fn was given:
122     char* sseq=NULL;
123     if (seqfn!=NULL)
124     sseq=readSeq(seq); //read full sequence here
125     if (fskipTo("QA ")<0) {
126     GMessage("AceParser: Error finding QA entry for read %s! (fpos=%llu)\n", seq->name, (unsigned long long)f_pos);
127     return false;
128     }
129     //parse QA entry:
130     int tmpa, tmpb;
131     if (sscanf(linebuf->chars()+3, "%d %d %d %d", &tmpa, &tmpb, &seq->left,&seq->right)!=4 ||
132     seq->left<=0 || seq->right<=0) {
133     GMessage("AceParser: Error parsing QA entry.\n");
134     return false;
135     }
136     /*
137     if (fskipTo("DS")<0) {
138     GMessage("AceParser: Error closing RD entry ('DS' not found).\n");
139     return false;
140     }
141     */
142     seqpos=getFilePos()+1;
143     bool forgetSeq=false;
144     if (seqfn!=NULL) {
145     forgetSeq=(*seqfn)(numContigs, ctgdata, seq, sseq);
146     GFREE(sseq);
147     }
148     if (forgetSeq) { //parsing the whole stream -- aceconv)
149     ctg_numSeqs--;
150     seqinfo.Remove(seq->name);
151     ctgdata->seqs.RemovePtr(seq);
152     }
153     numseqs++;
154     if (numseqs<ctgdata->numseqs)
155     seqpos=fskipTo("RD ", "CO "); //more sequences left to read
156     }
157     if (numseqs!=ctgdata->numseqs) {
158     GMessage("Error: Invalid number of RD entries found (%d) for contig '%s' "
159     "(length %d, numseqs %d)\n", numseqs,
160     ctgdata->name, ctgdata->len, ctg_numSeqs);
161     return false;
162     }
163     if (forgetCtg) {
164     ctgIDs.Remove(ctgdata->name);
165     ctgdata->seqs.Clear();
166     seqinfo.Clear();
167     contigs.RemovePtr(ctgdata);
168     }
169     return true;
170     }
171    
172     //read only the contig headers from the file
173     //also checks for duplicate seqnames (just in case)
174     bool AceParser::parseContigs() {
175     if (f!=stdin) seek(0);
176     off_t ctgpos;
177     numContigs=0;
178     while ((ctgpos=fskipTo("CO "))>=0) {
179     numContigs++;
180     LytCtgData* ctgdata=new LytCtgData(ctgpos);
181     char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
182     if (p==NULL) {
183     GMessage("AceParser: error parsing contig name!\n");
184     return false;
185     }
186     int ctglen, ctg_numSeqs, numseqs;
187     //p must be after contig name within linebuf!
188     if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
189     GMessage("Error parsing contig len and seq count at:\n%s\n",
190     linebuf->chars());
191     }
192     ctg_numSeqs=numseqs;
193     ctgdata->len=ctglen;
194     ctgdata->numseqs=numseqs;
195     ctgdata->offs = 1;
196     contigs.Add(ctgdata);
197     //ctgpos=fskipTo("CO ");
198     }
199     if (ctgpos==-2) return false; //parsing failed: line too long ?!?
200     contigs.setSorted(true);
201     return true;
202     }
203    
204    
205     bool AceParser::parse(fnLytSeq* seqfn) {
206     //read all seqs and their positions from the file
207     //also checks for duplicate seqnames (just in case)
208     if (f!=stdin) seek(0);
209     ctgIDs.Clear();
210     //GHash<int> ctgIDs; //contig IDs, to make them unique!
211     //
212     off_t ctgpos;
213     numContigs=0;
214     while ((ctgpos=fskipTo("CO "))>=0) {
215     numContigs++;
216     LytCtgData* ctgdata=new LytCtgData(ctgpos);
217     char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
218     if (p==NULL) {
219     GMessage("AceParser: error parsing contig name!\n");
220     return false;
221     }
222     int ctglen, numseqs;
223     //p must be after contig name within linebuf!
224     if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
225     GMessage("Error parsing contig len and seq count at:\n%s\n",
226     linebuf->chars());
227     }
228     ctgdata->len=ctglen;
229     ctgdata->numseqs = numseqs;
230     ctgdata->offs = 1;
231     int ctgidx=contigs.Add(ctgdata);
232     loadContig(ctgidx, seqfn, false);
233     } //while contigs
234     if (ctgpos==-2) return false; //parsing failed: line too long ?!?
235     contigs.setSorted(true);
236     return true;
237     }
238    
239    
240     void add_intron(char*& buf,int accrd, int& rgpos,
241     const char* numstr, LytSeqInfo* seqinfo, char lspl, char rspl) {
242     int n=atoi(numstr);
243     if (n<=0) return;
244     if (seqinfo!=NULL) {
245     char sL = 0;
246     char sR = 0;
247     if (lspl>0) sL=(lspl=='['?'S':'s');
248     if (rspl>0) sR=(rspl==']'?'S':'s');
249     seqinfo->addInterSeg(seqinfo->offs+rgpos-1,seqinfo->offs+rgpos+n,
250     0,0,sL,sR,accrd);
251     }
252     rgpos+=n;
253     }
254    
255     char* AceParser::readSeq(LytSeqInfo* sqinfo) {
256     //assumes the next line is where a sequence starts!
257     //stops at the next empty line encountered
258     char* buf;
259     static char rlenbuf[12]={0,0,0,0,0,0,0,0,0,0,0,0}; //buffer for parsing the gap length
260     int rlenbufacc=0; //how many digits accumulated in rlenbuf so far
261     int buflen=512;
262     GMALLOC(buf, buflen); //this MUST be freed by the caller
263     buf[0]='\0';
264     int accrd=0; //accumulated read length so far -- excludes interseg gaps!
265     int rgpos=0; //accumulated offset including interseg gaps!
266     char *r = linebuf->getLine(f,f_pos);
267     int linelen=linebuf->length();
268     char r_splice=0, l_splice=0;
269     while (linelen>0) {
270     if (r==NULL) {
271     GMessage("AceParser: error reading sequence data\n");
272     return NULL;
273     }
274     //-- pass the line content for accumulation
275     int i=0;
276     while (r[i]) {
277     while (r[i] && isdigit(r[i])) {
278     rlenbuf[rlenbufacc]=r[i];
279     rlenbufacc++;
280     i++;
281     }
282     if (r[i]==0) break; //end of line reached already
283     //now r[i] is surely a non-digit char
284     if (rlenbufacc>0) { //have we just had a number before?
285     rlenbuf[rlenbufacc]=0;
286     if (r[i]=='=' || r[i]=='-') {
287     i++;
288     if (r[i]==0) break;
289     } else { //check for splice site markers for this introns
290     if (r[i]=='('||r[i]=='[') {
291     l_splice=r[i];
292     i++;
293     if (r[i]==0) break;
294     }
295     if (r[i]==')'||r[i]==']') {
296     r_splice=r[i];
297     i++;
298     if (r[i]==0) break;
299     }
300     }//splice check
301     add_intron(buf, accrd, rgpos, rlenbuf, sqinfo, l_splice, r_splice);
302     rlenbufacc=0;
303     r_splice=0;l_splice=0;
304     //i++;//skip the gap character
305     }
306     //check for digits here and break the linebuf as needed
307     int bi=i; //start of non-digit run
308     while (r[i] && !isdigit(r[i])) i++;
309     int nl=(i-bi); //length of non-digit run
310     if (nl>0) {
311     int si=accrd;
312     accrd+=nl;
313     rgpos+=nl;
314     if (accrd>=buflen-1) {
315     buflen=accrd+512;
316     GREALLOC(buf,buflen);
317     }
318     //append these non-digit chars
319     for(int b=0;b<nl;b++) {
320     buf[si+b]=r[bi+b];
321     }
322     }//non-digit run
323     } //while line chars
324    
325     /*
326     //-- append the line to buf
327     accrd+=linelen;
328     if (accrd>=buflen) {
329     buflen+=1024;
330     GREALLOC(buf,buflen);
331     }
332     strcat(buf, r);
333     */
334    
335     r=linebuf->getLine(f,f_pos);
336     linelen=linebuf->length();
337     }//while linelen>0
338    
339     //add the 0-ending
340     buf[accrd]=0;
341     return buf;
342     }
343    
344     char* AceParser::getSeq(LytSeqInfo* seq) {
345     if (f==stdin || seek(seq->fpos)!=0) {
346     GMessage("AceParser: error seeking seq '%s'\n", seq->name);
347     return NULL;
348     }
349     //skip the contig header:
350     char* r=linebuf->getLine(f,f_pos);
351     if (r==NULL || !startsWith(linebuf->chars(), "RD ", 3)) {
352     GMessage("AceParser: error seeking seq '%s'\n"
353     " (no RD entry found at location %d)\n", seq->name, seq->fpos);
354     return NULL;
355     }
356     return readSeq(seq);
357     }
358    
359     char* AceParser::getContigSeq(LytCtgData* ctg) {
360     if (f==stdin || seek(ctg->fpos)!=0) {
361     GMessage("AceParser: error seeking contig '%s'\n", ctg->name);
362     return NULL;
363     }
364     //skip the contig header:
365     char* r=linebuf->getLine(f,f_pos);
366     if (r==NULL || !startsWith(linebuf->chars(), "CO ", 3)) {
367     GMessage("AceParser: error seeking contig '%s'\n"
368     " (no CO entry found at location %d)\n", ctg->name, ctg->fpos);
369     return NULL;
370     }
371     return readSeq();
372     }