ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/AceParser.cpp
(Generate patch)
# Line 1 | Line 1
1 < #include "AceParser.h"
2 < #include <ctype.h>
3 <
4 < bool AceParser::open() { //also checks if it looks like a valid ACE file
5 <   if (f==stdin) return true;
6 <   if ((f=fopen(fname,"rb"))==NULL) return false;
7 <   char first3[4];
8 <   first3[3]='\0';
9 <   if (fread(first3, 1,3, f)==3) {
10 <     f_pos=3;
11 <     return (strcmp(first3, "AS ")==0 || strcmp(first3, "CO ")==0);
12 <     }
13 <   return false;
14 <   }
15 <
16 <
17 < LytSeqInfo* AceParser::addSeq(char* s, LytCtgData* ctg) {
18 < LytSeqInfo* seq;
19 < if (!startsWith(s, "AF ", 3))
20 <       return NULL;
21 < s+=3;
22 < char* p=strchrs(s," \t");
23 < if (p==NULL) return NULL;
24 < p++;
25 < char c;
26 < int offs;
27 < if (sscanf(p,"%c %d", &c, &offs)!=2) return NULL;
28 < p--;
29 < *p='\0';
30 < if ((seq=seqinfo.Find(s))!=NULL) {
31 <   GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
32 <      " so it cannot be added for contig '%s (%d)'\n",
33 <     s, seq->contig->name, seq->contig->len,
34 <     ctg->name, ctg->len);
35 <   return NULL;
36 <   }
37 < seq = new LytSeqInfo(s, ctg, offs, (c=='C') ? 1 : 0);
38 < seqinfo.shkAdd(seq->name, seq);
39 < ctg->seqs.Add(seq);
40 < return seq;
41 < }
42 <
43 <
44 < bool AceParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
45 <
46 <    bool forgetCtg = false;
47 <    if (ctgidx>=contigs.Count())
48 <      GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
49 <    LytCtgData* ctgdata=contigs[ctgidx];
50 <    if (re_pos && currentContig!=NULL) { //free previously loaded contig data
51 <      currentContig->seqs.Clear();       // unless it was a parse() call
52 <      seqinfo.Clear();
53 <      }
54 <    currentContig=ctgdata;
55 <    int ctg_numSeqs=ctgdata->numseqs;
56 <
57 <    if (re_pos) {
58 <       seek(ctgdata->fpos); //position right where the contig definition starts
59 <       char *r = linebuf->getLine(f,f_pos);
60 <       if (r==NULL) return false;
61 <       }
62 <
63 <    if (seqfn!=NULL) { //process the contig sequence!
64 <       char* ctgseq=readSeq();
65 <       forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, ctgseq);
66 <       GFREE(ctgseq); //obviously the caller should have made a copy
67 <       }
68 <    //now look for all the component sequences
69 <    if (fskipTo("AF ")<0) {
70 <       GMessage("AceParser: error finding sequence offsets (AF)"
71 <                " for contig '%s' (%d)\n", ctgdata->name, ctgdata->len);
72 <       return false;
73 <       }
74 <    int numseqs=0;
75 <    while (startsWith(linebuf->chars(), "AF ",3)) {
76 <      if (addSeq(linebuf->chars(), ctgdata)==NULL) {
77 <         GMessage("AceParser: error parsing AF entry:\n%s\n",linebuf->chars());
78 <         return false;
79 <         }
80 <       numseqs++;
81 <      //read next line:
82 <      linebuf->getLine(f,f_pos);
83 <      }
84 <    if (numseqs!=ctg_numSeqs) {
85 <      GMessage("Invalid number of AF entries found (%d) for contig '%s' "
86 <         "(length %d, numseqs %d)\n", numseqs,
87 <                ctgdata->name, ctgdata->len, ctg_numSeqs);
88 <      return false;
89 <      }
90 <    //now read each sequence entry
91 <    off_t seqpos=fskipTo("RD ");
92 <    numseqs=0; //count again, now the RD entries
93 <    if (seqpos<0) {
94 <         GMessage("AceParser: error locating first RD entry for contig '%s'\n",
95 <             ctgdata->name);
96 <         return false;
97 <         }
98 <    //int numseqs=0;
99 <    //reading the actual component sequence details
100 <    while (startsWith(linebuf->chars(), "RD ",3)) {
101 <      char* s=linebuf->chars()+3;
102 <      char* p=strchrs(s, " \t");
103 <      LytSeqInfo* seq;
104 <      if (p==NULL) {
105 <          GMessage("AceParser: Error parsing RD header line:\n%s\n", linebuf->chars());
106 <          return false;
107 <          }
108 <      *p='\0';
109 <      if ((seq=seqinfo.Find(s))==NULL) {
110 <          GMessage("AceParser: unknown RD encountered: '%s'\n", s);
111 <          return false;
112 <          }
113 <      p++; //now p is in linebuf after the RD name
114 <      seq->fpos=seqpos;
115 <      int len;
116 <      if (sscanf(p, "%d", &len)!=1) {
117 <          GMessage("AceParser: cannot parse RD length for '%s'\n", s);
118 <          return false;
119 <          }
120 <      seq->setLength(len);
121 <      //read the sequence data here if a callback fn was given:
122 <      char* sseq=NULL;
123 <      if (seqfn!=NULL)
124 <          sseq=readSeq(seq); //read full sequence here
125 <      if (fskipTo("QA ")<0) {
126 <           GMessage("AceParser: Error finding QA entry for read %s! (fpos=%llu)\n", seq->name, (unsigned long long)f_pos);
127 <           return false;
128 <           }
129 <      //parse QA entry:
130 <      int tmpa, tmpb;
131 <      if (sscanf(linebuf->chars()+3, "%d %d %d %d", &tmpa, &tmpb, &seq->left,&seq->right)!=4 ||
132 <             seq->left<=0 || seq->right<=0) {
133 <           GMessage("AceParser: Error parsing QA entry.\n");
134 <           return false;
135 <           }
136 <      /*
137 <      if (fskipTo("DS")<0) {
138 <           GMessage("AceParser: Error closing RD entry ('DS' not found).\n");
139 <           return false;
140 <           }
141 <           */
142 <      seqpos=getFilePos()+1;
143 <      bool forgetSeq=false;
144 <      if (seqfn!=NULL) {
145 <          forgetSeq=(*seqfn)(numContigs, ctgdata, seq, sseq);
146 <          GFREE(sseq);
147 <          }
148 <      if (forgetSeq) { //parsing the whole stream -- aceconv)
149 <            ctg_numSeqs--;
150 <            seqinfo.Remove(seq->name);
151 <            ctgdata->seqs.RemovePtr(seq);
152 <            }
153 <      numseqs++;
154 <      if (numseqs<ctgdata->numseqs)
155 <        seqpos=fskipTo("RD ", "CO "); //more sequences left to read
156 <      }
157 <    if (numseqs!=ctgdata->numseqs) {
158 <      GMessage("Error: Invalid number of RD entries found (%d) for contig '%s' "
159 <         "(length %d, numseqs %d)\n", numseqs,
160 <                ctgdata->name, ctgdata->len, ctg_numSeqs);
161 <      return false;
162 <      }
163 <    if (forgetCtg) {
164 <      ctgIDs.Remove(ctgdata->name);
165 <      ctgdata->seqs.Clear();
166 <      seqinfo.Clear();
167 <      contigs.RemovePtr(ctgdata);
168 <      }
169 < return true;
170 < }
171 <
172 < //read only the contig headers from the file
173 < //also checks for duplicate seqnames (just in case)
174 < bool AceParser::parseContigs() {
175 <  if (f!=stdin)  seek(0);
176 <  off_t ctgpos;
177 <  numContigs=0;
178 <  while ((ctgpos=fskipTo("CO "))>=0) {
179 <    numContigs++;
180 <    LytCtgData* ctgdata=new LytCtgData(ctgpos);
181 <    char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
182 <    if (p==NULL) {
183 <       GMessage("AceParser: error parsing contig name!\n");
184 <       return false;
185 <       }
186 <    int ctglen, ctg_numSeqs, numseqs;
187 <    //p must be after contig name within linebuf!
188 <    if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
189 <      GMessage("Error parsing contig len and seq count at:\n%s\n",
190 <         linebuf->chars());
191 <      }
192 <    ctg_numSeqs=numseqs;
193 <    ctgdata->len=ctglen;
194 <    ctgdata->numseqs=numseqs;
195 <    ctgdata->offs = 1;
196 <    contigs.Add(ctgdata);
197 <    //ctgpos=fskipTo("CO ");
198 <    }
199 <  if (ctgpos==-2) return false; //parsing failed: line too long ?!?
200 <  contigs.setSorted(true);
201 <  return true;
202 < }
203 <
204 <
205 < bool AceParser::parse(fnLytSeq* seqfn) {
206 <  //read all seqs and their positions from the file
207 <  //also checks for duplicate seqnames (just in case)
208 <  if (f!=stdin)  seek(0);
209 <  ctgIDs.Clear();
210 <  //GHash<int> ctgIDs; //contig IDs, to make them unique!
211 <  //
212 <  off_t ctgpos;
213 <  numContigs=0;
214 <  while ((ctgpos=fskipTo("CO "))>=0) {
215 <    numContigs++;
216 <    LytCtgData* ctgdata=new LytCtgData(ctgpos);
217 <    char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
218 <    if (p==NULL) {
219 <       GMessage("AceParser: error parsing contig name!\n");
220 <       return false;
221 <       }
222 <        int ctglen, numseqs;
223 <        //p must be after contig name within linebuf!
224 <        if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
225 <          GMessage("Error parsing contig len and seq count at:\n%s\n",
226 <                 linebuf->chars());
227 <          }
228 <        ctgdata->len=ctglen;
229 <        ctgdata->numseqs = numseqs;
230 <        ctgdata->offs = 1;
231 <        int ctgidx=contigs.Add(ctgdata);
232 <        loadContig(ctgidx, seqfn, false);
233 <        } //while contigs
234 <  if (ctgpos==-2) return false; //parsing failed: line too long ?!?
235 <  contigs.setSorted(true);
236 <  return true;
237 <  }
238 <
239 <
240 < void add_intron(char*& buf,int accrd, int& rgpos,
241 <        const char* numstr, LytSeqInfo* seqinfo, char lspl, char rspl) {
242 <        int n=atoi(numstr);
243 <        if (n<=0) return;
244 <        if (seqinfo!=NULL) {
245 <          char sL = 0;
246 <      char sR = 0;
247 <      if (lspl>0) sL=(lspl=='['?'S':'s');
248 <      if (rspl>0) sR=(rspl==']'?'S':'s');
249 <          seqinfo->addInterSeg(seqinfo->offs+rgpos-1,seqinfo->offs+rgpos+n,
250 <                                                   0,0,sL,sR,accrd);
251 <          }
252 <        rgpos+=n;
253 < }
254 <
255 < char* AceParser::readSeq(LytSeqInfo* seqinfo) {
256 < //assumes the next line is where a sequence starts!
257 < //stops at the next empty line encountered
258 < char* buf;
259 < static char rlenbuf[12]={0,0,0,0,0,0,0,0,0,0,0,0}; //buffer for parsing the gap length
260 < int rlenbufacc=0; //how many digits accumulated in rlenbuf so far
261 < int buflen=512;
262 < GMALLOC(buf, buflen); //this MUST be freed by the caller
263 < buf[0]='\0';
264 < int accrd=0; //accumulated read length so far -- excludes interseg gaps!
265 < int rgpos=0; //accumulated offset including interseg gaps!
266 < char *r = linebuf->getLine(f,f_pos);
267 < int linelen=linebuf->length();
268 < char r_splice=0, l_splice=0;
269 < while (linelen>0) {
270 <   if (r==NULL) {
271 <          GMessage("AceParser: error reading sequence data\n");
272 <          return NULL;
273 <          }
274 <   //-- pass the line content for accumulation
275 <   int i=0;
276 <   while (r[i]) {
277 <         while (r[i] && isdigit(r[i])) {
278 <                 rlenbuf[rlenbufacc]=r[i];
279 <                 rlenbufacc++;
280 <                 i++;
281 <                 }
282 <         if (r[i]==0) break; //end of line reached already
283 <         //now r[i] is surely a non-digit char
284 <         if (rlenbufacc>0) { //have we just had a number before?
285 <                rlenbuf[rlenbufacc]=0;
286 <                if (r[i]=='=' || r[i]=='-') {
287 <                     i++;
288 <                     if (r[i]==0) break;
289 <                   } else { //check for splice site markers for this introns
290 <                     if (r[i]=='('||r[i]=='[') {
291 <                         l_splice=r[i];
292 <                         i++;
293 <                         if (r[i]==0) break;
294 <                         }
295 <                     if (r[i]==')'||r[i]==']') {
296 <                 r_splice=r[i];
297 <                 i++;
298 <                 if (r[i]==0) break;
299 <                         }
300 <                   }//splice check
301 <                add_intron(buf, accrd, rgpos, rlenbuf, seqinfo, l_splice, r_splice);
302 <                rlenbufacc=0;
303 <                r_splice=0;l_splice=0;
304 <                //i++;//skip the gap character
305 <                }
306 <         //check for digits here and break the linebuf as needed
307 <         int bi=i; //start of non-digit run
308 <         while (r[i] && !isdigit(r[i])) i++;
309 <         int nl=(i-bi); //length of non-digit run
310 <         if (nl>0) {
311 <                int si=accrd;
312 <                accrd+=nl;
313 <                rgpos+=nl;
314 <                if (accrd>=buflen-1) {
315 <                        buflen=accrd+512;
316 <                        GREALLOC(buf,buflen);
317 <                        }
318 <                //append these non-digit chars
319 <                for(int b=0;b<nl;b++) {
320 <                        buf[si+b]=r[bi+b];
321 <                        }
322 <                }//non-digit run
323 <         } //while line chars
324 <
325 <   /*
326 <   //-- append the line to buf
327 <   accrd+=linelen;
328 <   if (accrd>=buflen) {
329 <                buflen+=1024;
330 <                GREALLOC(buf,buflen);
331 <                }
332 <   strcat(buf, r);
333 <   */
334 <
335 <   r=linebuf->getLine(f,f_pos);
336 <   linelen=linebuf->length();
337 <   }//while linelen>0
338 <
339 < //add the 0-ending
340 < buf[accrd]=0;
341 < return buf;
342 < }
343 <
344 < char* AceParser::getSeq(LytSeqInfo* seq) {
345 < if (f==stdin || seek(seq->fpos)!=0) {
346 <   GMessage("AceParser: error seeking seq '%s'\n", seq->name);
347 <   return NULL;
348 <   }
349 < //skip the contig header:
350 < char* r=linebuf->getLine(f,f_pos);
351 < if (r==NULL || !startsWith(linebuf->chars(), "RD ", 3)) {
352 <        GMessage("AceParser: error seeking seq '%s'\n"
353 <                " (no RD entry found at location %d)\n", seq->name, seq->fpos);
354 <    return NULL;
355 <        }
356 < return readSeq(seq);
357 < }
358 <
359 < char* AceParser::getContigSeq(LytCtgData* ctg) {
360 < if (f==stdin || seek(ctg->fpos)!=0) {
361 <   GMessage("AceParser: error seeking contig '%s'\n", ctg->name);
362 <   return NULL;
363 <   }
364 < //skip the contig header:
365 < char* r=linebuf->getLine(f,f_pos);
366 < if (r==NULL || !startsWith(linebuf->chars(), "CO ", 3)) {
367 <    GMessage("AceParser: error seeking contig '%s'\n"
368 <        " (no CO entry found at location %d)\n", ctg->name, ctg->fpos);
369 <    return NULL;
370 <    }
371 < return readSeq();
372 < }
373 <
374 <
1 > #include "AceParser.h"
2 > #include <ctype.h>
3 >
4 > bool AceParser::open() { //also checks if it looks like a valid ACE file
5 >   if (f==stdin) return true;
6 >   if ((f=fopen(fname,"rb"))==NULL) return false;
7 >   char first3[4];
8 >   first3[3]='\0';
9 >   if (fread(first3, 1,3, f)==3) {
10 >     f_pos=3;
11 >     return (strcmp(first3, "AS ")==0 || strcmp(first3, "CO ")==0);
12 >     }
13 >   return false;
14 >   }
15 >
16 >
17 > LytSeqInfo* AceParser::addSeq(char* s, LytCtgData* ctg) {
18 > LytSeqInfo* seq;
19 > if (!startsWith(s, "AF ", 3))
20 >       return NULL;
21 > s+=3;
22 > char* p=strchrs(s," \t");
23 > if (p==NULL) return NULL;
24 > p++;
25 > char c;
26 > int offs;
27 > if (sscanf(p,"%c %d", &c, &offs)!=2) return NULL;
28 > p--;
29 > *p='\0';
30 > if ((seq=seqinfo.Find(s))!=NULL) {
31 >   GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
32 >      " so it cannot be added for contig '%s (%d)'\n",
33 >     s, seq->contig->name, seq->contig->len,
34 >     ctg->name, ctg->len);
35 >   return NULL;
36 >   }
37 > seq = new LytSeqInfo(s, ctg, offs, (c=='C') ? 1 : 0);
38 > seqinfo.shkAdd(seq->name, seq);
39 > ctg->seqs.Add(seq);
40 > return seq;
41 > }
42 >
43 >
44 > bool AceParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
45 >
46 >    bool forgetCtg = false;
47 >    if (ctgidx>=contigs.Count())
48 >      GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
49 >    LytCtgData* ctgdata=contigs[ctgidx];
50 >    if (re_pos && currentContig!=NULL) { //free previously loaded contig data
51 >      currentContig->seqs.Clear();       // unless it was a parse() call
52 >      seqinfo.Clear();
53 >      }
54 >    currentContig=ctgdata;
55 >    int ctg_numSeqs=ctgdata->numseqs;
56 >
57 >    if (re_pos) {
58 >       seek(ctgdata->fpos); //position right where the contig definition starts
59 >       char *r = linebuf->getLine(f,f_pos);
60 >       if (r==NULL) return false;
61 >       }
62 >
63 >    if (seqfn!=NULL) { //process the contig sequence!
64 >       char* ctgseq=readSeq();
65 >       forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, ctgseq);
66 >       GFREE(ctgseq); //obviously the caller should have made a copy
67 >       }
68 >    //now look for all the component sequences
69 >    if (fskipTo("AF ")<0) {
70 >       GMessage("AceParser: error finding sequence offsets (AF)"
71 >                " for contig '%s' (%d)\n", ctgdata->name, ctgdata->len);
72 >       return false;
73 >       }
74 >    int numseqs=0;
75 >    while (startsWith(linebuf->chars(), "AF ",3)) {
76 >      if (addSeq(linebuf->chars(), ctgdata)==NULL) {
77 >         GMessage("AceParser: error parsing AF entry:\n%s\n",linebuf->chars());
78 >         return false;
79 >         }
80 >       numseqs++;
81 >      //read next line:
82 >      linebuf->getLine(f,f_pos);
83 >      }
84 >    if (numseqs!=ctg_numSeqs) {
85 >      GMessage("Invalid number of AF entries found (%d) for contig '%s' "
86 >         "(length %d, numseqs %d)\n", numseqs,
87 >                ctgdata->name, ctgdata->len, ctg_numSeqs);
88 >      return false;
89 >      }
90 >    //now read each sequence entry
91 >    off_t seqpos=fskipTo("RD ");
92 >    numseqs=0; //count again, now the RD entries
93 >    if (seqpos<0) {
94 >         GMessage("AceParser: error locating first RD entry for contig '%s'\n",
95 >             ctgdata->name);
96 >         return false;
97 >         }
98 >    //int numseqs=0;
99 >    //reading the actual component sequence details
100 >    while (startsWith(linebuf->chars(), "RD ",3)) {
101 >      char* s=linebuf->chars()+3;
102 >      char* p=strchrs(s, " \t");
103 >      LytSeqInfo* seq;
104 >      if (p==NULL) {
105 >          GMessage("AceParser: Error parsing RD header line:\n%s\n", linebuf->chars());
106 >          return false;
107 >          }
108 >      *p='\0';
109 >      if ((seq=seqinfo.Find(s))==NULL) {
110 >          GMessage("AceParser: unknown RD encountered: '%s'\n", s);
111 >          return false;
112 >          }
113 >      p++; //now p is in linebuf after the RD name
114 >      seq->fpos=seqpos;
115 >      int len;
116 >      if (sscanf(p, "%d", &len)!=1) {
117 >          GMessage("AceParser: cannot parse RD length for '%s'\n", s);
118 >          return false;
119 >          }
120 >      seq->setLength(len);
121 >      //read the sequence data here if a callback fn was given:
122 >      char* sseq=NULL;
123 >      if (seqfn!=NULL)
124 >          sseq=readSeq(seq); //read full sequence here
125 >      if (fskipTo("QA ")<0) {
126 >           GMessage("AceParser: Error finding QA entry for read %s! (fpos=%llu)\n", seq->name, (unsigned long long)f_pos);
127 >           return false;
128 >           }
129 >      //parse QA entry:
130 >      int tmpa, tmpb;
131 >      if (sscanf(linebuf->chars()+3, "%d %d %d %d", &tmpa, &tmpb, &seq->left,&seq->right)!=4 ||
132 >             seq->left<=0 || seq->right<=0) {
133 >           GMessage("AceParser: Error parsing QA entry.\n");
134 >           return false;
135 >           }
136 >      /*
137 >      if (fskipTo("DS")<0) {
138 >           GMessage("AceParser: Error closing RD entry ('DS' not found).\n");
139 >           return false;
140 >           }
141 >           */
142 >      seqpos=getFilePos()+1;
143 >      bool forgetSeq=false;
144 >      if (seqfn!=NULL) {
145 >          forgetSeq=(*seqfn)(numContigs, ctgdata, seq, sseq);
146 >          GFREE(sseq);
147 >          }
148 >      if (forgetSeq) { //parsing the whole stream -- aceconv)
149 >            ctg_numSeqs--;
150 >            seqinfo.Remove(seq->name);
151 >            ctgdata->seqs.RemovePtr(seq);
152 >            }
153 >      numseqs++;
154 >      if (numseqs<ctgdata->numseqs)
155 >        seqpos=fskipTo("RD ", "CO "); //more sequences left to read
156 >      }
157 >    if (numseqs!=ctgdata->numseqs) {
158 >      GMessage("Error: Invalid number of RD entries found (%d) for contig '%s' "
159 >         "(length %d, numseqs %d)\n", numseqs,
160 >                ctgdata->name, ctgdata->len, ctg_numSeqs);
161 >      return false;
162 >      }
163 >    if (forgetCtg) {
164 >      ctgIDs.Remove(ctgdata->name);
165 >      ctgdata->seqs.Clear();
166 >      seqinfo.Clear();
167 >      contigs.RemovePtr(ctgdata);
168 >      }
169 > return true;
170 > }
171 >
172 > //read only the contig headers from the file
173 > //also checks for duplicate seqnames (just in case)
174 > bool AceParser::parseContigs() {
175 >  if (f!=stdin)  seek(0);
176 >  off_t ctgpos;
177 >  numContigs=0;
178 >  while ((ctgpos=fskipTo("CO "))>=0) {
179 >    numContigs++;
180 >    LytCtgData* ctgdata=new LytCtgData(ctgpos);
181 >    char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
182 >    if (p==NULL) {
183 >       GMessage("AceParser: error parsing contig name!\n");
184 >       return false;
185 >       }
186 >    int ctglen, ctg_numSeqs, numseqs;
187 >    //p must be after contig name within linebuf!
188 >    if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
189 >      GMessage("Error parsing contig len and seq count at:\n%s\n",
190 >         linebuf->chars());
191 >      }
192 >    ctg_numSeqs=numseqs;
193 >    ctgdata->len=ctglen;
194 >    ctgdata->numseqs=numseqs;
195 >    ctgdata->offs = 1;
196 >    contigs.Add(ctgdata);
197 >    //ctgpos=fskipTo("CO ");
198 >    }
199 >  if (ctgpos==-2) return false; //parsing failed: line too long ?!?
200 >  contigs.setSorted(true);
201 >  return true;
202 > }
203 >
204 >
205 > bool AceParser::parse(fnLytSeq* seqfn) {
206 >  //read all seqs and their positions from the file
207 >  //also checks for duplicate seqnames (just in case)
208 >  if (f!=stdin)  seek(0);
209 >  ctgIDs.Clear();
210 >  //GHash<int> ctgIDs; //contig IDs, to make them unique!
211 >  //
212 >  off_t ctgpos;
213 >  numContigs=0;
214 >  while ((ctgpos=fskipTo("CO "))>=0) {
215 >    numContigs++;
216 >    LytCtgData* ctgdata=new LytCtgData(ctgpos);
217 >    char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
218 >    if (p==NULL) {
219 >       GMessage("AceParser: error parsing contig name!\n");
220 >       return false;
221 >       }
222 >        int ctglen, numseqs;
223 >        //p must be after contig name within linebuf!
224 >        if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
225 >          GMessage("Error parsing contig len and seq count at:\n%s\n",
226 >                 linebuf->chars());
227 >          }
228 >        ctgdata->len=ctglen;
229 >        ctgdata->numseqs = numseqs;
230 >        ctgdata->offs = 1;
231 >        int ctgidx=contigs.Add(ctgdata);
232 >        loadContig(ctgidx, seqfn, false);
233 >        } //while contigs
234 >  if (ctgpos==-2) return false; //parsing failed: line too long ?!?
235 >  contigs.setSorted(true);
236 >  return true;
237 >  }
238 >
239 >
240 > void add_intron(char*& buf,int accrd, int& rgpos,
241 >        const char* numstr, LytSeqInfo* seqinfo, char lspl, char rspl) {
242 >        int n=atoi(numstr);
243 >        if (n<=0) return;
244 >        if (seqinfo!=NULL) {
245 >          char sL = 0;
246 >      char sR = 0;
247 >      if (lspl>0) sL=(lspl=='['?'S':'s');
248 >      if (rspl>0) sR=(rspl==']'?'S':'s');
249 >          seqinfo->addInterSeg(seqinfo->offs+rgpos-1,seqinfo->offs+rgpos+n,
250 >                                                   0,0,sL,sR,accrd);
251 >          }
252 >        rgpos+=n;
253 > }
254 >
255 > char* AceParser::readSeq(LytSeqInfo* sqinfo) {
256 > //assumes the next line is where a sequence starts!
257 > //stops at the next empty line encountered
258 > char* buf;
259 > static char rlenbuf[12]={0,0,0,0,0,0,0,0,0,0,0,0}; //buffer for parsing the gap length
260 > int rlenbufacc=0; //how many digits accumulated in rlenbuf so far
261 > int buflen=512;
262 > GMALLOC(buf, buflen); //this MUST be freed by the caller
263 > buf[0]='\0';
264 > int accrd=0; //accumulated read length so far -- excludes interseg gaps!
265 > int rgpos=0; //accumulated offset including interseg gaps!
266 > char *r = linebuf->getLine(f,f_pos);
267 > int linelen=linebuf->length();
268 > char r_splice=0, l_splice=0;
269 > while (linelen>0) {
270 >   if (r==NULL) {
271 >          GMessage("AceParser: error reading sequence data\n");
272 >          return NULL;
273 >          }
274 >   //-- pass the line content for accumulation
275 >   int i=0;
276 >   while (r[i]) {
277 >         while (r[i] && isdigit(r[i])) {
278 >                 rlenbuf[rlenbufacc]=r[i];
279 >                 rlenbufacc++;
280 >                 i++;
281 >                 }
282 >         if (r[i]==0) break; //end of line reached already
283 >         //now r[i] is surely a non-digit char
284 >         if (rlenbufacc>0) { //have we just had a number before?
285 >                rlenbuf[rlenbufacc]=0;
286 >                if (r[i]=='=' || r[i]=='-') {
287 >                     i++;
288 >                     if (r[i]==0) break;
289 >                   } else { //check for splice site markers for this introns
290 >                     if (r[i]=='('||r[i]=='[') {
291 >                         l_splice=r[i];
292 >                         i++;
293 >                         if (r[i]==0) break;
294 >                         }
295 >                     if (r[i]==')'||r[i]==']') {
296 >                 r_splice=r[i];
297 >                 i++;
298 >                 if (r[i]==0) break;
299 >                         }
300 >                   }//splice check
301 >                add_intron(buf, accrd, rgpos, rlenbuf, sqinfo, l_splice, r_splice);
302 >                rlenbufacc=0;
303 >                r_splice=0;l_splice=0;
304 >                //i++;//skip the gap character
305 >                }
306 >         //check for digits here and break the linebuf as needed
307 >         int bi=i; //start of non-digit run
308 >         while (r[i] && !isdigit(r[i])) i++;
309 >         int nl=(i-bi); //length of non-digit run
310 >         if (nl>0) {
311 >                int si=accrd;
312 >                accrd+=nl;
313 >                rgpos+=nl;
314 >                if (accrd>=buflen-1) {
315 >                        buflen=accrd+512;
316 >                        GREALLOC(buf,buflen);
317 >                        }
318 >                //append these non-digit chars
319 >                for(int b=0;b<nl;b++) {
320 >                        buf[si+b]=r[bi+b];
321 >                        }
322 >                }//non-digit run
323 >         } //while line chars
324 >
325 >   /*
326 >   //-- append the line to buf
327 >   accrd+=linelen;
328 >   if (accrd>=buflen) {
329 >                buflen+=1024;
330 >                GREALLOC(buf,buflen);
331 >                }
332 >   strcat(buf, r);
333 >   */
334 >
335 >   r=linebuf->getLine(f,f_pos);
336 >   linelen=linebuf->length();
337 >   }//while linelen>0
338 >
339 > //add the 0-ending
340 > buf[accrd]=0;
341 > return buf;
342 > }
343 >
344 > char* AceParser::getSeq(LytSeqInfo* seq) {
345 > if (f==stdin || seek(seq->fpos)!=0) {
346 >   GMessage("AceParser: error seeking seq '%s'\n", seq->name);
347 >   return NULL;
348 >   }
349 > //skip the contig header:
350 > char* r=linebuf->getLine(f,f_pos);
351 > if (r==NULL || !startsWith(linebuf->chars(), "RD ", 3)) {
352 >        GMessage("AceParser: error seeking seq '%s'\n"
353 >                " (no RD entry found at location %d)\n", seq->name, seq->fpos);
354 >    return NULL;
355 >        }
356 > return readSeq(seq);
357 > }
358 >
359 > char* AceParser::getContigSeq(LytCtgData* ctg) {
360 > if (f==stdin || seek(ctg->fpos)!=0) {
361 >   GMessage("AceParser: error seeking contig '%s'\n", ctg->name);
362 >   return NULL;
363 >   }
364 > //skip the contig header:
365 > char* r=linebuf->getLine(f,f_pos);
366 > if (r==NULL || !startsWith(linebuf->chars(), "CO ", 3)) {
367 >    GMessage("AceParser: error seeking contig '%s'\n"
368 >        " (no CO entry found at location %d)\n", ctg->name, ctg->fpos);
369 >    return NULL;
370 >    }
371 > return readSeq();
372 > }
373 >
374 >

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines