ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/GFaSeqGet.cpp
Revision: 228
Committed: Wed Mar 28 05:13:28 2012 UTC (12 years, 5 months ago) by gpertea
File size: 10226 byte(s)
Log Message:
wip - prefiltering fix

Line User Rev File contents
1 gpertea 29 #include "GFaSeqGet.h"
2     #include "gdna.h"
3     #include <ctype.h>
4    
5     void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) {
6     if (sovl==0) {
7     GFREE(sq);
8     sqstart=sstart;
9     uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ;
10     sqlen = (slen==0 ? max_len : slen);
11     GMALLOC(sq, sqlen);
12     return;
13     }
14     //overlap -- copy the overlapping region
15     char* newsq=NULL;
16     GMALLOC(newsq, slen);
17     memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl);
18     GFREE(sq);
19     sq=newsq;
20     sqstart=sstart;
21     sqlen=slen;
22     }
23    
24     void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
25     fh=fopen(fn,"rb");
26     if (fh==NULL) {
27     GError("Error (GFaSeqGet) opening file '%s'\n",fn);
28     }
29     fname=Gstrdup(fn);
30     initialParse(fofs, validate);
31     lastsub=new GSubSeq();
32     }
33    
34     GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen) {
35     //for GFastaIndex use mostly -- the important difference is that
36     //the file offset is to the sequence, not to the defline
37     fh=fopen(faname,"rb");
38     if (fh==NULL) {
39     GError("Error (GFaSeqGet) opening file '%s'\n",faname);
40     }
41     fname=Gstrdup(faname);
42     line_len=l_len;
43     line_blen=l_blen;
44     seq_len=seqlen;
45     if (line_blen<line_len)
46     GError("Error (GFaSeqGet): invalid line length info (len=%d, blen=%d)\n",
47     line_len, line_blen);
48     fseqstart=fseqofs;
49     lastsub=new GSubSeq();
50     }
51    
52     GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
53 gpertea 228 fname=NULL;
54     fseqstart=0;
55     if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
56     seq_len=0;
57     fh=f;
58     initialParse(fofs, validate);
59     lastsub=new GSubSeq();
60 gpertea 29 }
61    
62     void GFaSeqGet::initialParse(off_t fofs, bool checkall) {
63     static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
64     if (fofs!=0) { fseeko(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
65     //read the first two lines to determine fasta parameters
66     fseqstart=fofs;
67     int c=getc(fh);
68     fseqstart++;
69     if (c!='>') GError("Error (GFaSeqGet): not a fasta header?\n");
70     while ((c=getc(fh))!=EOF) {
71     fseqstart++;
72     if (c=='\n' || c=='\r') { break; } //end of defline
73     }
74    
75     if (c==EOF) GError(gfa_ERRPARSE);
76     line_len=0;
77     int lendlen=0;
78     while ((c=getc(fh))!=EOF) {
79     if (c=='\n' || c=='\r') { //end of line encountered
80     if (line_len>0) { //end of the first "sequence" line
81     lendlen++;
82     break;
83     }
84     else {// another EoL char at the end of defline
85     fseqstart++;
86     continue;
87     }
88     }// end-of-line characters
89     line_len++;
90     }
91     //we are at the end of first sequence line
92     while ((c=getc(fh))!=EOF) {
93     if (c=='\n' || c=='\r') lendlen++;
94     else {
95     ungetc(c,fh);
96     break;
97     }
98     }
99     line_blen=line_len+lendlen;
100     if (c==EOF) return;
101     // -- you don't need to check it all if you're sure it's safe
102     if (checkall) { //validate the rest of the FASTA record
103     int llen=0; //last line length
104     int elen=0; //length of last line ending
105     bool waseol=true;
106     while ((c=getc(fh))!=EOF) {
107     if (c=='>' && waseol) { ungetc(c,fh); break; }
108     if (c=='\n' || c=='\r') {
109     // eol char
110     elen++;
111     if (waseol) continue; //2nd eol char
112     waseol=true;
113     elen=1;
114     continue;
115     }
116     if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
117     //--- on a seq char here:
118     if (waseol) {//beginning of a seq line
119     if (elen && (llen!=line_len || elen!=lendlen))
120     //GError(gfa_ERRPARSE);
121     GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
122     the sequence lines have the same length (except for the last line)");
123     waseol=false;
124     llen=0;
125     elen=0;
126     }
127     llen++;
128     } //while reading chars
129     }// FASTA checking was requested
130     fseeko(fh,fseqstart,SEEK_SET);
131     }
132    
133     const char* GFaSeqGet::subseq(uint cstart, int& clen) {
134     //cstart is 1-based genomic coordinate within current fasta sequence
135     int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ;
136     //GMessage("--> call: subseq(%u, %d)\n", cstart, clen);
137     if (clen>maxlen) {
138     GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
139     return NULL;
140     }
141     if (seq_len>0 && clen+cstart-1>seq_len) {
142     GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len);
143     }
144     if (lastsub->sq==NULL || lastsub->sqlen==0) {
145     lastsub->setup(cstart, clen, 0,0,0,seq_len);
146     loadsubseq(cstart, clen);
147     lastsub->sqlen=clen;
148     return (const char*)lastsub->sq;
149     }
150     //allow extension up to MAX_FASUBSEQ
151     uint bstart=lastsub->sqstart;
152     uint bend=lastsub->sqstart+lastsub->sqlen-1;
153     uint cend=cstart+clen-1;
154     int qlen=0; //only the extra len to be allocated/appended/prepended
155     uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file
156     int newlen=0; //the new total length of the buffered sequence lastsub->sq
157     int kovl=0;
158     int czfrom=0;//0-based offsets for copying a previously read sequence chunk
159     int czto=0;
160     uint newstart=cstart;
161     if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer
162     return (const char*) &(lastsub->sq[cstart-bstart]) ;
163     }
164     //extend downward
165     uint newend=GMAX(cend, bend);
166     if (cstart<bstart) { //requested start < old buffer start
167     newstart=cstart;
168     newlen=(newend-newstart+1);
169     if (newlen>MAX_FASUBSEQ) {
170     newlen=MAX_FASUBSEQ;
171     newend=cstart+newlen-1; //keep newstart, set newend
172     }
173     qlen=bstart-cstart;
174     if (newend>bstart) { //overlap
175     if (newend>bend) {// new region is larger & around the old one - so we have two regions to update
176     kovl=bend-bstart+1;
177     czfrom=0;
178     czto=bstart-cstart;
179     lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq
180     qlen=bstart-cstart;
181     loadsubseq(newstart, qlen);
182     qlen=newend-bend;
183     int toread=qlen;
184     loadsubseq(bend+1, qlen);
185     clen-=(toread-qlen);
186     lastsub->sqlen=clen;
187     return (const char*)lastsub->sq;
188     }
189     //newend<=bend
190     kovl=newend-bstart+1;
191     }
192     else { //no overlap with previous buffer
193     if (newend>bend) kovl=bend-bstart+1;
194     else kovl=newend-bstart+1;
195     }
196     qlen=bstart-cstart;
197     czfrom=0;
198     czto=qlen;
199     } //cstart<bstart
200     else { //cstart>=bstart, possibly extend upwards
201     newstart=bstart;
202     newlen=(newend-newstart+1);
203     if (newlen>MAX_FASUBSEQ) {
204     newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart
205     newlen=MAX_FASUBSEQ;
206     if (newstart<=bend) { //overlap with old buffer
207     kovl=bend-newstart+1;
208     czfrom=newstart-bstart;
209     czto=0;
210     }
211     else { //not overlapping old buffer
212     kovl=0;
213     }
214     } //newstart reassigned
215     else { //we can extend the buffer to include the old one
216     qlen=newend-bend; //how much to read from file
217     qstart=bend+1;
218     kovl=bend-bstart+1;
219     czfrom=0;
220     czto=0;
221     }
222     }
223     lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region
224     lastsub->sqlen-=qlen; //appending may result in a premature eof
225     int toread=qlen;
226     loadsubseq(qstart, qlen); //read the missing chunk, if any
227     clen-=(toread-qlen);
228     lastsub->sqlen+=qlen;
229     return (const char*)(lastsub->sq+(cstart-newstart));
230     }
231    
232     char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
233 gpertea 154 if (cstart>cend) { Gswap(cstart, cend); }
234 gpertea 29 int clen=cend-cstart+1;
235     const char* gs=subseq(cstart, clen);
236     if (gs==NULL) return NULL;
237     char* r=NULL;
238     GMALLOC(r,clen+1);
239     r[clen]=0;
240     memcpy((void*)r,(void*)gs, clen);
241     if (revCmpl) reverseComplement(r,clen);
242     if (upCase) {
243     for (int i=0;i<clen;i++)
244     r[i]=toupper(r[i]);
245     }
246     return r;
247     }
248    
249     const char* GFaSeqGet::loadsubseq(uint cstart, int& clen) {
250     //assumes enough lastsub->sq space allocated previously
251     //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
252     int sofs=cstart-lastsub->sqstart;
253     int lendlen=line_blen-line_len;
254     char* seqp=lastsub->sq+sofs;
255     //find the proper file offset and read the appropriate lines
256     uint seqofs=cstart-1;
257     uint startlno = seqofs/line_len;
258     int lineofs = seqofs % line_len;
259     off_t fstart=fseqstart + (startlno*line_blen);
260     fstart+=lineofs;
261 gpertea 228
262 gpertea 29 fseeko(fh, fstart, SEEK_SET);
263     int toread=clen;
264     int maxlen=(seq_len>0)? seq_len-cstart+1 : MAX_FASUBSEQ ;
265     if (toread==0) toread=maxlen; //read max allowed, or to the end of file
266     int actualrlen=0;
267     int sublen=0;
268     if (lineofs>0) { //read the partial first line
269     int reqrlen=line_len-lineofs;
270     if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
271     actualrlen=fread((void*)seqp, 1, reqrlen, fh);
272     if (actualrlen<reqrlen) { //eof reached prematurely
273     while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
274     //check for new sequences in between
275     clen=actualrlen;
276     sublen+=actualrlen;
277     return (const char*)seqp;
278     }
279     toread-=reqrlen;
280     sublen+=reqrlen;
281     fseeko(fh, lendlen, SEEK_CUR);
282     }
283     //read the rest of the lines
284     while (toread>=line_len) {
285     char* rseqp=&(seqp[sublen]);
286     actualrlen=fread((void*)rseqp, 1, line_len, fh);
287     /*
288     char dbuf[256];dbuf[255]=0;
289     strncpy(dbuf,rseqp, actualrlen);
290     dbuf[actualrlen]=0;
291     GMessage("<<<read line: %s\n",dbuf);
292     */
293     if (actualrlen<line_len) {
294     while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r') actualrlen--;
295     sublen+=actualrlen;
296     clen=sublen;
297     return (const char*)seqp;
298     }
299     toread-=actualrlen;
300     sublen+=actualrlen;
301     fseeko(fh, lendlen, SEEK_CUR);
302     }
303     // read the last partial line, if any
304     if (toread>0) {
305     char* rseqp=&(seqp[sublen]);
306     actualrlen=fread((void*)rseqp, 1, toread, fh);
307     if (actualrlen<toread) {
308     while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r')
309     actualrlen--;
310     }
311     sublen+=actualrlen;
312     }
313     //lastsub->sqlen+=sublen;
314     clen=sublen;
315 gpertea 228
316 gpertea 29 return (const char*)seqp;
317     }