ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.cpp
Revision: 171
Committed: Tue Feb 14 22:36:26 2012 UTC (12 years, 6 months ago) by gpertea
File size: 10226 byte(s)
Log Message:
wip fqtrim

Line File contents
1 #include "GFaSeqGet.h"
2 #include "gdna.h"
3 #include <ctype.h>
4
5 void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) {
6 if (sovl==0) {
7 GFREE(sq);
8 sqstart=sstart;
9 uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ;
10 sqlen = (slen==0 ? max_len : slen);
11 GMALLOC(sq, sqlen);
12 return;
13 }
14 //overlap -- copy the overlapping region
15 char* newsq=NULL;
16 GMALLOC(newsq, slen);
17 memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl);
18 GFREE(sq);
19 sq=newsq;
20 sqstart=sstart;
21 sqlen=slen;
22 }
23
24 void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
25 fh=fopen(fn,"rb");
26 if (fh==NULL) {
27 GError("Error (GFaSeqGet) opening file '%s'\n",fn);
28 }
29 fname=Gstrdup(fn);
30 initialParse(fofs, validate);
31 lastsub=new GSubSeq();
32 }
33
34 GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen) {
35 //for GFastaIndex use mostly -- the important difference is that
36 //the file offset is to the sequence, not to the defline
37 fh=fopen(faname,"rb");
38 if (fh==NULL) {
39 GError("Error (GFaSeqGet) opening file '%s'\n",faname);
40 }
41 fname=Gstrdup(faname);
42 line_len=l_len;
43 line_blen=l_blen;
44 seq_len=seqlen;
45 if (line_blen<line_len)
46 GError("Error (GFaSeqGet): invalid line length info (len=%d, blen=%d)\n",
47 line_len, line_blen);
48 fseqstart=fseqofs;
49 lastsub=new GSubSeq();
50 }
51
52 GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
53 fname=NULL;
54 fseqstart=0;
55 if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
56 seq_len=0;
57 fh=f;
58 initialParse(fofs, validate);
59 lastsub=new GSubSeq();
60 }
61
62 void GFaSeqGet::initialParse(off_t fofs, bool checkall) {
63 static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
64 if (fofs!=0) { fseeko(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
65 //read the first two lines to determine fasta parameters
66 fseqstart=fofs;
67 int c=getc(fh);
68 fseqstart++;
69 if (c!='>') GError("Error (GFaSeqGet): not a fasta header?\n");
70 while ((c=getc(fh))!=EOF) {
71 fseqstart++;
72 if (c=='\n' || c=='\r') { break; } //end of defline
73 }
74
75 if (c==EOF) GError(gfa_ERRPARSE);
76 line_len=0;
77 int lendlen=0;
78 while ((c=getc(fh))!=EOF) {
79 if (c=='\n' || c=='\r') { //end of line encountered
80 if (line_len>0) { //end of the first "sequence" line
81 lendlen++;
82 break;
83 }
84 else {// another EoL char at the end of defline
85 fseqstart++;
86 continue;
87 }
88 }// end-of-line characters
89 line_len++;
90 }
91 //we are at the end of first sequence line
92 while ((c=getc(fh))!=EOF) {
93 if (c=='\n' || c=='\r') lendlen++;
94 else {
95 ungetc(c,fh);
96 break;
97 }
98 }
99 line_blen=line_len+lendlen;
100 if (c==EOF) return;
101 // -- you don't need to check it all if you're sure it's safe
102 if (checkall) { //validate the rest of the FASTA record
103 int llen=0; //last line length
104 int elen=0; //length of last line ending
105 bool waseol=true;
106 while ((c=getc(fh))!=EOF) {
107 if (c=='>' && waseol) { ungetc(c,fh); break; }
108 if (c=='\n' || c=='\r') {
109 // eol char
110 elen++;
111 if (waseol) continue; //2nd eol char
112 waseol=true;
113 elen=1;
114 continue;
115 }
116 if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
117 //--- on a seq char here:
118 if (waseol) {//beginning of a seq line
119 if (elen && (llen!=line_len || elen!=lendlen))
120 //GError(gfa_ERRPARSE);
121 GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
122 the sequence lines have the same length (except for the last line)");
123 waseol=false;
124 llen=0;
125 elen=0;
126 }
127 llen++;
128 } //while reading chars
129 }// FASTA checking was requested
130 fseeko(fh,fseqstart,SEEK_SET);
131 }
132
133 const char* GFaSeqGet::subseq(uint cstart, int& clen) {
134 //cstart is 1-based genomic coordinate within current fasta sequence
135 int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ;
136 //GMessage("--> call: subseq(%u, %d)\n", cstart, clen);
137 if (clen>maxlen) {
138 GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
139 return NULL;
140 }
141 if (seq_len>0 && clen+cstart-1>seq_len) {
142 GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len);
143 }
144 if (lastsub->sq==NULL || lastsub->sqlen==0) {
145 lastsub->setup(cstart, clen, 0,0,0,seq_len);
146 loadsubseq(cstart, clen);
147 lastsub->sqlen=clen;
148 return (const char*)lastsub->sq;
149 }
150 //allow extension up to MAX_FASUBSEQ
151 uint bstart=lastsub->sqstart;
152 uint bend=lastsub->sqstart+lastsub->sqlen-1;
153 uint cend=cstart+clen-1;
154 int qlen=0; //only the extra len to be allocated/appended/prepended
155 uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file
156 int newlen=0; //the new total length of the buffered sequence lastsub->sq
157 int kovl=0;
158 int czfrom=0;//0-based offsets for copying a previously read sequence chunk
159 int czto=0;
160 uint newstart=cstart;
161 if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer
162 return (const char*) &(lastsub->sq[cstart-bstart]) ;
163 }
164 //extend downward
165 uint newend=GMAX(cend, bend);
166 if (cstart<bstart) { //requested start < old buffer start
167 newstart=cstart;
168 newlen=(newend-newstart+1);
169 if (newlen>MAX_FASUBSEQ) {
170 newlen=MAX_FASUBSEQ;
171 newend=cstart+newlen-1; //keep newstart, set newend
172 }
173 qlen=bstart-cstart;
174 if (newend>bstart) { //overlap
175 if (newend>bend) {// new region is larger & around the old one - so we have two regions to update
176 kovl=bend-bstart+1;
177 czfrom=0;
178 czto=bstart-cstart;
179 lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq
180 qlen=bstart-cstart;
181 loadsubseq(newstart, qlen);
182 qlen=newend-bend;
183 int toread=qlen;
184 loadsubseq(bend+1, qlen);
185 clen-=(toread-qlen);
186 lastsub->sqlen=clen;
187 return (const char*)lastsub->sq;
188 }
189 //newend<=bend
190 kovl=newend-bstart+1;
191 }
192 else { //no overlap with previous buffer
193 if (newend>bend) kovl=bend-bstart+1;
194 else kovl=newend-bstart+1;
195 }
196 qlen=bstart-cstart;
197 czfrom=0;
198 czto=qlen;
199 } //cstart<bstart
200 else { //cstart>=bstart, possibly extend upwards
201 newstart=bstart;
202 newlen=(newend-newstart+1);
203 if (newlen>MAX_FASUBSEQ) {
204 newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart
205 newlen=MAX_FASUBSEQ;
206 if (newstart<=bend) { //overlap with old buffer
207 kovl=bend-newstart+1;
208 czfrom=newstart-bstart;
209 czto=0;
210 }
211 else { //not overlapping old buffer
212 kovl=0;
213 }
214 } //newstart reassigned
215 else { //we can extend the buffer to include the old one
216 qlen=newend-bend; //how much to read from file
217 qstart=bend+1;
218 kovl=bend-bstart+1;
219 czfrom=0;
220 czto=0;
221 }
222 }
223 lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region
224 lastsub->sqlen-=qlen; //appending may result in a premature eof
225 int toread=qlen;
226 loadsubseq(qstart, qlen); //read the missing chunk, if any
227 clen-=(toread-qlen);
228 lastsub->sqlen+=qlen;
229 return (const char*)(lastsub->sq+(cstart-newstart));
230 }
231
232 char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
233 if (cstart>cend) { Gswap(cstart, cend); }
234 int clen=cend-cstart+1;
235 const char* gs=subseq(cstart, clen);
236 if (gs==NULL) return NULL;
237 char* r=NULL;
238 GMALLOC(r,clen+1);
239 r[clen]=0;
240 memcpy((void*)r,(void*)gs, clen);
241 if (revCmpl) reverseComplement(r,clen);
242 if (upCase) {
243 for (int i=0;i<clen;i++)
244 r[i]=toupper(r[i]);
245 }
246 return r;
247 }
248
249 const char* GFaSeqGet::loadsubseq(uint cstart, int& clen) {
250 //assumes enough lastsub->sq space allocated previously
251 //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
252 int sofs=cstart-lastsub->sqstart;
253 int lendlen=line_blen-line_len;
254 char* seqp=lastsub->sq+sofs;
255 //find the proper file offset and read the appropriate lines
256 uint seqofs=cstart-1;
257 uint startlno = seqofs/line_len;
258 int lineofs = seqofs % line_len;
259 off_t fstart=fseqstart + (startlno*line_blen);
260 fstart+=lineofs;
261
262 fseeko(fh, fstart, SEEK_SET);
263 int toread=clen;
264 int maxlen=(seq_len>0)? seq_len-cstart+1 : MAX_FASUBSEQ ;
265 if (toread==0) toread=maxlen; //read max allowed, or to the end of file
266 int actualrlen=0;
267 int sublen=0;
268 if (lineofs>0) { //read the partial first line
269 int reqrlen=line_len-lineofs;
270 if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
271 actualrlen=fread((void*)seqp, 1, reqrlen, fh);
272 if (actualrlen<reqrlen) { //eof reached prematurely
273 while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
274 //check for new sequences in between
275 clen=actualrlen;
276 sublen+=actualrlen;
277 return (const char*)seqp;
278 }
279 toread-=reqrlen;
280 sublen+=reqrlen;
281 fseeko(fh, lendlen, SEEK_CUR);
282 }
283 //read the rest of the lines
284 while (toread>=line_len) {
285 char* rseqp=&(seqp[sublen]);
286 actualrlen=fread((void*)rseqp, 1, line_len, fh);
287 /*
288 char dbuf[256];dbuf[255]=0;
289 strncpy(dbuf,rseqp, actualrlen);
290 dbuf[actualrlen]=0;
291 GMessage("<<<read line: %s\n",dbuf);
292 */
293 if (actualrlen<line_len) {
294 while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r') actualrlen--;
295 sublen+=actualrlen;
296 clen=sublen;
297 return (const char*)seqp;
298 }
299 toread-=actualrlen;
300 sublen+=actualrlen;
301 fseeko(fh, lendlen, SEEK_CUR);
302 }
303 // read the last partial line, if any
304 if (toread>0) {
305 char* rseqp=&(seqp[sublen]);
306 actualrlen=fread((void*)rseqp, 1, toread, fh);
307 if (actualrlen<toread) {
308 while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r')
309 actualrlen--;
310 }
311 sublen+=actualrlen;
312 }
313 //lastsub->sqlen+=sublen;
314 clen=sublen;
315
316 return (const char*)seqp;
317 }