ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.cpp
Revision: 144
Committed: Thu Dec 22 19:15:46 2011 UTC (12 years, 8 months ago) by gpertea
File size: 10196 byte(s)
Log Message:
added GBitVec.h; removed operator> requirement for sorted vectors; swap() is now template Gswap()

Line File contents
1 #include "GFaSeqGet.h"
2 #include "gdna.h"
3 #include <ctype.h>
4
5 void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) {
6 if (sovl==0) {
7 GFREE(sq);
8 sqstart=sstart;
9 uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ;
10 sqlen = (slen==0 ? max_len : slen);
11 GMALLOC(sq, sqlen);
12 return;
13 }
14 //overlap -- copy the overlapping region
15 char* newsq=NULL;
16 GMALLOC(newsq, slen);
17 memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl);
18 GFREE(sq);
19 sq=newsq;
20 sqstart=sstart;
21 sqlen=slen;
22 }
23
24 void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
25 fh=fopen(fn,"rb");
26 if (fh==NULL) {
27 GError("Error (GFaSeqGet) opening file '%s'\n",fn);
28 }
29 fname=Gstrdup(fn);
30 initialParse(fofs, validate);
31 lastsub=new GSubSeq();
32 }
33
34 GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen) {
35 //for GFastaIndex use mostly -- the important difference is that
36 //the file offset is to the sequence, not to the defline
37 fh=fopen(faname,"rb");
38 if (fh==NULL) {
39 GError("Error (GFaSeqGet) opening file '%s'\n",faname);
40 }
41 fname=Gstrdup(faname);
42 line_len=l_len;
43 line_blen=l_blen;
44 seq_len=seqlen;
45 if (line_blen<line_len)
46 GError("Error (GFaSeqGet): invalid line length info (len=%d, blen=%d)\n",
47 line_len, line_blen);
48 fseqstart=fseqofs;
49 lastsub=new GSubSeq();
50 }
51
52 GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
53 if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
54 seq_len=0;
55 fh=f;
56 initialParse(fofs, validate);
57 lastsub=new GSubSeq();
58 }
59
60 void GFaSeqGet::initialParse(off_t fofs, bool checkall) {
61 static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
62 if (fofs!=0) { fseeko(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
63 //read the first two lines to determine fasta parameters
64 fseqstart=fofs;
65 int c=getc(fh);
66 fseqstart++;
67 if (c!='>') GError("Error (GFaSeqGet): not a fasta header?\n");
68 while ((c=getc(fh))!=EOF) {
69 fseqstart++;
70 if (c=='\n' || c=='\r') { break; } //end of defline
71 }
72
73 if (c==EOF) GError(gfa_ERRPARSE);
74 line_len=0;
75 int lendlen=0;
76 while ((c=getc(fh))!=EOF) {
77 if (c=='\n' || c=='\r') { //end of line encountered
78 if (line_len>0) { //end of the first "sequence" line
79 lendlen++;
80 break;
81 }
82 else {// another EoL char at the end of defline
83 fseqstart++;
84 continue;
85 }
86 }// end-of-line characters
87 line_len++;
88 }
89 //we are at the end of first sequence line
90 while ((c=getc(fh))!=EOF) {
91 if (c=='\n' || c=='\r') lendlen++;
92 else {
93 ungetc(c,fh);
94 break;
95 }
96 }
97 line_blen=line_len+lendlen;
98 if (c==EOF) return;
99 // -- you don't need to check it all if you're sure it's safe
100 if (checkall) { //validate the rest of the FASTA record
101 int llen=0; //last line length
102 int elen=0; //length of last line ending
103 bool waseol=true;
104 while ((c=getc(fh))!=EOF) {
105 if (c=='>' && waseol) { ungetc(c,fh); break; }
106 if (c=='\n' || c=='\r') {
107 // eol char
108 elen++;
109 if (waseol) continue; //2nd eol char
110 waseol=true;
111 elen=1;
112 continue;
113 }
114 if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
115 //--- on a seq char here:
116 if (waseol) {//beginning of a seq line
117 if (elen && (llen!=line_len || elen!=lendlen))
118 //GError(gfa_ERRPARSE);
119 GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
120 the sequence lines have the same length (except for the last line)");
121 waseol=false;
122 llen=0;
123 elen=0;
124 }
125 llen++;
126 } //while reading chars
127 }// FASTA checking was requested
128 fseeko(fh,fseqstart,SEEK_SET);
129 }
130
131 const char* GFaSeqGet::subseq(uint cstart, int& clen) {
132 //cstart is 1-based genomic coordinate within current fasta sequence
133 int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ;
134 //GMessage("--> call: subseq(%u, %d)\n", cstart, clen);
135 if (clen>maxlen) {
136 GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
137 return NULL;
138 }
139 if (seq_len>0 && clen+cstart-1>seq_len) {
140 GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len);
141 }
142 if (lastsub->sq==NULL || lastsub->sqlen==0) {
143 lastsub->setup(cstart, clen, 0,0,0,seq_len);
144 loadsubseq(cstart, clen);
145 lastsub->sqlen=clen;
146 return (const char*)lastsub->sq;
147 }
148 //allow extension up to MAX_FASUBSEQ
149 uint bstart=lastsub->sqstart;
150 uint bend=lastsub->sqstart+lastsub->sqlen-1;
151 uint cend=cstart+clen-1;
152 int qlen=0; //only the extra len to be allocated/appended/prepended
153 uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file
154 int newlen=0; //the new total length of the buffered sequence lastsub->sq
155 int kovl=0;
156 int czfrom=0;//0-based offsets for copying a previously read sequence chunk
157 int czto=0;
158 uint newstart=cstart;
159 if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer
160 return (const char*) &(lastsub->sq[cstart-bstart]) ;
161 }
162 //extend downward
163 uint newend=GMAX(cend, bend);
164 if (cstart<bstart) { //requested start < old buffer start
165 newstart=cstart;
166 newlen=(newend-newstart+1);
167 if (newlen>MAX_FASUBSEQ) {
168 newlen=MAX_FASUBSEQ;
169 newend=cstart+newlen-1; //keep newstart, set newend
170 }
171 qlen=bstart-cstart;
172 if (newend>bstart) { //overlap
173 if (newend>bend) {// new region is larger & around the old one - so we have two regions to update
174 kovl=bend-bstart+1;
175 czfrom=0;
176 czto=bstart-cstart;
177 lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq
178 qlen=bstart-cstart;
179 loadsubseq(newstart, qlen);
180 qlen=newend-bend;
181 int toread=qlen;
182 loadsubseq(bend+1, qlen);
183 clen-=(toread-qlen);
184 lastsub->sqlen=clen;
185 return (const char*)lastsub->sq;
186 }
187 //newend<=bend
188 kovl=newend-bstart+1;
189 }
190 else { //no overlap with previous buffer
191 if (newend>bend) kovl=bend-bstart+1;
192 else kovl=newend-bstart+1;
193 }
194 qlen=bstart-cstart;
195 czfrom=0;
196 czto=qlen;
197 } //cstart<bstart
198 else { //cstart>=bstart, possibly extend upwards
199 newstart=bstart;
200 newlen=(newend-newstart+1);
201 if (newlen>MAX_FASUBSEQ) {
202 newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart
203 newlen=MAX_FASUBSEQ;
204 if (newstart<=bend) { //overlap with old buffer
205 kovl=bend-newstart+1;
206 czfrom=newstart-bstart;
207 czto=0;
208 }
209 else { //not overlapping old buffer
210 kovl=0;
211 }
212 } //newstart reassigned
213 else { //we can extend the buffer to include the old one
214 qlen=newend-bend; //how much to read from file
215 qstart=bend+1;
216 kovl=bend-bstart+1;
217 czfrom=0;
218 czto=0;
219 }
220 }
221 lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region
222 lastsub->sqlen-=qlen; //appending may result in a premature eof
223 int toread=qlen;
224 loadsubseq(qstart, qlen); //read the missing chunk, if any
225 clen-=(toread-qlen);
226 lastsub->sqlen+=qlen;
227 return (const char*)(lastsub->sq+(cstart-newstart));
228 }
229
230 char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
231 if (cstart>cend) { Gswap(cstart, cend); }
232 int clen=cend-cstart+1;
233 const char* gs=subseq(cstart, clen);
234 if (gs==NULL) return NULL;
235 char* r=NULL;
236 GMALLOC(r,clen+1);
237 r[clen]=0;
238 memcpy((void*)r,(void*)gs, clen);
239 if (revCmpl) reverseComplement(r,clen);
240 if (upCase) {
241 for (int i=0;i<clen;i++)
242 r[i]=toupper(r[i]);
243 }
244 return r;
245 }
246
247 const char* GFaSeqGet::loadsubseq(uint cstart, int& clen) {
248 //assumes enough lastsub->sq space allocated previously
249 //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
250 int sofs=cstart-lastsub->sqstart;
251 int lendlen=line_blen-line_len;
252 char* seqp=lastsub->sq+sofs;
253 //find the proper file offset and read the appropriate lines
254 uint seqofs=cstart-1;
255 uint startlno = seqofs/line_len;
256 int lineofs = seqofs % line_len;
257 off_t fstart=fseqstart + (startlno*line_blen);
258 fstart+=lineofs;
259
260 fseeko(fh, fstart, SEEK_SET);
261 int toread=clen;
262 int maxlen=(seq_len>0)? seq_len-cstart+1 : MAX_FASUBSEQ ;
263 if (toread==0) toread=maxlen; //read max allowed, or to the end of file
264 int actualrlen=0;
265 int sublen=0;
266 if (lineofs>0) { //read the partial first line
267 int reqrlen=line_len-lineofs;
268 if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
269 actualrlen=fread((void*)seqp, 1, reqrlen, fh);
270 if (actualrlen<reqrlen) { //eof reached prematurely
271 while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
272 //check for new sequences in between
273 clen=actualrlen;
274 sublen+=actualrlen;
275 return (const char*)seqp;
276 }
277 toread-=reqrlen;
278 sublen+=reqrlen;
279 fseeko(fh, lendlen, SEEK_CUR);
280 }
281 //read the rest of the lines
282 while (toread>=line_len) {
283 char* rseqp=&(seqp[sublen]);
284 actualrlen=fread((void*)rseqp, 1, line_len, fh);
285 /*
286 char dbuf[256];dbuf[255]=0;
287 strncpy(dbuf,rseqp, actualrlen);
288 dbuf[actualrlen]=0;
289 GMessage("<<<read line: %s\n",dbuf);
290 */
291 if (actualrlen<line_len) {
292 while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r') actualrlen--;
293 sublen+=actualrlen;
294 clen=sublen;
295 return (const char*)seqp;
296 }
297 toread-=actualrlen;
298 sublen+=actualrlen;
299 fseeko(fh, lendlen, SEEK_CUR);
300 }
301 // read the last partial line, if any
302 if (toread>0) {
303 char* rseqp=&(seqp[sublen]);
304 actualrlen=fread((void*)rseqp, 1, toread, fh);
305 if (actualrlen<toread) {
306 while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r')
307 actualrlen--;
308 }
309 sublen+=actualrlen;
310 }
311 //lastsub->sqlen+=sublen;
312 clen=sublen;
313
314 return (const char*)seqp;
315 }