ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.h
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (14 years, 5 months ago) by gpertea
File size: 3084 byte(s)
Log Message:
added my gclib source files

Line File contents
1 #ifndef GFASEQGET_H
2 #define GFASEQGET_H
3
4 #include "GBase.h"
5 #include "GList.hh"
6
7
8 #define MAX_FASUBSEQ 0x20000000
9 //max 512MB sequence data held in memory at a time
10
11 class GSubSeq {
12 public:
13 uint sqstart; //1-based coord of subseq start on sequence
14 uint sqlen; //length of subseq loaded
15 char* sq; //actual subsequence data will be stored here
16 // (with end-of-line characters removed)
17
18 /*char* xseq; //the exposed pointer to the last requested subsequence start
19 off_t xstart; //the coordinate start for the last requested subseq
20 off_t xlen; //the last requested subseq len*/
21 GSubSeq() {
22 sqstart=0;
23 sqlen=0;
24 sq=NULL;
25 /* xseq=NULL;
26 xstart=0;
27 xlen=0;*/
28 }
29 ~GSubSeq() {
30 GFREE(sq);
31 }
32 // genomic, 1-based coordinates:
33 void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0);
34 //check for overlap with previous window and realloc/extend appropriately
35 //returns offset from seq that corresponds to sstart
36 // the window will keep extending until MAX_FASUBSEQ is reached
37 };
38
39 class GFaSeqGet {
40 char* fname;
41 FILE* fh;
42 //raw offset in the file where the sequence actually starts:
43 off_t fseqstart;
44 int linelen; //length of each sequence line (assumed fixed)
45 char lendlen; //length of end-of-line characters between lines
46 //(assumed fixed)
47 char lendch; //end-of-line signal character (can only be '\n' or '\r')
48 GSubSeq* lastsub;
49 void initialParse(off_t fofs=0, bool checkall=true);
50 const char* loadsubseq(uint cstart, int& clen);
51 void finit(const char* fn, off_t fofs, bool validate);
52 public:
53 GFaSeqGet() {
54 fseqstart=0;
55 linelen=0;
56 lendch='\0';
57 fname=NULL;
58 lastsub=NULL;
59 }
60 GFaSeqGet(const char* fn, off_t fofs, bool validate=false) {
61 finit(fn,fofs,validate);
62 }
63 GFaSeqGet(const char* fn, bool validate=false) {
64 finit(fn,0,validate);
65 }
66 /*
67 GFaSeqGet(bool readAll, const char* fn, off_t fofs=0);
68 GFaSeqGet(bool readAll, FILE* f, off_t fofs=0);
69 */
70 GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
71 ~GFaSeqGet() {
72 if (fname!=NULL) {
73 GFREE(fname);
74 fclose(fh);
75 }
76 delete lastsub;
77 }
78 const char* subseq(uint cstart, int& clen);
79 const char* getRange(uint cstart, uint cend) {
80 if (cstart>cend) { swap(cstart, cend); }
81 int clen=cend-cstart+1;
82 //int rdlen=clen;
83 return subseq(cstart, clen);
84 }
85 //caller is responsible for deallocating copyRange() return string
86 char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
87
88 void loadall() {
89 int clen=MAX_FASUBSEQ;
90 subseq(1, clen);
91 }
92 void load(uint cstart, uint cend) {
93 //cache as much as possible
94 int clen=cend-cstart+1;
95 subseq(cstart, clen);
96 }
97 int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
98 off_t getseqofs() { return fseqstart; }
99 int getlinelen() { return linelen; }
100 int getlendlen() { return lendlen; }
101 //reads a subsequence starting at genomic coordinate cstart (1-based)
102 };
103
104
105 #endif