ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/GFaSeqGet.h
Revision: 228
Committed: Wed Mar 28 05:13:28 2012 UTC (12 years, 5 months ago) by gpertea
File size: 3485 byte(s)
Log Message:
wip - prefiltering fix

Line File contents
1 #ifndef GFASEQGET_H
2 #define GFASEQGET_H
3 #include "GList.hh"
4
5 #define MAX_FASUBSEQ 0x20000000
6 //max 512MB sequence data held in memory at a time
7
8 class GSubSeq {
9 public:
10 uint sqstart; //1-based coord of subseq start on sequence
11 uint sqlen; //length of subseq loaded
12 char* sq; //actual subsequence data will be stored here
13 // (with end-of-line characters removed)
14
15 /*char* xseq; //the exposed pointer to the last requested subsequence start
16 off_t xstart; //the coordinate start for the last requested subseq
17 off_t xlen; //the last requested subseq len*/
18 GSubSeq() {
19 sqstart=0;
20 sqlen=0;
21 sq=NULL;
22 /* xseq=NULL;
23 xstart=0;
24 xlen=0;*/
25 }
26 ~GSubSeq() {
27 GFREE(sq);
28 }
29 // genomic, 1-based coordinates:
30 void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0);
31 //check for overlap with previous window and realloc/extend appropriately
32 //returns offset from seq that corresponds to sstart
33 // the window will keep extending until MAX_FASUBSEQ is reached
34 };
35
36 class GFaSeqGet {
37 char* fname;
38 FILE* fh;
39 //raw offset in the file where the sequence actually starts:
40 off_t fseqstart;
41 uint seq_len; //total sequence length, if known (when created from GFastaIndex)
42 int line_len; //length of each line of text
43 int line_blen; //binary length of each line
44 // = line_len + number of EOL character(s)
45 GSubSeq* lastsub;
46 void initialParse(off_t fofs=0, bool checkall=true);
47 const char* loadsubseq(uint cstart, int& clen);
48 void finit(const char* fn, off_t fofs, bool validate);
49 public:
50 GFaSeqGet() {
51 fh=NULL;
52 fseqstart=0;
53 seq_len=0;
54 line_len=0;
55 line_blen=0;
56 fname=NULL;
57 lastsub=NULL;
58 }
59 GFaSeqGet(const char* fn, off_t fofs, bool validate=false) {
60 seq_len=0;
61 finit(fn,fofs,validate);
62 }
63 GFaSeqGet(const char* fn, bool validate=false) {
64 seq_len=0;
65 finit(fn,0,validate);
66 }
67
68 GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen);
69 //constructor from GFastaIndex record
70
71 GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
72
73 ~GFaSeqGet() {
74 if (fname!=NULL) {
75 GFREE(fname);
76 fclose(fh);
77 }
78 delete lastsub;
79 }
80 const char* subseq(uint cstart, int& clen);
81 const char* getRange(uint cstart=1, uint cend=0) {
82 if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ;
83 if (cstart>cend) { Gswap(cstart, cend); }
84 int clen=cend-cstart+1;
85 //int rdlen=clen;
86 return subseq(cstart, clen);
87 }
88
89 char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
90 //caller is responsible for deallocating the return string
91
92 void loadall(uint32 max_len=0) {
93 //TODO: must read the whole sequence differently here - line by line
94 //so when EOF or another '>' line is found, the reading stops!
95 int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ);
96 subseq(1, clen);
97 }
98 void load(uint cstart, uint cend) {
99 //cache as much as possible
100 if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request
101 int clen=cend-cstart+1;
102 subseq(cstart, clen);
103 }
104 int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
105 off_t getseqofs() { return fseqstart; }
106 int getLineLen() { return line_len; }
107 int getLineBLen() { return line_blen; }
108 //reads a subsequence starting at genomic coordinate cstart (1-based)
109 };
110
111
112 #endif