ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/GFaSeqGet.h
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (12 years, 7 months ago) by gpertea
File size: 3486 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 #ifndef GFASEQGET_H
2 #define GFASEQGET_H
3
4 #include "GList.hh"
5
6 #define MAX_FASUBSEQ 0x20000000
7 //max 512MB sequence data held in memory at a time
8
9 class GSubSeq {
10 public:
11 uint sqstart; //1-based coord of subseq start on sequence
12 uint sqlen; //length of subseq loaded
13 char* sq; //actual subsequence data will be stored here
14 // (with end-of-line characters removed)
15
16 /*char* xseq; //the exposed pointer to the last requested subsequence start
17 off_t xstart; //the coordinate start for the last requested subseq
18 off_t xlen; //the last requested subseq len*/
19 GSubSeq() {
20 sqstart=0;
21 sqlen=0;
22 sq=NULL;
23 /* xseq=NULL;
24 xstart=0;
25 xlen=0;*/
26 }
27 ~GSubSeq() {
28 GFREE(sq);
29 }
30 // genomic, 1-based coordinates:
31 void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0);
32 //check for overlap with previous window and realloc/extend appropriately
33 //returns offset from seq that corresponds to sstart
34 // the window will keep extending until MAX_FASUBSEQ is reached
35 };
36
37 class GFaSeqGet {
38 char* fname;
39 FILE* fh;
40 //raw offset in the file where the sequence actually starts:
41 off_t fseqstart;
42 uint seq_len; //total sequence length, if known (when created from GFastaIndex)
43 int line_len; //length of each line of text
44 int line_blen; //binary length of each line
45 // = line_len + number of EOL character(s)
46 GSubSeq* lastsub;
47 void initialParse(off_t fofs=0, bool checkall=true);
48 const char* loadsubseq(uint cstart, int& clen);
49 void finit(const char* fn, off_t fofs, bool validate);
50 public:
51 GFaSeqGet() {
52 fh=NULL;
53 fseqstart=0;
54 seq_len=0;
55 line_len=0;
56 line_blen=0;
57 fname=NULL;
58 lastsub=NULL;
59 }
60 GFaSeqGet(const char* fn, off_t fofs, bool validate=false) {
61 seq_len=0;
62 finit(fn,fofs,validate);
63 }
64 GFaSeqGet(const char* fn, bool validate=false) {
65 seq_len=0;
66 finit(fn,0,validate);
67 }
68
69 GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen);
70 //constructor from GFastaIndex record
71
72 GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
73
74 ~GFaSeqGet() {
75 if (fname!=NULL) {
76 GFREE(fname);
77 fclose(fh);
78 }
79 delete lastsub;
80 }
81 const char* subseq(uint cstart, int& clen);
82 const char* getRange(uint cstart=1, uint cend=0) {
83 if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ;
84 if (cstart>cend) { Gswap(cstart, cend); }
85 int clen=cend-cstart+1;
86 //int rdlen=clen;
87 return subseq(cstart, clen);
88 }
89
90 char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
91 //caller is responsible for deallocating the return string
92
93 void loadall(uint32 max_len=0) {
94 //TODO: must read the whole sequence differently here - line by line
95 //so when EOF or another '>' line is found, the reading stops!
96 int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ);
97 subseq(1, clen);
98 }
99 void load(uint cstart, uint cend) {
100 //cache as much as possible
101 if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request
102 int clen=cend-cstart+1;
103 subseq(cstart, clen);
104 }
105 int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
106 off_t getseqofs() { return fseqstart; }
107 int getLineLen() { return line_len; }
108 int getLineBLen() { return line_blen; }
109 //reads a subsequence starting at genomic coordinate cstart (1-based)
110 };
111
112
113 #endif