ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/LayoutParser.h
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (13 years, 1 month ago) by gpertea
File size: 6233 byte(s)
Log Message:
sync with local source

Line User Rev File contents
1 gpertea 16 #ifndef LayoutParser_H
2     #define LayoutParser_H
3    
4     #include "GBase.h"
5     #include "GList.hh"
6     #include "GHash.hh"
7     #include <stdio.h>
8     //hash data associated with a contig/sequence name
9     //a contig name key is always stored as its name plus .<length>
10    
11     class LytCtgData;
12    
// Describes the junction between two consecutive segments of a segmented
// sequence: where the current segment ends, where the next one starts, and
// the clipping / splice annotations recorded on both sides of the gap.
struct LytSeqInterSeg {
  int segEnd, segRClip;         // end coordinate of this segment and its right-side clip
  int nextStart, nextLClip;     // start coordinate of the next segment and its left-side clip
  char segRSplice, nextLSplice; // splice markers on either side of the gap (0 when not given)
  int nextSegSeq;               // caller-supplied sequence position for the next segment
                                // NOTE(review): exact meaning is defined by the caller -- confirm

  // Only the two coordinates are mandatory; every annotation defaults to 0.
  LytSeqInterSeg(int end, int nextstart, int rclip=0, int nextlclip=0,
                 char segrsplice=0, char nextlsplice=0, int seqpos=0)
      : segEnd(end), segRClip(rclip),
        nextStart(nextstart), nextLClip(nextlclip),
        segRSplice(segrsplice), nextLSplice(nextlsplice),
        nextSegSeq(seqpos) {
  }

  // Number of positions strictly between segEnd and nextStart.
  // Declared const so it can be queried through const references/pointers.
  int length() const {
    return (nextStart-segEnd-1);
  }
};
30    
31     class LytSeqInfo { //info for a sequence within the file
32     int xlen; //total sequence length (with all the added * within contig)
33     int interseglen;
34     public:
35     char *name;
36     LytCtgData* contig; //contig data containing this sequence, as above
37     bool segmented;
38     int numisegs; // number of intersegs[]
39     LytSeqInterSeg* intersegs;
40     off_t fpos; //file position for the sequence data
41     unsigned char reversed;
42     int offs; //offset in contig (of the very left end)
43     int left,right; //clear range (relative to sequence itself, max 1..xlen)
44     LytSeqInfo(char* seqid, LytCtgData* ctg, int pos=0, unsigned char minus=0,
45     int slen=0, int clpL=0, int clpR=0) {
46     contig=ctg;
47     offs=pos;
48     reversed=minus;
49     fpos=0;
50     interseglen=0;
51     xlen=slen;
52     left=clpL+1; //1 if no clpL given
53     right=xlen-clpR; //0 if no len given
54     segmented=false;
55     numisegs=0;
56     name=Gstrdup(seqid);
57     intersegs=NULL;
58     }
59     ~LytSeqInfo() {
60     GFREE(name);
61     GFREE(intersegs);
62     }
63     bool hasIntrons() { return (numisegs>0); }
64     void addInterSeg(int end, int nextstart, int rclip=0, int nextlclip=0,
65     char splice=0, char nextsplice=0, int seqofs=0) {
66     GREALLOC(intersegs,(numisegs+1)*sizeof(LytSeqInterSeg));
67     interseglen+=nextstart-end-1;
68     intersegs[numisegs].segEnd=end; intersegs[numisegs].segRClip=rclip;
69     intersegs[numisegs].nextStart=nextstart; intersegs[numisegs].nextLClip=nextlclip;
70     intersegs[numisegs].segRSplice=splice;
71     intersegs[numisegs].nextLSplice=nextsplice;
72     intersegs[numisegs].nextSegSeq=seqofs;
73     numisegs++;
74     }
75    
76     void setLength(int len) {
77     //should only be called BEFORE setting the real clipping coordinates
78     //(left,right)
79     xlen=len;
80     left=1;
81     right=xlen;
82     }
83     int length() { return xlen; } //xtended span, including introns
84     int seglen() { return xlen-interseglen; } //segments only, no introns
85     bool operator==(const LytSeqInfo& s) {
86     return (offs+left-1==s.offs+s.left-1);
87     }
88     bool operator>(const LytSeqInfo& s) {
89     return (offs+left-1>s.offs+s.left-1);
90     }
91     bool operator<(const LytSeqInfo& s) {
92     return (offs+left-1<s.offs+s.left-1);
93     }
94     char* expandGaps(char* s);
95     };
96    
97    
98     class LytCtgData {
99     public:
100     char* name; //contig name, as stored in file
101     unsigned int len; //contig length (lsequence, from ACE file)
102     int lpos, rpos;
103     int numseqs;
104     int offs; //some other type of user data that might be of use
105     off_t fpos; //position in file for this contig's entry
106     GList<LytSeqInfo> seqs;
107     LytCtgData(off_t pos=0):seqs(false,false,false) {
108     name=NULL;
109     offs=0;
110     len=0;
111     numseqs=0;
112     fpos=pos;
113     }
114     ~LytCtgData() {
115     GFREE(name);
116     seqs.Clear();
117     }
118    
119     char* readName(char* s, GHash<int>& names);
120    
121     bool operator==(const LytCtgData& s) {
122     return (strcmp(name,s.name)==0);
123     }
124     bool operator>(const LytCtgData& s) {
125     return (strcmp(name,s.name)>0);
126     }
127     bool operator<(const LytCtgData& s) {
128     return (strcmp(name,s.name)<0);
129     }
130     };
131     //callback -- called after a read or contig sequence is loaded
132     typedef bool fnLytSeq(int ctgno, LytCtgData* d, LytSeqInfo* s, char* seq);
133    
134     class LayoutParser {
135     protected:
136     FILE* f; //file stream
137     off_t f_pos;
138     char* fname;
139     LytCtgData* currentContig; // currently loaded contig -- for browsing/loading
140     int numContigs; //total number of contigs found in this file
141     //int numSeqs; //total number of (distinct) sequences found in this file
142     GHash<LytSeqInfo> seqinfo; //sequence locations in the file
143     GHash<int> ctgIDs; //contig IDs, to make them unique!
144    
145     GList<LytCtgData> contigs; //list of contig names with their size,
146     //number of sequences and filepos
147     protected:
148     GLineReader* linebuf; //the line buffer
149     off_t fskipTo(const char* linestart, const char* butnot=NULL);
150     bool startsWith(const char* s, const char* start, int tlen);
151     virtual LytSeqInfo* addSeq(char* s, LytCtgData* ctg);
152     int seek(off_t offset) {
153     int r=fseeko(f, offset, SEEK_SET);
154     if (r==0) f_pos=offset;
155     return r;
156     }
157    
158     public:
159     LayoutParser(const char* filename):contigs(false,true) {
160     f=NULL;
161     f_pos=0;
162     numContigs=0;
163     currentContig=NULL;
164     if (filename==NULL) {
165     f=stdin;
166     fname=Gstrdup("stdin");
167     }
168     else
169     fname=Gstrdup(filename);
170     linebuf=new GLineReader();
171     }
172     virtual ~LayoutParser() {
173     ctgIDs.Clear();
174     GFREE(fname);
175     delete linebuf;
176     close();
177     numContigs=0;
178     seqinfo.Clear();
179     contigs.Clear();
180     }
181     virtual bool open();
182     void close();
183     virtual bool parse(fnLytSeq* seqfn=NULL); //load all the file offsets
184     virtual bool parseContigs(); //load all the file offsets for contigs
185     virtual bool loadContig(int ctgidx, fnLytSeq* seqfn=NULL, bool re_pos=true); //for loading by browsing
186     //if parsefn is not NULL, it is executed, passing the sequence data(first time, with the contig sequence)
187     //if parserfn returns true, the data is freed after it is processed
188     virtual char getFileType() { return 'L'; }
189     //sequence loading - only by request
190     LytCtgData* getContig(int idx) { return contigs[idx]; }
191     virtual char* getSeq(LytSeqInfo* sqinfo) { return NULL; }
192     virtual char* getContigSeq(LytCtgData* ctgdata) { return NULL; }
193     int getNumContigs() { return numContigs; }
194     //int getNumSeqs() { return numSeqs; }
195     off_t getFilePos() { return f_pos; }
196     //sorting the list of contigs:
197     void contigsByName();
198     void contigsByLen();
199     void contigsByNumSeqs();
200     };
201    
202     #endif