ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/gff.h
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (13 years, 1 month ago) by gpertea
File size: 34861 byte(s)
Log Message:
adding tophat source work

Line User Rev File contents
1 gpertea 29 #ifndef GFF_H
2     #define GFF_H
3    
4     #include "GBase.h"
5     #include "gdna.h"
6     #include "codons.h"
7     #include "GFaSeqGet.h"
8     #include "GList.hh"
9     #include "GHash.hh"
10    
11     /*
12     const byte exMskMajSpliceL = 0x01;
13     const byte exMskMajSpliceR = 0x02;
14     const byte exMskMinSpliceL = 0x04;
15     const byte exMskMinSpliceR = 0x08;
16     const byte exMskTag = 0x80;
17     */
18    
19     //reserved Gffnames::feats entries -- basic feature types
20     extern const int gff_fid_mRNA; // "mRNA" feature name
21     extern const int gff_fid_transcript; // *RNA, *transcript feature name
22     extern const int gff_fid_exon;
23     extern const int gff_fid_CDS; //never really used, except for display only
24     //use gff_fid_exon instead
25     extern const uint GFF_MAX_LOCUS;
26     extern const uint GFF_MAX_EXON;
27     extern const uint GFF_MAX_INTRON;
28    
29     extern const uint gfo_flag_CHILDREN_PROMOTED;
30     extern const uint gfo_flag_HAS_ERRORS;
31     extern const uint gfo_flag_IS_GENE;
32     extern const uint gfo_flag_FROM_GFF3; //parsed from GFF3 formatted record
33     extern const uint gfo_flag_BY_EXON; //created by subfeature (exon) directly
34     //(GTF2 and some chado gff3 dumps with exons given before their mRNA)
35     extern const uint gfo_flag_IS_TRANSCRIPT; //recognized as '*RNA' or '*transcript'
36     extern const uint gfo_flag_DISCARDED; //should not be printed under the "transcriptsOnly" directive
37     extern const uint gfo_flag_LST_KEEP; //GffObj from GffReader::gflst is to be kept (not deallocated)
38     //when GffReader is destroyed
39     extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent
40     extern const byte gfo_flagShift_LEVEL;
41    
42     extern bool gff_show_warnings;
43    
44     #define GFF_LINELEN 2048
45     #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
46    
47    
48     enum GffExonType {
49     exgffNone=0, //not a recognizable exon or CDS segment
50     exgffStart, //from "start_codon" feature (within CDS)
51     exgffStop, //from "stop_codon" feature (may be outside CDS)
52     exgffCDS, //from "CDS" feature
53     exgffUTR, //from "UTR" feature
54     exgffCDSUTR, //from a merge of UTR and CDS feature
55     exgffExon, //from "exon" feature
56     };
57    
58     class GffReader;
59    
60     class GffLine {
61     char* _parents; //stores a copy of the Parent attribute value,
62     //with commas replaced by \0
63     int _parents_len;
64     public:
65     char* dupline; //duplicate of original line
66     char* line; //this will have tabs replaced by \0
67     int llen;
68     char* gseqname;
69     char* track;
70     char* ftype; //feature name: mRNA/gene/exon/CDS
71     char* info; //the last, attributes' field, unparsed
72     uint fstart;
73     uint fend;
74     uint qstart; //overlap coords on query, if available
75     uint qend;
76     uint qlen; //query len, if given
77     double score;
78     char strand;
79     bool skip;
80     bool is_gff3; //if the line appears to be in GFF3 format
81     bool is_cds; //"cds" and "stop_codon" features
82     bool is_exon; //"exon" and "utr" features
83     char exontype; // gffExonType
84     bool is_transcript; //if current feature is *RNA or *transcript
85     bool is_gene; //if current feature is *gene
86     char phase; // '.' , '0', '1' or '2'
87     // -- allocated strings:
88     char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3)
89     char* gene_id; //value of gene_id attribute (GTF) if present or ID attribute of a gene feature (GFF3)
90     //
91     char** parents; //for GTF only parents[0] is used
92     int num_parents;
93     char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id)
94     GffLine(GffReader* reader, const char* l); //parse the line accordingly
95     void discardParent() {
96     GFREE(_parents);
97     _parents_len=0;
98     num_parents=0;
99     parents=NULL;
100     }
101     char* extractAttr(const char* pre, bool caseStrict=true, bool enforce_GTF2=false);
102     GffLine(GffLine* l) { //a copy constructor
103     memcpy((void*)this, (void*)l, sizeof(GffLine));
104     line=NULL;
105     GMALLOC(line, llen+1);
106     memcpy(line, l->line, llen+1);
107     GMALLOC(dupline, llen+1);
108     memcpy(dupline, l->dupline, llen+1);
109     //--offsets within line[]
110     gseqname=line+(l->gseqname-l->line);
111     track=line+(l->track-l->line);
112     ftype=line+(l->ftype-l->line);
113     info=line+(l->info-l->line);
114     //Parent=Gstrdup(l->Parent);
115     if (l->_parents_len>0) {
116     _parents_len=l->_parents_len;
117     GMALLOC(_parents, _parents_len);
118     memcpy(_parents, l->_parents, _parents_len);
119     num_parents=l->num_parents;
120     for (int i=0;i<num_parents;i++) {
121     parents[i]=_parents+(l->parents[i] - l->_parents);
122     }
123     }
124     //-- allocated string copies:
125     ID=Gstrdup(l->ID);
126     if (l->gene_name!=NULL)
127     gene_name=Gstrdup(l->gene_name);
128     if (l->gene_id!=NULL)
129     gene_id=Gstrdup(l->gene_id);
130     }
131     GffLine() {
132     line=NULL;
133     dupline=NULL;
134     gseqname=NULL;
135     track=NULL;
136     ftype=NULL;
137     fstart=0;
138     fend=0;
139     info=NULL;
140     _parents=NULL;
141     _parents_len=0;
142     parents=NULL;
143     num_parents=0;
144     ID=NULL;
145     gene_name=NULL;
146     gene_id=NULL;
147     skip=true;
148     qstart=0;
149     qend=0;
150     qlen=0;
151     exontype=0;
152     is_cds=false;
153     is_gff3=false;
154     is_transcript=false;
155     is_gene=false;
156     is_exon=false;
157     }
158     ~GffLine() {
159     GFREE(dupline);
160     GFREE(line);
161     GFREE(_parents);
162     GFREE(parents);
163     GFREE(ID);
164     GFREE(gene_name);
165     GFREE(gene_id);
166     }
167     };
168    
169     class GffAttr {
170     public:
171     int attr_id;
172     char* attr_val;
173     GffAttr(int an_id, const char* av=NULL) {
174     attr_id=an_id;
175     attr_val=NULL;
176     setValue(av);
177     }
178     ~GffAttr() {
179     GFREE(attr_val);
180     }
181     void setValue(const char* av) {
182     if (attr_val!=NULL) {
183     GFREE(attr_val);
184     }
185     if (av==NULL || av[0]==0) return;
186     //trim spaces
187     const char* vstart=av;
188     while (*vstart==' ') av++;
189     const char* vend=vstart;
190     bool keep_dq=false;
191     while (vend[1]!=0) {
192     if (*vend==' ' && vend[1]!=' ') keep_dq=true;
193     else if (*vend==';') keep_dq=true;
194     vend++;
195     }
196     //remove spaces at the end:
197     while (*vend==' ' && vend!=vstart) vend--;
198     //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes
199     if (!keep_dq && *vstart=='"' && *vend=='"') {
200     vend--;
201     vstart++;
202     }
203     attr_val=Gstrdup(vstart, vend);
204     }
205     bool operator==(GffAttr& d){
206     return (this==&d);
207     }
208     bool operator>(GffAttr& d){
209     return (this>&d);
210     }
211     bool operator<(GffAttr& d){
212     return (this<&d);
213     }
214    
215     };
216    
217     class GffNameList;
218     class GffNames;
219    
220     class GffNameInfo {
221     friend class GffNameList;
222     protected:
223     int idx;
224     public:
225     char* name;
226     GffNameInfo() { name=NULL; idx=-1; }
227     GffNameInfo(const char* n) {
228     name=Gstrdup(n);
229     }
230    
231     ~GffNameInfo() {
232     GFREE(name);
233     }
234    
235     bool operator==(GffNameInfo& d){
236     return (strcmp(this->name, d.name)==0);
237     }
238     bool operator>(GffNameInfo& d){
239     return (strcmp(this->name, d.name)>0);
240     }
241     bool operator<(GffNameInfo& d){
242     return (strcmp(this->name, d.name)<0);
243     }
244     };
245    
246     class GffNameList:public GList<GffNameInfo> {
247     friend class GffNameInfo;
248     friend class GffNames;
249     protected:
250     GHash<GffNameInfo> byName;//hash with shared keys
251     int idlast; //fList index of last added/reused name
252     void addStatic(const char* tname) {// fast add
253     GffNameInfo* f=new GffNameInfo(tname);
254     idlast=this->Add(f);
255     f->idx=idlast;
256     byName.shkAdd(f->name,f);
257     }
258     public:
259     GffNameList():GList<GffNameInfo>(false,true,true), byName(false) {
260     idlast=-1;
261     }
262     char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; }
263     int lastNameId() { return idlast; }
264     char* getName(int nid) { //retrieve name by its ID
265     if (nid<0 || nid>=fCount)
266     GError("GffNameList Error: invalid index (%d)\n",nid);
267     return fList[nid]->name;
268     }
269    
270     int addName(const char* tname) {//returns or create an id for the given name
271     //check idlast first, chances are it's the same feature name checked
272     if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0)
273     return idlast;
274     GffNameInfo* f=byName.Find(tname);
275     int fidx=-1;
276     if (f!=NULL) fidx=f->idx;
277     else {//add new entry
278     f=new GffNameInfo(tname);
279     fidx=this->Add(f);
280     f->idx=fidx;
281     byName.shkAdd(f->name,f);
282     }
283     idlast=fidx;
284     return fidx;
285     }
286    
287     int addNewName(const char* tname) {
288     GffNameInfo* f=new GffNameInfo(tname);
289     int fidx=this->Add(f);
290     f->idx=fidx;
291     byName.shkAdd(f->name,f);
292     return fidx;
293     }
294    
295     int getId(const char* tname) { //only returns a name id# if found
296     GffNameInfo* f=byName.Find(tname);
297     if (f==NULL) return -1;
298     return f->idx;
299     }
300     int removeName() {
301     GError("Error: removing names from GffNameList not allowed!\n");
302     return -1;
303     }
304     };
305    
306     class GffNames {
307     public:
308     int numrefs;
309     GffNameList tracks;
310     GffNameList gseqs;
311     GffNameList attrs;
312     GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc.
313     GffNames():tracks(),gseqs(),attrs(), feats() {
314     numrefs=0;
315     //the order below is critical!
316     //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
317     feats.addStatic("mRNA");//index 0=gff_fid_mRNA
318     feats.addStatic("transcript");//index 1=gff_fid_transcript
319     feats.addStatic("exon");//index 1=gff_fid_exon
320     feats.addStatic("CDS"); //index 2=gff_fid_CDS
321     }
322     };
323    
324     void gffnames_ref(GffNames* &n);
325     void gffnames_unref(GffNames* &n);
326    
327     enum GffPrintMode {
328     pgtfAny, //print record as read
329     pgtfExon,
330     pgtfCDS,
331     pgffAny, //print record as read
332     pgffExon,
333     pgffCDS,
334     pgffBoth,
335     };
336    
337    
338     class GffAttrs:public GList<GffAttr> {
339     public:
340     GffAttrs():GList<GffAttr>(false,true,false) { }
341     void add_or_update(GffNames* names, const char* attrname, const char* val) {
342     int aid=names->attrs.getId(attrname);
343     if (aid>=0) {
344     //attribute found in the dictionary
345     for (int i=0;i<Count();i++) {
346     //do we have it?
347     if (aid==Get(i)->attr_id) {
348     //update the value
349     Get(i)->setValue(val);
350     return;
351     }
352     }
353     }
354     else {
355     aid=names->attrs.addNewName(attrname);
356     }
357     this->Add(new GffAttr(aid, val));
358     }
359    
360     char* getAttr(GffNames* names, const char* attrname) {
361     int aid=names->attrs.getId(attrname);
362     if (aid>=0)
363     for (int i=0;i<Count();i++)
364     if (aid==Get(i)->attr_id) return Get(i)->attr_val;
365     return NULL;
366     }
367     char* getAttr(int aid) {
368     if (aid>=0)
369     for (int i=0;i<Count();i++)
370     if (aid==Get(i)->attr_id) return Get(i)->attr_val;
371     return NULL;
372     }
373     };
374    
375    
376     class GffExon : public GSeg {
377     public:
378     void* uptr; //for later extensions
379     GffAttrs* attrs; //other attributes kept for this exon
380     double score; // gff score column
381     char phase; //GFF phase column - for CDS segments only
382     // '.' = undefined (UTR), '0','1','2' for CDS exons
383     char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon"
384     int qstart; // for mRNA/protein exon mappings: coordinates on query
385     int qend;
386     GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) {
387     uptr=NULL;
388     attrs=NULL;
389     if (s<e) {
390     start=s;
391     end=e;
392     }
393     else {
394     start=e;
395     end=s;
396     }
397     if (qs<qe) {
398     qstart=qs;
399     qend=qe;
400     } else {
401     qstart=qe;
402     qend=qs;
403     }
404     score=sc;
405     phase=fr;
406     exontype=et;
407     } //constructor
408    
409     char* getAttr(GffNames* names, const char* atrname) {
410     if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
411     return attrs->getAttr(names, atrname);
412     }
413    
414     char* getAttr(int aid) {
415     if (attrs==NULL) return NULL;
416     return attrs->getAttr(aid);
417     }
418    
419     ~GffExon() { //destructor
420     if (attrs!=NULL) delete attrs;
421     }
422     };
423    
424    
425     class GffCDSeg:public GSeg {
426     public:
427     char phase;
428     int exonidx;
429     };
430     //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments
431     class GffObj:public GSeg {
432     //utility segment-merging function for addExon()
433     void expandExon(int xovl, uint segstart, uint segend,
434     char exontype, double sc, char fr, int qs, int qe);
435     protected:
436     //coordinate transformation data:
437     uint xstart; //absolute genomic coordinates of reference region
438     uint xend;
439     char xstatus; //coordinate transform status:
440     //0 : (start,end) coordinates are absolute
441     //'+' : (start,end) coords are relative to xstart..xend region
442     //'-' : (start,end) are relative to the reverse complement of xstart..xend region
443     //--
444     char* gffID; // ID name for mRNA (parent) feature
445     char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3)
446     char* geneID; //value of gene_id attribute (GTF) if present or ID attribute of a parent gene feature (GFF3)
447     unsigned int flags;
448     //-- friends:
449     friend class GffReader;
450     friend class GffExon;
451     public:
452     static GffNames* names; // dictionary storage that holds the various attribute names etc.
453     int track_id; // index of track name in names->tracks
454     int gseq_id; // index of genomic sequence name in names->gseqs
455     int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
456     int exon_ftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
457     //if ftype_id==gff_fid_mRNA then this value is ignored
458     GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id
459     GPVec<GffObj> children;
460     GffObj* parent;
461     int udata; //user data, flags etc.
462     void* uptr; //user pointer (to a parent object, cluster, locus etc.)
463     GffObj* ulink; //link to another GffObj (user controlled field)
464     // mRNA specific fields:
465     bool isCDS; //just a CDS, no UTRs
466     bool partial; //partial CDS
467     uint CDstart; //CDS start coord
468     uint CDend; //CDS end coord
469     char CDphase; //initial phase for CDS start
470     bool hasErrors() { return ((flags & gfo_flag_HAS_ERRORS)!=0); }
471     void hasErrors(bool v) {
472     if (v) flags |= gfo_flag_HAS_ERRORS;
473     else flags &= ~gfo_flag_HAS_ERRORS;
474     }
475     bool fromGff3() { return ((flags & gfo_flag_FROM_GFF3)!=0); }
476     void fromGff3(bool v) {
477     if (v) flags |= gfo_flag_FROM_GFF3;
478     else flags &= ~gfo_flag_FROM_GFF3;
479     }
480     bool createdByExon() { return ((flags & gfo_flag_BY_EXON)!=0); }
481     void createdByExon(bool v) {
482     if (v) flags |= gfo_flag_BY_EXON;
483     else flags &= ~gfo_flag_BY_EXON;
484     }
485     bool isGene() { return ((flags & gfo_flag_IS_GENE)!=0); }
486     void isGene(bool v) {
487     if (v) flags |= gfo_flag_IS_GENE;
488     else flags &= ~gfo_flag_IS_GENE;
489     }
490     bool isDiscarded() { return ((flags & gfo_flag_DISCARDED)!=0); }
491     void isDiscarded(bool v) {
492     if (v) flags |= gfo_flag_DISCARDED;
493     else flags &= ~gfo_flag_DISCARDED;
494     }
495    
496     bool isUsed() { return ((flags & gfo_flag_LST_KEEP)!=0); }
497     void isUsed(bool v) {
498     if (v) flags |= gfo_flag_LST_KEEP;
499     else flags &= ~gfo_flag_LST_KEEP;
500     }
501     bool isTranscript() { return ((flags & gfo_flag_IS_TRANSCRIPT)!=0); }
502     void isTranscript(bool v) {
503     if (v) flags |= gfo_flag_IS_TRANSCRIPT;
504     else flags &= ~gfo_flag_IS_TRANSCRIPT;
505     }
506     bool promotedChildren() { return ((flags & gfo_flag_CHILDREN_PROMOTED)!=0); }
507     void promotedChildren(bool v) {
508     if (v) flags |= gfo_flag_CHILDREN_PROMOTED;
509     else flags &= ~gfo_flag_CHILDREN_PROMOTED;
510     }
511     void setLevel(byte v) {
512     if (v==0) flags &= ~gfo_flag_LEVEL_MSK;
513     else flags &= ~(((uint)v) << gfo_flagShift_LEVEL);
514     }
515     byte incLevel() {
516     uint v=((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL);
517     v++;
518     flags &= ~(v << gfo_flagShift_LEVEL);
519     return v;
520     }
521     byte getLevel() {
522     return ((byte)((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL));
523     }
524    
525     bool isValidTranscript() {
526     //return (ftype_id==gff_fid_mRNA && exons.Count()>0);
527     return (isTranscript() && exons.Count()>0);
528     }
529    
530    
531     int addExon(uint segstart, uint segend, double sc=0, char fr='.',
532     int qs=0, int qe=0, bool iscds=false, char exontype=0);
533    
534     int addExon(GffReader* reader, GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
535    
536     void removeExon(int idx);
537     void removeExon(GffExon* p);
538     char strand; //true if features are on the reverse complement strand
539     double gscore;
540     double uscore; //custom, user-computed score, if needed
541     int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
542    
543     //--------- optional data:
544     int qlen; //query length, start, end - if available
545     int qstart;
546     int qend;
547     int qcov; //query coverage - percent
548     GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
549     //constructor by gff line parsing:
550     GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true);
551     //if gfline->Parent!=NULL then this will also add the first sub-feature
552     // otherwise, only the main feature is created
553     void clearAttrs() {
554     if (attrs!=NULL) {
555     bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs);
556     delete attrs; attrs=NULL;
557     if (sharedattrs) exons[0]->attrs=NULL;
558     }
559     }
560     GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), children(1,false) {
561     //exons: sorted, free, non-unique
562     gffID=NULL;
563     uptr=NULL;
564     ulink=NULL;
565     flags=0;
566     udata=0;
567     parent=NULL;
568     ftype_id=-1;
569     exon_ftype_id=-1;
570     if (anid!=NULL) gffID=Gstrdup(anid);
571     gffnames_ref(names);
572     qlen=0;
573     qstart=0;
574     qend=0;
575     qcov=0;
576     partial=true;
577     isCDS=false;
578     CDstart=0; // hasCDS <=> CDstart>0
579     CDend=0;
580     CDphase=0;
581     gseq_id=-1;
582     track_id=-1;
583     xstart=0;
584     xend=0;
585     xstatus=0;
586     strand='.';
587     gscore=0;
588     uscore=0;
589     attrs=NULL;
590     covlen=0;
591     gene_name=NULL;
592     geneID=NULL;
593     }
594     ~GffObj() {
595     GFREE(gffID);
596     GFREE(gene_name);
597     GFREE(geneID);
598     clearAttrs();
599     gffnames_unref(names);
600     }
601     //--------------
602     GffObj* finalize(GffReader* gfr, bool mergeCloseExons=false,
603     bool keepAttrs=false, bool noExonAttr=true);
604     //complete parsing: must be called in order to merge adjacent/close proximity subfeatures
605     void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false);
606     const char* getSubfName() { //returns the generic feature type of the entries in exons array
607     int sid=exon_ftype_id;
608     if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
609     return names->feats.getName(sid);
610     }
611     bool monoFeature() {
612     return (exons.Count()==0 ||
613     (exons.Count()==1 && exon_ftype_id==ftype_id));
614     }
615    
616     bool hasCDS() { return (CDstart>0); }
617    
618     const char* getFeatureName() {
619     return names->feats.getName(ftype_id);
620     }
621     void setFeatureName(const char* feature);
622    
623     void addAttr(const char* attrname, const char* attrvalue);
624     int removeAttr(const char* attrname, const char* attrval=NULL);
625     int removeAttr(int aid, const char* attrval=NULL);
626     int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL);
627     int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL);
628     const char* getAttrName(int i) {
629     if (attrs==NULL) return NULL;
630     return names->attrs.getName(attrs->Get(i)->attr_id);
631     }
632     char* getAttr(const char* attrname, bool checkFirstExon=false) {
633     if (names==NULL || attrname==NULL) return NULL;
634     char* r=NULL;
635     if (attrs==NULL) {
636     if (!checkFirstExon) return NULL;
637     }
638     else r=attrs->getAttr(names, attrname);
639     if (r!=NULL) return r;
640     if (checkFirstExon && exons.Count()>0) {
641     r=exons[0]->getAttr(names, attrname);
642     }
643     return r;
644     }
645    
646     char* getExonAttr(GffExon* exon, const char* attrname) {
647     if (exon==NULL || attrname==NULL) return NULL;
648     return exon->getAttr(names, attrname);
649     }
650    
651     char* getExonAttr(int exonidx, const char* attrname) {
652     if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL;
653     return exons[exonidx]->getAttr(names, attrname);
654     }
655    
656     char* getAttrValue(int i) {
657     if (attrs==NULL) return NULL;
658     return attrs->Get(i)->attr_val;
659     }
660     const char* getGSeqName() {
661     return names->gseqs.getName(gseq_id);
662     }
663    
664     const char* getRefName() {
665     return names->gseqs.getName(gseq_id);
666     }
667     void setRefName(const char* newname);
668    
669     const char* getTrackName() {
670     return names->tracks.getName(track_id);
671     }
672     bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
673     //ignores strand!
674     if (s>e) swap(s,e);
675     for (int i=0;i<exons.Count();i++) {
676     if (exons[i]->overlap(s,e)) return true;
677     }
678     return false;
679     }
680     bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment
681     //if (gseq_id!=m.gseq_id) return false;
682     // ignores strand and gseq_id, must check in advance
683     for (int i=0;i<exons.Count();i++) {
684     for (int j=0;j<m.exons.Count();j++) {
685     if (exons[i]->start>m.exons[j]->end) continue;
686     if (m.exons[j]->start>exons[i]->end) break;
687     //-- overlap if we are here:
688     return true;
689     }
690     }
691     return false;
692     }
693    
694     int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) {
695     //return the exons' index for the overlapping OR ADJACENT exon
696     //ovlen, if given, will return the overlap length
697     if (s>e) swap(s,e);
698     s--;e++; //to also catch adjacent exons
699     for (int i=0;i<exons.Count();i++) {
700     if (exons[i]->start>e) break;
701     if (s>exons[i]->end) continue;
702     //-- overlap if we are here:
703     if (ovlen!=NULL) {
704     s++;e--;
705     int ovlend= (exons[i]->end>e) ? e : exons[i]->end;
706     *ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1;
707     }
708     return i;
709     } //for each exon
710     *ovlen=0;
711     return -1;
712     }
713    
714     int exonOverlapLen(GffObj& m) {
715     if (start>m.end || m.start>end) return 0;
716     int i=0;
717     int j=0;
718     int ovlen=0;
719     while (i<exons.Count() && j<m.exons.Count()) {
720     uint istart=exons[i]->start;
721     uint iend=exons[i]->end;
722     uint jstart=m.exons[j]->start;
723     uint jend=m.exons[j]->end;
724     if (istart>jend) { j++; continue; }
725     if (jstart>iend) { i++; continue; }
726     //exon overlap
727     uint ovstart=GMAX(istart,jstart);
728     if (iend<jend) {
729     ovlen+=iend-ovstart+1;
730     i++;
731     }
732     else {
733     ovlen+=jend-ovstart+1;
734     j++;
735     }
736     }//while comparing exons
737     return ovlen;
738     }
739    
740     bool exonOverlap(GffObj* m) {
741     return exonOverlap(*m);
742     }
743     //---------- coordinate transformation
744     void xcoord(uint grstart, uint grend, char xstrand='+') {
745     //relative coordinate transform, and reverse-complement transform if xstrand is '-'
746     //does nothing if xstatus is the same already
747     if (xstatus) {
748     if (xstatus==xstrand && grstart==xstart && grend==xend) return;
749     unxcoord();//restore original coordinates
750     }
751     xstatus=xstrand;
752     xstart=grstart;
753     xend=grend;
754     if (CDstart>0) xcoordseg(CDstart, CDend);
755     for (int i=0;i<exons.Count();i++) {
756     xcoordseg(exons[i]->start, exons[i]->end);
757     }
758     if (xstatus=='-') {
759     exons.Reverse();
760     int flen=end-start;
761     start=xend-end+1;
762     end=start+flen;
763     }
764     else {
765     start=start-xstart+1;
766     end=end-xstart+1;
767     }
768     }
769    
770     //transform an arbitrary segment based on current xstatus/xstart-xend
771     void xcoordseg(uint& segstart, uint &segend) {
772     if (xstatus==0) return;
773     if (xstatus=='-') {
774     int flen=segend-segstart;
775     segstart=xend-segend+1;
776     segend=segstart+flen;
777     return;
778     }
779     else {
780     segstart=segstart-xstart+1;
781     segend=segend-xstart+1;
782     }
783     }
784    
785     void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true
786     if (xstatus==0) return; //nothing to do, no transformation appplied
787     if (CDstart>0) unxcoordseg(CDstart, CDend);
788     //restore all GffExon intervals too
789     for (int i=0;i<exons.Count();i++) {
790     unxcoordseg(exons[i]->start, exons[i]->end);
791     }
792     if (xstatus=='-') {
793     exons.Reverse();
794     int flen=end-start;
795     start=xend-end+1;
796     end=start+flen;
797     }
798     else {
799     start=start+xstart-1;
800     end=end+xstart-1;
801     }
802     xstatus=0;
803     }
804     void unxcoordseg(uint& astart, uint &aend) {
805     //restore an arbitrary interval -- does NOT change the transform state!
806     if (xstatus==0) return;
807     if (xstatus=='-') {
808     int flen=aend-astart;
809     astart=xend-aend+1;
810     aend=astart+flen;
811     }
812     else {
813     astart=astart+xstart-1;
814     aend=aend+xstart-1;
815     }
816     }
817     //---------------------
818     bool operator==(GffObj& d){
819     return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0);
820     }
821     bool operator>(GffObj& d){
822     if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
823     if (start==d.start) {
824     if (getLevel()==d.getLevel()) {
825     if (end==d.end) return (strcmp(gffID, d.gffID)>0);
826     else return (end>d.end);
827     } else return (getLevel()>d.getLevel());
828     } else return (start>d.start);
829     }
830     bool operator<(GffObj& d){
831     if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
832     if (start==d.start) {
833     if (getLevel()==d.getLevel()) {
834     if (end==d.end) return strcmp(gffID, d.gffID)<0;
835     else return end<d.end;
836     } else return (getLevel()<d.getLevel());
837     } else return (start<d.start);
838     }
839     char* getID() { return gffID; }
840     char* getGeneID() { return geneID; }
841     char* getGeneName() { return gene_name; }
842     void setGeneName(const char* gname) {
843     GFREE(gene_name);
844     if (gname) gene_name=Gstrdup(gname);
845     }
846     void setGeneID(const char* gene_id) {
847     GFREE(geneID);
848     if (gene_id) geneID=Gstrdup(gene_id);
849     }
850     int addSeg(GffLine* gfline);
851     int addSeg(int fnid, GffLine* gfline);
852     void getCDSegs(GArray<GffCDSeg>& cds);
853    
854     void updateExonPhase(); //for CDS-only features, updates GExon::phase
855    
856     void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
857     bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3);
858     void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
859     const char* tlabel=NULL, const char* gfparent=NULL);
860     void printGtf(FILE* fout, const char* tlabel=NULL) {
861     printGxf(fout, pgtfAny, tlabel);
862     }
863     void printGff(FILE* fout, const char* tlabel=NULL,
864     const char* gfparent=NULL) {
865     printGxf(fout, pgffAny, tlabel, gfparent);
866     }
867     void printTranscriptGff(FILE* fout, char* tlabel=NULL,
868     bool showCDS=false, const char* gfparent=NULL) {
869     if (isValidTranscript())
870     printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent);
871     }
872     void printSummary(FILE* fout=NULL);
873     void getCDS_ends(uint& cds_start, uint& cds_end);
874     void mRNA_CDS_coords(uint& cds_start, uint& cds_end);
875     char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
876     uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL);
877     char* getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst);
878     char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL);
879     //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
880     bool empty() { return (start==0); }
881     };
882    
883     typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
884     //user callback after parsing a mapping object:
885     // Returns: "done with it" status:
886     // TRUE if gobj is no longer needed so it's FREEd upon return
887     // FALSE if the user needs the gobj pointer and is responsible for
888     // collecting and freeing all GffObj objects
889    
890    
891     //GSeqStat: collect basic stats about a common underlying genomic sequence
892     // for multiple GffObj
893     class GSeqStat {
894     public:
895     int gseqid; //gseq id in the global static pool of gseqs
896     char* gseqname; //just a pointer to the name of gseq
897     //int fcount;//number of features on this gseq
898     uint mincoord;
899     uint maxcoord;
900     uint maxfeat_len; //maximum feature length on this genomic sequence
901     GffObj* maxfeat;
902     GSeqStat(int id=-1, char* name=NULL) {
903     gseqid=id;
904     gseqname=name;
905     mincoord=MAXUINT;
906     maxcoord=0;
907     maxfeat_len=0;
908     maxfeat=NULL;
909     }
910     bool operator>(GSeqStat& g) {
911     return (gseqid>g.gseqid);
912     }
913     bool operator<(GSeqStat& g) {
914     return (gseqid<g.gseqid);
915     }
916     bool operator==(GSeqStat& g) {
917     return (gseqid==g.gseqid);
918     }
919     };
920    
921    
922     int gfo_cmpByLoc(const pointer p1, const pointer p2);
923    
924     class GfList: public GList<GffObj> {
925     //just adding the option to sort by genomic sequence and coordinate
926     bool mustSort;
927     public:
928     GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) {
929     //GffObjs in this list are NOT deleted when the list is cleared
930     //-- for deallocation of these objects, call freeAll() or freeUnused() as needed
931     mustSort=sortbyloc;
932     }
933     void sortedByLoc(bool v=true) {
934     bool prev=mustSort;
935     mustSort=v;
936     if (fCount>0 && mustSort && !prev) {
937     this->setSorted((GCompareProc*)gfo_cmpByLoc);
938     }
939     }
940     void finalize(GffReader* gfr, bool mergeCloseExons,
941     bool keepAttrs=false, bool noExonAttr=true) { //if set, enforce sort by locus
942     if (mustSort) { //force (re-)sorting
943     this->setSorted(false);
944     this->setSorted((GCompareProc*)gfo_cmpByLoc);
945     }
946     int delcount=0;
947     for (int i=0;i<Count();i++) {
948     //finish the parsing for each GffObj
949     fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr);
950     }
951     if (delcount>0) this->Pack();
952     }
953     void freeAll() {
954     for (int i=0;i<fCount;i++) {
955     delete fList[i];
956     fList[i]=NULL;
957     }
958     Clear();
959     }
960     void freeUnused() {
961     for (int i=0;i<fCount;i++) {
962     if (fList[i]->isUsed()) continue;
963     //inform the children
964     for (int c=0;c<fList[i]->children.Count();c++) {
965     fList[i]->children[c]->parent=NULL;
966     }
967     delete fList[i];
968     fList[i]=NULL;
969     }
970     Clear();
971     }
972    
973     };
974    
975     class GfoHolder {
976     public:
977     int idx; //position in GffReader::gflst array
978     GffObj* gffobj;
979     GfoHolder(GffObj* gfo=NULL, int i=0) {
980     idx=i;
981     gffobj=gfo;
982     }
983     };
984    
985     class CNonExon { //utility class used in subfeature promotion
986     public:
987     int idx;
988     GffObj* parent;
989     GffExon* exon;
990     GffLine* gffline;
991     CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
992     parent=p;
993     exon=e;
994     idx=i;
995     gffline=new GffLine(gl);
996     }
997     ~CNonExon() {
998     delete gffline;
999     }
1000     };
1001    
1002    
1003     class GffReader {
1004     friend class GffObj;
1005     friend class GffLine;
1006     char* linebuf;
1007     off_t fpos;
1008     int buflen;
1009     protected:
1010     bool gff_warns; //warn about duplicate IDs, etc. even when they are on different chromosomes
1011     FILE* fh;
1012     char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader
1013     GffNames* names; //just a pointer to the global static Gff names repository in GffObj
1014     GffLine* gffline;
1015     bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features
1016     GHash<int> discarded_ids; //for transcriptsOnly mode, keep track
1017     // of discarded parent IDs
1018     GHash<GfoHolder> phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj]
1019     GHash<int> tids; //transcript_id uniqueness
1020     char* gfoBuildId(const char* id, const char* ctg);
1021     void gfoRemove(const char* id, const char* ctg);
1022     GfoHolder* gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx);
1023     GfoHolder* gfoFind(const char* id, const char* ctg);
1024     CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name);
1025     void subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo);
1026     GfoHolder* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1027     bool keepAttr, bool noExonAttr);
1028     public:
1029     GfList gflst; //accumulate GffObjs being read
1030     GfoHolder* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
1031     GffObj* parent=NULL, GffExon* pexon=NULL);
1032     GfoHolder* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
1033     GfoHolder* updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1034     bool keepAttr);
1035     GfoHolder* updateParent(GfoHolder* newgfh, GffObj* parent);
1036     bool addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr);
1037     GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
1038     GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true),
1039     phash(true), tids(true), gflst(sortbyloc), gseqstats(true,true,true) {
1040     gff_warns=gff_show_warnings;
1041     names=NULL;
1042     gffline=NULL;
1043     transcriptsOnly=t_only;
1044     fpos=0;
1045     fname=NULL;
1046     fh=f;
1047     GMALLOC(linebuf, GFF_LINELEN);
1048     buflen=GFF_LINELEN-1;
1049     }
1050     void init(FILE *f, bool t_only=false, bool sortbyloc=false) {
1051     fname=NULL;
1052     fh=f;
1053     if (fh!=NULL) rewind(fh);
1054     fpos=0;
1055     transcriptsOnly=t_only;
1056     gflst.sortedByLoc(sortbyloc);
1057     }
1058     GffReader(char* fn, bool t_only=false, bool sort=false):discarded_ids(true), phash(true),
1059     tids(true),gflst(sort),gseqstats(true,true,true) {
1060     gff_warns=gff_show_warnings;
1061     names=NULL;
1062     fname=Gstrdup(fn);
1063     transcriptsOnly=t_only;
1064     fh=fopen(fname, "rb");
1065     fpos=0;
1066     gffline=NULL;
1067     GMALLOC(linebuf, GFF_LINELEN);
1068     buflen=GFF_LINELEN-1;
1069     }
1070    
1071     ~GffReader() {
1072     delete gffline;
1073     gffline=NULL;
1074     fpos=0;
1075     gflst.freeUnused();
1076     gflst.Clear();
1077     discarded_ids.Clear();
1078     phash.Clear();
1079     gseqstats.Clear();
1080     GFREE(fname);
1081     GFREE(linebuf);
1082     }
1083    
1084     void showWarnings(bool v=true) {
1085     gff_warns=v;
1086     gff_show_warnings=v;
1087     }
1088    
1089     GffLine* nextGffLine();
1090    
1091     // load all subfeatures, re-group them:
1092     void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true);
1093    
1094     }; // end of GffReader
1095    
1096     #endif