ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gff_utils.cpp
Revision: 59
Committed: Fri Sep 9 18:00:23 2011 UTC (12 years, 10 months ago) by gpertea
File size: 20443 byte(s)
Log Message:
finally fixed the placeGf clustering bug

Line User Rev File contents
1 gpertea 26 #include "gff_utils.h"
2    
3     extern bool verbose;
4     extern bool debugMode;
5    
6 gpertea 59 //bool debugState=false;
7    
8 gpertea 26 void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
9     if (seq==NULL) return;
10     int len=(seqlen>0)?seqlen:strlen(seq);
11     if (len<=0) return;
12     if (!defline.is_empty())
13     fprintf(f, ">%s\n",defline.chars());
14     int ilen=0;
15     for (int i=0; i < len; i++, ilen++) {
16     if (ilen == 70) {
17     fputc('\n', f);
18     ilen = 0;
19     }
20     putc(seq[i], f);
21     } //for
22     fputc('\n', f);
23     }
24    
25     int qsearch_gloci(uint x, GList<GffLocus>& loci) {
26     //binary search
27     //do the simplest tests first:
28     if (loci[0]->start>x) return 0;
29     if (loci.Last()->start<x) return -1;
30     uint istart=0;
31     int i=0;
32     int idx=-1;
33     int maxh=loci.Count()-1;
34     int l=0;
35     int h = maxh;
36     while (l <= h) {
37     i = (l+h)>>1;
38     istart=loci[i]->start;
39     if (istart < x) l = i + 1;
40     else {
41     if (istart == x) { //found matching coordinate here
42     idx=i;
43     while (idx<=maxh && loci[idx]->start==x) {
44     idx++;
45     }
46     return (idx>maxh) ? -1 : idx;
47     }
48     h = i - 1;
49     }
50     } //while
51     idx = l;
52     while (idx<=maxh && loci[idx]->start<=x) {
53     idx++;
54     }
55     return (idx>maxh) ? -1 : idx;
56     }
57    
58     int qsearch_rnas(uint x, GList<GffObj>& rnas) {
59     //binary search
60     //do the simplest tests first:
61     if (rnas[0]->start>x) return 0;
62     if (rnas.Last()->start<x) return -1;
63     uint istart=0;
64     int i=0;
65     int idx=-1;
66     int maxh=rnas.Count()-1;
67     int l=0;
68     int h = maxh;
69     while (l <= h) {
70     i = (l+h)>>1;
71     istart=rnas[i]->start;
72     if (istart < x) l = i + 1;
73     else {
74     if (istart == x) { //found matching coordinate here
75     idx=i;
76     while (idx<=maxh && rnas[idx]->start==x) {
77     idx++;
78     }
79     return (idx>maxh) ? -1 : idx;
80     }
81     h = i - 1;
82     }
83     } //while
84     idx = l;
85     while (idx<=maxh && rnas[idx]->start<=x) {
86     idx++;
87     }
88     return (idx>maxh) ? -1 : idx;
89     }
90    
91     int cmpRedundant(GffObj& a, GffObj& b) {
92     if (a.exons.Count()==b.exons.Count()) {
93     if (a.covlen==b.covlen) {
94     return strcmp(a.getID(), b.getID());
95     }
96     else return (a.covlen>b.covlen)? 1 : -1;
97     }
98     else return (a.exons.Count()>b.exons.Count())? 1: -1;
99     }
100    
101    
102     bool tMatch(GffObj& a, GffObj& b) {
103     //strict intron chain match, or single-exon perfect match
104     int imax=a.exons.Count()-1;
105     int jmax=b.exons.Count()-1;
106     int ovlen=0;
107     if (imax!=jmax) return false; //different number of introns
108    
109     if (imax==0) { //single-exon mRNAs
110     //if (equnspl) {
111     //fuzz match for single-exon transfrags:
112     // it's a match if they overlap at least 80% of max len
113     ovlen=a.exons[0]->overlapLen(b.exons[0]);
114     int maxlen=GMAX(a.covlen,b.covlen);
115     return (ovlen>=maxlen*0.8);
116     /*}
117     else {
118     //only exact match
119     ovlen=a.covlen;
120     return (a.exons[0]->start==b.exons[0]->start &&
121     a.exons[0]->end==b.exons[0]->end);
122    
123     }*/
124     }
125     //check intron overlaps
126     ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
127     ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
128     for (int i=1;i<=imax;i++) {
129     if (i<imax) ovlen+=a.exons[i]->len();
130     if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
131     (a.exons[i]->start!=b.exons[i]->start)) {
132     return false; //intron mismatch
133     }
134     }
135     return true;
136     }
137    
138    
139     bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
140     //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
141     //but it does not cross any intron-exon boundary of tj
142     int imax=ti.exons.Count()-1;
143     int jmax=tj.exons.Count()-1;
144     if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
145     int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
146     if (fuzzSpan) {
147     for (int j=0;j<=jmax;j++) {
148     //must NOT overlap the introns
149     if ((j>0 && ti.start<tj.exons[j]->start)
150     || (j<jmax && ti.end>tj.exons[j]->end))
151     return false;
152     if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
153     return true;
154     }
155     } else {
156     for (int j=0;j<=jmax;j++) {
157     //must NOT overlap the introns
158     if ((j>0 && ti.start<tj.exons[j]->start)
159     || (j<jmax && ti.end>tj.exons[j]->end))
160     return false;
161     //strict containment
162     if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
163     return true;
164     }
165     }
166     return false;
167     }
168    
169     GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) {
170     // matchAllIntrons==true: transcripts are considered "redundant" only if
171     // they have the exact same number of introns and same splice sites (or none)
172     // (single-exon transcripts can be also fully contained to be considered matching)
173     // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
174     // as long as no intron-exon boundaries are violated; also, a single-exon
175     // transcript will be collapsed if it's contained in one of the exons of the other
176     // fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic
177     // span of the other
178     //
179     // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
180     // (i.e. they may extend each-other in opposite directions)
181    
182     //if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned)
183     if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no span overlap at all
184     int imax=ti.exons.Count()-1;
185     int jmax=tj.exons.Count()-1;
186     GffObj* bigger=NULL;
187     GffObj* smaller=NULL;
188     if (matchAllIntrons) {
189     if (imax!=jmax) return false;
190     if (ti.covlen>tj.covlen) {
191     bigger=&ti;
192     if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL;
193     }
194     else { //ti.covlen<=tj.covlen
195     bigger=&tj;
196     if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL;
197     }
198     //check that all introns really match
199     for (int i=0;i<imax;i++) {
200     if (ti.exons[i]->end!=tj.exons[i]->end ||
201     ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
202     }
203     return bigger;
204     }
205     //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
206     int maxlen=0;
207     int minlen=0;
208     if (ti.covlen>tj.covlen) {
209     if (tj.exons.Count()>ti.exons.Count()) {
210     //exon count override
211     bigger=&tj;
212     smaller=&ti;
213     }
214     else {
215     bigger=&ti;
216     smaller=&tj;
217     }
218     maxlen=ti.covlen;
219     minlen=tj.covlen;
220     }
221     else { //tj has more bases
222     if (ti.exons.Count()>tj.exons.Count()) {
223     //exon count override
224     bigger=&ti;
225     smaller=&tj;
226     }
227     else {
228     bigger=&tj;
229     smaller=&ti;
230     }
231     maxlen=tj.covlen;
232     minlen=ti.covlen;
233     }
234     if (imax==0 && jmax==0) {
235     //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
236     if (fuzzSpan) {
237     return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
238     }
239     else {
240     return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
241     }
242     }
243     //containment is also considered redundancy
244     if (smaller->exons.Count()==1) {
245     //check if this single exon is contained in any of tj exons
246     //without violating any intron-exon boundaries
247     return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL);
248     }
249    
250     //--from here on: both are multi-exon transcripts, imax>0 && jmax>0
251     if (ti.exons[imax]->start<tj.exons[0]->end ||
252     tj.exons[jmax]->start<ti.exons[0]->end )
253     return NULL; //intron chains do not overlap at all
254    
255    
256     //checking full intron chain containment
257     uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
258     int i=1; //exon idx to the right of the current intron of ti
259     int j=1; //exon idx to the right of the current intron of tj
260     //find the first intron overlap:
261     while (i<=imax && j<=jmax) {
262     eistart=ti.exons[i-1]->end;
263     eiend=ti.exons[i]->start;
264     ejstart=tj.exons[j-1]->end;
265     ejend=tj.exons[j]->start;
266     if (ejend<eistart) { j++; continue; }
267     if (eiend<ejstart) { i++; continue; }
268     //we found an intron overlap
269     break;
270     }
271     if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
272     if ((i>1 && j>1) || i>imax || j>jmax) {
273     return NULL; //either no intron overlaps found at all
274     //or it's not the first intron for at least one of the transcripts
275     }
276     if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
277     if (j>i) {
278     //i==1, ti's start must not conflict with the previous intron of tj
279     if (ti.start<tj.exons[j-1]->start) return NULL;
280     //so i's first intron starts AFTER j's first intron
281     // then j must contain i, so i's last intron must end with or before j's last intron
282     if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
283     //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
284     }
285     else if (i>j) {
286     //j==1, tj's start must not conflict with the previous intron of ti
287     if (tj.start<ti.exons[i-1]->start) return NULL;
288     //so j's intron chain starts AFTER i's
289     // then i must contain j, so j's last intron must end with or before j's last intron
290     if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
291     //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
292     }
293     //now check if the rest of the introns overlap, in the same sequence
294     i++;
295     j++;
296     while (i<=imax && j<=jmax) {
297     if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
298     ti.exons[i]->start!=tj.exons[j]->start) return NULL;
299     i++;
300     j++;
301     }
302     i--;
303     j--;
304     if (i==imax && j<jmax) {
305     // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
306     if (ti.end>tj.exons[j]->end) return NULL;
307     }
308     else if (j==jmax && i<imax) {
309     if (tj.end>ti.exons[i]->end) return NULL;
310     }
311     return bigger;
312     }
313    
314    
315     int gseqCmpName(const pointer p1, const pointer p2) {
316     return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
317     }
318    
319    
320 gpertea 59 void printLocus(GffLocus* loc, const char* pre) {
321 gpertea 26 if (pre!=NULL) fprintf(stderr, "%s", pre);
322     GMessage(" [%d-%d] : ", loc->start, loc->end);
323     GMessage("%s",loc->rnas[0]->getID());
324     for (int i=1;i<loc->rnas.Count();i++) {
325     GMessage(",%s",loc->rnas[i]->getID());
326     }
327     GMessage("\n");
328     }
329    
330 gpertea 48 void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
331     //transfer CDS info to the container t if it's a larger protein
332     if (tfrom->CDstart==0) return;
333     if (t->CDstart) {
334     if (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start)
335     t->CDstart=tfrom->CDstart;
336     if (tfrom->CDend>t->CDend && tfrom->CDend<=t->end)
337     t->CDend=tfrom->CDend;
338     }
339     else { //no CDS info on container, just copy it from the contained
340 gpertea 56 t->addCDS(tfrom->CDstart, tfrom->CDend, tfrom->CDphase);
341 gpertea 48 }
342 gpertea 47 }
343    
344 gpertea 59 void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant,
345     bool matchAllIntrons, bool fuzzSpan) {
346 gpertea 26 GTData* tdata=new GTData(t);
347     gdata->tdata.Add(tdata);
348     int tidx=-1;
349 gpertea 59 /*
350     if (debug) {
351     GMessage(">>Placing transcript %s\n", t->getID());
352     debugState=true;
353     }
354     else debugState=false;
355     */
356 gpertea 26 if (t->exons.Count()>0)
357     tidx=gdata->rnas.Add(t); //added it in sorted order
358     else {
359     gdata->gfs.Add(t);
360     return; //nothing to do with these non-transcript objects
361     }
362     if (!doCluster) return;
363     if (gdata->loci.Count()==0) {
364     gdata->loci.Add(new GffLocus(t));
365     //GMessage(" <<make it first locus %d-%d \n",t->start, t->end);
366     return;
367     }
368 gpertea 59 /*
369 gpertea 26 //DEBUG: show available loci:
370 gpertea 59 if (debug) {
371     GMessage(" [%d loci already:\n", gdata->loci.Count());
372     for (int l=0;l<gdata->loci.Count();l++) {
373     printLocus(gdata->loci[l]);
374     }
375     }
376     */
377 gpertea 26 int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
378     //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
379     if (nidx==0) {
380     //cannot have any overlapping loci
381 gpertea 59 //if (debug) GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end);
382 gpertea 26 gdata->loci.Add(new GffLocus(t));
383     return;
384     }
385     if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
386     int lfound=0; //count of parent loci
387     GArray<int> mrgloci(false);
388     GList<GffLocus> tloci(true); //candidate parent loci to adopt this
389 gpertea 59 //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1);
390 gpertea 26 for (int l=nidx-1;l>=0;l--) {
391     GffLocus& loc=*(gdata->loci[l]);
392     if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
393     if (t->start>loc.end) {
394     if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
395     continue;
396     }
397 gpertea 59 if (loc.start>t->end) {
398     //this should never be the case if nidx was found correctly
399     GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
400     continue;
401     }
402     /*
403     if (debug) {
404     GMessage(" !range overlap found with locus ");
405     printLocus(&loc);
406     }
407     */
408 gpertea 26 if (loc.add_RNA(t)) {
409     //will add this transcript to loc
410     lfound++;
411     mrgloci.Add(l);
412     if (collapseRedundant) {
413     //compare to every single transcript in this locus
414     for (int ti=0;ti<loc.rnas.Count();ti++) {
415     if (loc.rnas[ti]==t) continue;
416     GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
417     //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
418     GffObj* container=NULL;
419     if (odata->replaced_by==NULL &&
420     (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
421     if (container==t) {
422     odata->replaced_by=t;
423 gpertea 48 preserveContainedCDS(t, loc.rnas[ti]);
424 gpertea 26 }
425     else {
426     tdata->replaced_by=loc.rnas[ti];
427 gpertea 48 preserveContainedCDS(loc.rnas[ti], t);
428 gpertea 26 }
429     }
430     }//for each transcript in the exon-overlapping locus
431     } //if doCollapseRedundant
432     } //overlapping locus
433 gpertea 59 } //for each existing locus
434 gpertea 26 if (lfound==0) {
435     //overlapping loci not found, create a locus with only this mRNA
436 gpertea 59 /* if (debug) {
437     GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end);
438     }
439     */
440 gpertea 26 int addidx=gdata->loci.Add(new GffLocus(t));
441     if (addidx<0) {
442 gpertea 59 //should never be the case!
443 gpertea 26 GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
444     }
445     }
446 gpertea 59 else { //found at least one overlapping locus
447     lfound--;
448     int locidx=mrgloci[lfound];
449     GffLocus& loc=*(gdata->loci[locidx]);
450     //last locus index found is also the smallest index
451     if (lfound>0) {
452     //more than one loci found parenting this mRNA, merge loci
453     /* if (debug)
454     GMessage(" merging %d loci \n",lfound);
455     */
456 gpertea 26 for (int l=0;l<lfound;l++) {
457 gpertea 59 int mlidx=mrgloci[l];
458     loc.addMerge(*(gdata->loci[mlidx]), t);
459     gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
460 gpertea 26 }
461 gpertea 59 }
462     int i=locidx;
463     while (i>0 && loc.start<gdata->loci[i-1]->start) {
464     //bubble down until it's in the proper order
465     i--;
466     gdata->loci.Swap(i,i+1);
467     }
468     }//found at least one overlapping locus
469 gpertea 26 }
470    
471     void collectLocusData(GList<GenomicSeqData>& ref_data) {
472     int locus_num=0;
473     for (int g=0;g<ref_data.Count();g++) {
474     GenomicSeqData* gdata=ref_data[g];
475     for (int l=0;l<gdata->loci.Count();l++) {
476     GffLocus& loc=*(gdata->loci[l]);
477     GHash<int> gnames(true); //gene names in this locus
478     GHash<int> geneids(true); //Entrez GeneID: numbers
479     for (int i=0;i<loc.rnas.Count();i++) {
480     GffObj& t=*(loc.rnas[i]);
481     GStr gname(t.getGeneName());
482     if (!gname.is_empty()) {
483     gname.upper();
484     int* prevg=gnames.Find(gname.chars());
485     if (prevg!=NULL) (*prevg)++;
486     else gnames.Add(gname, new int(1));
487     }
488     //parse GeneID xrefs, if any:
489     GStr xrefs(t.getAttr("xrefs"));
490     if (!xrefs.is_empty()) {
491     xrefs.startTokenize(",");
492     GStr token;
493     while (xrefs.nextToken(token)) {
494     token.upper();
495     if (token.startsWith("GENEID:")) {
496     token.cut(0,token.index(':')+1);
497     int* prevg=geneids.Find(token.chars());
498     if (prevg!=NULL) (*prevg)++;
499     else geneids.Add(token, new int(1));
500     }
501     } //for each xref
502     } //xrefs parsing
503     }//for each transcript
504     locus_num++;
505     loc.locus_num=locus_num;
506     if (gnames.Count()>0) { //collect all gene names associated to this locus
507     gnames.startIterate();
508     int* gfreq=NULL;
509     char* key=NULL;
510     while ((gfreq=gnames.NextData(key))!=NULL) {
511     loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
512     }
513     } //added collected gene_names
514     if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus
515     geneids.startIterate();
516     int* gfreq=NULL;
517     char* key=NULL;
518     while ((gfreq=geneids.NextData(key))!=NULL) {
519     loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
520     }
521     }
522     } //for each locus
523     }//for each genomic sequence
524     }
525    
526    
527     void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
528     bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
529     GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
530     gffr->showWarnings(this->showWarnings);
531     // keepAttrs mergeCloseExons noExonAttr
532     gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
533     //int redundant=0; //redundant annotation discarded
534     if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
535     //int rna_deleted=0;
536     //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
537     for (int k=0;k<gffr->gflst.Count();k++) {
538     GffObj* m=gffr->gflst[k];
539     if (strcmp(m->getFeatureName(), "locus")==0 &&
540     m->getAttr("transcripts")!=NULL) {
541     continue;
542     }
543    
544     char* rloc=m->getAttr("locus");
545     if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
546     m->removeAttr("locus", rloc);
547     }
548     if (m->exons.Count()==0 && m->children.Count()==0) {
549     //a non-mRNA feature with no subfeatures
550     //add a dummy exon just to have the generic exon checking work
551     m->addExon(m->start,m->end);
552     }
553     GList<GffObj> gfadd(false,false);
554     if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
555     continue;
556     }
557     m->isUsed(true); //so the gffreader won't destroy it
558     int i=-1;
559     GenomicSeqData f(m->gseq_id);
560     GenomicSeqData* gdata=NULL;
561    
562     if (seqdata.Found(&f,i)) gdata=seqdata[i];
563     else { //entry not created yet for this genomic seq
564     gdata=new GenomicSeqData(m->gseq_id);
565     seqdata.Add(gdata);
566     }
567     for (int k=0;k<gfadd.Count();k++) {
568     placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
569     }
570     placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
571     } //for each read gffObj
572     //if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
573     if (f!=stdin) { fclose(f); f=NULL; }
574     delete gffr;
575     }