gclib/gffread/gff_utils.cpp

#include "gff_utils.h"

extern bool verbose;
extern bool debugMode;

//bool debugState=false;

void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
 if (seq==NULL) return;
 int len=(seqlen>0)?seqlen:strlen(seq);
 if (len<=0) return;
 if (!defline.is_empty())
     fprintf(f, ">%s\n",defline.chars());
 int ilen=0;
 for (int i=0; i < len; i++, ilen++) {
   if (ilen == 70) {
     fputc('\n', f);
     ilen = 0;
     }
   putc(seq[i], f);
   } //for
 fputc('\n', f);
}

int qsearch_gloci(uint x, GList<GffLocus>& loci) {
  //binary search
  //do the simplest tests first:
  if (loci[0]->start>x) return 0;
  if (loci.Last()->start<x) return -1;
  uint istart=0;
  int i=0;
  int idx=-1;
  int maxh=loci.Count()-1;
  int l=0;
  int h = maxh;
  while (l <= h) {
     i = (l+h)>>1;
     istart=loci[i]->start;
     if (istart < x)  l = i + 1;
          else {
             if (istart == x) { //found matching coordinate here
                  idx=i;
                  while (idx<=maxh && loci[idx]->start==x) {
                     idx++;
                     }
                  return (idx>maxh) ? -1 : idx;
                  }
             h = i - 1;
             }
     } //while
 idx = l;
 while (idx<=maxh && loci[idx]->start<=x) {
    idx++;
    }
 return (idx>maxh) ? -1 : idx;
}

int qsearch_rnas(uint x, GList<GffObj>& rnas) {
  //binary search
  //do the simplest tests first:
  if (rnas[0]->start>x) return 0;
  if (rnas.Last()->start<x) return -1;
  uint istart=0;
  int i=0;
  int idx=-1;
  int maxh=rnas.Count()-1;
  int l=0;
  int h = maxh;
  while (l <= h) {
     i = (l+h)>>1;
     istart=rnas[i]->start;
     if (istart < x)  l = i + 1;
          else {
             if (istart == x) { //found matching coordinate here
                  idx=i;
                  while (idx<=maxh && rnas[idx]->start==x) {
                     idx++;
                     }
                  return (idx>maxh) ? -1 : idx;
                  }
             h = i - 1;
             }
     } //while
 idx = l;
 while (idx<=maxh && rnas[idx]->start<=x) {
    idx++;
    }
 return (idx>maxh) ? -1 : idx;
}

int cmpRedundant(GffObj& a, GffObj& b) {
  if (a.exons.Count()==b.exons.Count()) {
     if (a.covlen==b.covlen) {
       return strcmp(a.getID(), b.getID());
       }
     else return (a.covlen>b.covlen)? 1 : -1;
     }
   else return (a.exons.Count()>b.exons.Count())? 1: -1;
}


bool tMatch(GffObj& a, GffObj& b) {
  //strict intron chain match, or single-exon perfect match
  int imax=a.exons.Count()-1;
  int jmax=b.exons.Count()-1;
  int ovlen=0;
  if (imax!=jmax) return false; //different number of introns

  if (imax==0) { //single-exon mRNAs
    //if (equnspl) {
      //fuzz match for single-exon transfrags: 
      // it's a match if they overlap at least 80% of max len
      ovlen=a.exons[0]->overlapLen(b.exons[0]);
      int maxlen=GMAX(a.covlen,b.covlen);
      return (ovlen>=maxlen*0.8);
    /*}
    else {
      //only exact match
      ovlen=a.covlen;
      return (a.exons[0]->start==b.exons[0]->start &&
          a.exons[0]->end==b.exons[0]->end);
      
       }*/
     }
  //check intron overlaps
  ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
  ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
  for (int i=1;i<=imax;i++) {
    if (i<imax) ovlen+=a.exons[i]->len();
    if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
      (a.exons[i]->start!=b.exons[i]->start)) {
            return false; //intron mismatch
    }
  }
  return true;
}


bool unsplContained(GffObj& ti, GffObj&  tj, bool fuzzSpan) {
 //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
 //but it does not cross any intron-exon boundary of tj
  int imax=ti.exons.Count()-1;
  int jmax=tj.exons.Count()-1;
  if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
  int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
  if (fuzzSpan) {
    for (int j=0;j<=jmax;j++) {
       //must NOT overlap the introns
       if ((j>0 && ti.start<tj.exons[j]->start) 
          || (j<jmax && ti.end>tj.exons[j]->end))
         return false;
       if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
              return true;
       }
      } else {
    for (int j=0;j<=jmax;j++) {
       //must NOT overlap the introns
       if ((j>0 && ti.start<tj.exons[j]->start) 
          || (j<jmax && ti.end>tj.exons[j]->end))
         return false;
         //strict containment
       if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start) 
            return true;
       }
      }
 return false;
}

GffObj* redundantTranscripts(GffObj& ti, GffObj&  tj, bool matchAllIntrons, bool fuzzSpan) {
  // matchAllIntrons==true:  transcripts are considered "redundant" only if
  //                   they have the exact same number of introns and same splice sites (or none)
  //                 (single-exon transcripts can be also fully contained to be considered matching)
  // matchAllIntrons==false: an intron chain could be a subset of a "container" chain, 
  //                   as long as no intron-exon boundaries are violated; also, a single-exon 
  //                   transcript will be collapsed if it's contained in one of the exons of the other
  // fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic 
  //                  span of the other 
  // 
  // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained 
  //                 (i.e. they may extend each-other in opposite directions)
  
  //if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned)
 if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no span overlap at all
 int imax=ti.exons.Count()-1;
 int jmax=tj.exons.Count()-1;
 GffObj* bigger=NULL;
 GffObj* smaller=NULL;
 if (matchAllIntrons) {
   if (imax!=jmax) return false;
   if (ti.covlen>tj.covlen) {
       bigger=&ti;
       if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL;
       }
     else { //ti.covlen<=tj.covlen
       bigger=&tj;
       if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL;
       }
   //check that all introns really match
   for (int i=0;i<imax;i++) {
     if (ti.exons[i]->end!=tj.exons[i]->end || 
         ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
     }
   return bigger;
   }
 //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
 int maxlen=0;
 int minlen=0;
 if (ti.covlen>tj.covlen) {
      if (tj.exons.Count()>ti.exons.Count()) {
          //exon count override
          bigger=&tj;
          smaller=&ti;
          }
        else {
          bigger=&ti;
          smaller=&tj;
          }
      maxlen=ti.covlen;
      minlen=tj.covlen;
      }
   else { //tj has more bases
      if (ti.exons.Count()>tj.exons.Count()) {
          //exon count override
          bigger=&ti;
          smaller=&tj;
          }
        else {
          bigger=&tj;
          smaller=&ti;
          }
      maxlen=tj.covlen;
      minlen=ti.covlen;
      }
 if (imax==0 && jmax==0) {
     //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
     if (fuzzSpan) {
         return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
         }
       else {
         return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
         }
     }
 //containment is also considered redundancy
 if (smaller->exons.Count()==1) {
   //check if this single exon is contained in any of tj exons
   //without violating any intron-exon boundaries
   return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL);
   }

 //--from here on: both are multi-exon transcripts, imax>0 && jmax>0
  if (ti.exons[imax]->start<tj.exons[0]->end ||
     tj.exons[jmax]->start<ti.exons[0]->end )
         return NULL; //intron chains do not overlap at all
 
 
 //checking full intron chain containment
 uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
 int i=1; //exon idx to the right of the current intron of ti
 int j=1; //exon idx to the right of the current intron of tj
 //find the first intron overlap:
 while (i<=imax && j<=jmax) {
    eistart=ti.exons[i-1]->end;
    eiend=ti.exons[i]->start;
    ejstart=tj.exons[j-1]->end;
    ejend=tj.exons[j]->start;
    if (ejend<eistart) { j++; continue; }
    if (eiend<ejstart) { i++; continue; }
    //we found an intron overlap
    break;
    }
 if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
 if ((i>1 && j>1) || i>imax || j>jmax) {
     return NULL; //either no intron overlaps found at all
                  //or it's not the first intron for at least one of the transcripts
     }
 if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
 if (j>i) {
   //i==1, ti's start must not conflict with the previous intron of tj
   if (ti.start<tj.exons[j-1]->start) return NULL;
   //so i's first intron starts AFTER j's first intron
   // then j must contain i, so i's last intron must end with or before j's last intron
   if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
      //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
   }
  else if (i>j) {
     //j==1, tj's start must not conflict with the previous intron of ti
     if (tj.start<ti.exons[i-1]->start) return NULL;
     //so j's intron chain starts AFTER i's
     // then i must contain j, so j's last intron must end with or before j's last intron
     if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
        //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
     }
 //now check if the rest of the introns overlap, in the same sequence
 i++;
 j++;
 while (i<=imax && j<=jmax) {
  if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
      ti.exons[i]->start!=tj.exons[j]->start) return NULL;
  i++;
  j++;
  }
 i--;
 j--;
 if (i==imax && j<jmax) {
   // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
   if (ti.end>tj.exons[j]->end) return NULL;
   }
 else if (j==jmax && i<imax) {
   if (tj.end>ti.exons[i]->end) return NULL;
   }
 return bigger;
}


int gseqCmpName(const pointer p1, const pointer p2) {
 return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
}


void printLocus(GffLocus* loc, const char* pre) {
  if (pre!=NULL) fprintf(stderr, "%s", pre);
  GMessage(" [%d-%d] : ", loc->start, loc->end);
  GMessage("%s",loc->rnas[0]->getID());
  for (int i=1;i<loc->rnas.Count();i++) {
    GMessage(",%s",loc->rnas[i]->getID());
    }
  GMessage("\n");
}

void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
 //transfer CDS info to the container t if it's a larger protein
 if (tfrom->CDstart==0) return;
 if (t->CDstart) {
   if (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start)
      t->CDstart=tfrom->CDstart;
   if (tfrom->CDend>t->CDend && tfrom->CDend<=t->end)
      t->CDend=tfrom->CDend;
   }
  else { //no CDS info on container, just copy it from the contained
   t->addCDS(tfrom->CDstart, tfrom->CDend, tfrom->CDphase);
   }
}

void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant,
                                               bool matchAllIntrons, bool fuzzSpan) {
  GTData* tdata=new GTData(t);
  gdata->tdata.Add(tdata);
  int tidx=-1;
  /*
  if (debug) {
     GMessage(">>Placing transcript %s\n", t->getID());
     debugState=true;
     }
    else debugState=false; 
   */
  if (t->exons.Count()>0)
              tidx=gdata->rnas.Add(t); //added it in sorted order
            else {
              gdata->gfs.Add(t);
              return; //nothing to do with these non-transcript objects
              }
  if (!doCluster) return;
  if (gdata->loci.Count()==0) {
       gdata->loci.Add(new GffLocus(t));
       //GMessage("  <<make it first locus %d-%d \n",t->start, t->end);
       return;
       }
   /*    
  //DEBUG: show available loci:
   if (debug) {
    GMessage("  [%d loci already:\n", gdata->loci.Count());
    for (int l=0;l<gdata->loci.Count();l++) {
       printLocus(gdata->loci[l]);
       }
    }
  */
  int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
  //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
  if (nidx==0) {
     //cannot have any overlapping loci
     //if (debug) GMessage("  <<no ovls possible, create locus %d-%d \n",t->start, t->end);
     gdata->loci.Add(new GffLocus(t));
     return;
     }
  if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
  int lfound=0; //count of parent loci
  GArray<int> mrgloci(false);
  GList<GffLocus> tloci(true); //candidate parent loci to adopt this
  //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1);
  for (int l=nidx-1;l>=0;l--) {
      GffLocus& loc=*(gdata->loci[l]);
      if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
      if (t->start>loc.end) {
           if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
           continue;
           }
      if (loc.start>t->end) {
               //this should never be the case if nidx was found correctly
               GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
               continue;
               }
      /*
      if (debug) {
          GMessage(" !range overlap found with locus ");
          printLocus(&loc);
          }
      */
      if (loc.add_RNA(t)) {
         //will add this transcript to loc
         lfound++;
         mrgloci.Add(l);
         if (collapseRedundant) {
           //compare to every single transcript in this locus
           for (int ti=0;ti<loc.rnas.Count();ti++) {
                 if (loc.rnas[ti]==t) continue;
                 GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
                 //GMessage("  ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
                 GffObj* container=NULL;
                 if (odata->replaced_by==NULL && 
                      (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
                     if (container==t) {
                        odata->replaced_by=t;
                        preserveContainedCDS(t, loc.rnas[ti]);
                        }
                     else {
                        tdata->replaced_by=loc.rnas[ti];
                        preserveContainedCDS(loc.rnas[ti], t);
                        }
                     }
              }//for each transcript in the exon-overlapping locus
          } //if doCollapseRedundant
         } //overlapping locus
      } //for each existing locus
  if (lfound==0) {
      //overlapping loci not found, create a locus with only this mRNA
      /* if (debug) {
        GMessage("  overlapping locus not found, create locus %d-%d \n",t->start, t->end);
        }
      */
      int addidx=gdata->loci.Add(new GffLocus(t));
      if (addidx<0) {
         //should never be the case!
         GMessage("  WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
         }
      }
   else { //found at least one overlapping locus
     lfound--;
     int locidx=mrgloci[lfound];
     GffLocus& loc=*(gdata->loci[locidx]);
     //last locus index found is also the smallest index
     if (lfound>0) {
       //more than one loci found parenting this mRNA, merge loci
       /* if (debug)
          GMessage(" merging %d loci \n",lfound);
       */
       for (int l=0;l<lfound;l++) {
          int mlidx=mrgloci[l]; 
          loc.addMerge(*(gdata->loci[mlidx]), t);
          gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
          }
       }
     int i=locidx;  
     while (i>0 && loc.start<gdata->loci[i-1]->start) {
       //bubble down until it's in the proper order
       i--;
       gdata->loci.Swap(i,i+1);
       }
     }//found at least one overlapping locus
}

void collectLocusData(GList<GenomicSeqData>& ref_data) {
  int locus_num=0;
  for (int g=0;g<ref_data.Count();g++) {
    GenomicSeqData* gdata=ref_data[g];
    for (int l=0;l<gdata->loci.Count();l++) {
      GffLocus& loc=*(gdata->loci[l]);
      GHash<int> gnames(true); //gene names in this locus
      GHash<int> geneids(true); //Entrez GeneID: numbers
      for (int i=0;i<loc.rnas.Count();i++) {
        GffObj& t=*(loc.rnas[i]);
        GStr gname(t.getGeneName());
        if (!gname.is_empty()) {
           gname.upper();
           int* prevg=gnames.Find(gname.chars());
           if (prevg!=NULL) (*prevg)++;
                  else gnames.Add(gname, new int(1));
           }
        //parse GeneID xrefs, if any:
        GStr xrefs(t.getAttr("xrefs"));
        if (!xrefs.is_empty()) {
          xrefs.startTokenize(",");
          GStr token;
          while (xrefs.nextToken(token)) {
            token.upper();
            if (token.startsWith("GENEID:")) {
              token.cut(0,token.index(':')+1);
              int* prevg=geneids.Find(token.chars());
              if (prevg!=NULL) (*prevg)++;
                     else geneids.Add(token, new int(1));
              }
            } //for each xref
          } //xrefs parsing
        }//for each transcript
      locus_num++;
      loc.locus_num=locus_num;
      if (gnames.Count()>0) { //collect all gene names associated to this locus
         gnames.startIterate();
         int* gfreq=NULL;
         char* key=NULL;
         while ((gfreq=gnames.NextData(key))!=NULL) {
            loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
            }
         } //added collected gene_names
      if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus
         geneids.startIterate();
         int* gfreq=NULL;
         char* key=NULL;
         while ((gfreq=geneids.NextData(key))!=NULL) {
           loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
            }
          }
      } //for each locus
  }//for each genomic sequence
}


void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate, 
                              bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
   GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
   gffr->showWarnings(this->showWarnings);
   //           keepAttrs   mergeCloseExons  noExonAttr
   gffr->readAll(this->fullAttributes,    this->mergeCloseExons,  this->noExonAttrs);
  //int redundant=0; //redundant annotation discarded
  if (verbose) GMessage("   .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
  //int rna_deleted=0;
  //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
  for (int k=0;k<gffr->gflst.Count();k++) {
     GffObj* m=gffr->gflst[k];
     if (strcmp(m->getFeatureName(), "locus")==0 && 
          m->getAttr("transcripts")!=NULL) {
        continue;
        }
     
     char* rloc=m->getAttr("locus");
     if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
        m->removeAttr("locus", rloc);
        }
     if (m->exons.Count()==0 && m->children.Count()==0) {
       //a non-mRNA feature with no subfeatures
       //add a dummy exon just to have the generic exon checking work
       m->addExon(m->start,m->end);
       }
     GList<GffObj> gfadd(false,false);
     if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
       continue;
       }
     m->isUsed(true); //so the gffreader won't destroy it
     int i=-1;
     GenomicSeqData f(m->gseq_id);
     GenomicSeqData* gdata=NULL;
     
     if (seqdata.Found(&f,i)) gdata=seqdata[i];
         else { //entry not created yet for this genomic seq
           gdata=new GenomicSeqData(m->gseq_id);
           seqdata.Add(gdata);
           }
    for (int k=0;k<gfadd.Count();k++) {
      placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
      }
    placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
    } //for each read gffObj
   //if (verbose) GMessage("  .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
   if (f!=stdin) { fclose(f); f=NULL; }
   delete gffr;
}
Revision:	59
Committed:	Fri Sep 9 18:00:23 2011 UTC (12 years, 10 months ago) by gpertea
File size:	20443 byte(s)
Log Message:	finally fixed the placeGf clustering bug
Line	User	Rev	File contents
1	gpertea	26	#include "gff_utils.h"
2
3			extern bool verbose;
4			extern bool debugMode;
5
6	gpertea	59	//bool debugState=false;
7
8	gpertea	26	void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
9			if (seq==NULL) return;
10			int len=(seqlen>0)?seqlen:strlen(seq);
11			if (len<=0) return;
12			if (!defline.is_empty())
13			fprintf(f, ">%s\n",defline.chars());
14			int ilen=0;
15			for (int i=0; i < len; i++, ilen++) {
16			if (ilen == 70) {
17			fputc('\n', f);
18			ilen = 0;
19			}
20			putc(seq[i], f);
21			} //for
22			fputc('\n', f);
23			}
24
25			int qsearch_gloci(uint x, GList<GffLocus>& loci) {
26			//binary search
27			//do the simplest tests first:
28			if (loci[0]->start>x) return 0;
29			if (loci.Last()->start<x) return -1;
30			uint istart=0;
31			int i=0;
32			int idx=-1;
33			int maxh=loci.Count()-1;
34			int l=0;
35			int h = maxh;
36			while (l <= h) {
37			i = (l+h)>>1;
38			istart=loci[i]->start;
39			if (istart < x) l = i + 1;
40			else {
41			if (istart == x) { //found matching coordinate here
42			idx=i;
43			while (idx<=maxh && loci[idx]->start==x) {
44			idx++;
45			}
46			return (idx>maxh) ? -1 : idx;
47			}
48			h = i - 1;
49			}
50			} //while
51			idx = l;
52			while (idx<=maxh && loci[idx]->start<=x) {
53			idx++;
54			}
55			return (idx>maxh) ? -1 : idx;
56			}
57
58			int qsearch_rnas(uint x, GList<GffObj>& rnas) {
59			//binary search
60			//do the simplest tests first:
61			if (rnas[0]->start>x) return 0;
62			if (rnas.Last()->start<x) return -1;
63			uint istart=0;
64			int i=0;
65			int idx=-1;
66			int maxh=rnas.Count()-1;
67			int l=0;
68			int h = maxh;
69			while (l <= h) {
70			i = (l+h)>>1;
71			istart=rnas[i]->start;
72			if (istart < x) l = i + 1;
73			else {
74			if (istart == x) { //found matching coordinate here
75			idx=i;
76			while (idx<=maxh && rnas[idx]->start==x) {
77			idx++;
78			}
79			return (idx>maxh) ? -1 : idx;
80			}
81			h = i - 1;
82			}
83			} //while
84			idx = l;
85			while (idx<=maxh && rnas[idx]->start<=x) {
86			idx++;
87			}
88			return (idx>maxh) ? -1 : idx;
89			}
90
91			int cmpRedundant(GffObj& a, GffObj& b) {
92			if (a.exons.Count()==b.exons.Count()) {
93			if (a.covlen==b.covlen) {
94			return strcmp(a.getID(), b.getID());
95			}
96			else return (a.covlen>b.covlen)? 1 : -1;
97			}
98			else return (a.exons.Count()>b.exons.Count())? 1: -1;
99			}
100
101
102			bool tMatch(GffObj& a, GffObj& b) {
103			//strict intron chain match, or single-exon perfect match
104			int imax=a.exons.Count()-1;
105			int jmax=b.exons.Count()-1;
106			int ovlen=0;
107			if (imax!=jmax) return false; //different number of introns
108
109			if (imax==0) { //single-exon mRNAs
110			//if (equnspl) {
111			//fuzz match for single-exon transfrags:
112			// it's a match if they overlap at least 80% of max len
113			ovlen=a.exons[0]->overlapLen(b.exons[0]);
114			int maxlen=GMAX(a.covlen,b.covlen);
115			return (ovlen>=maxlen*0.8);
116			/*}
117			else {
118			//only exact match
119			ovlen=a.covlen;
120			return (a.exons[0]->start==b.exons[0]->start &&
121			a.exons[0]->end==b.exons[0]->end);
122
123			}*/
124			}
125			//check intron overlaps
126			ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
127			ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
128			for (int i=1;i<=imax;i++) {
129			if (i<imax) ovlen+=a.exons[i]->len();
130			if ((a.exons[i-1]->end!=b.exons[i-1]->end) \|\|
131			(a.exons[i]->start!=b.exons[i]->start)) {
132			return false; //intron mismatch
133			}
134			}
135			return true;
136			}
137
138
139			bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
140			//returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
141			//but it does not cross any intron-exon boundary of tj
142			int imax=ti.exons.Count()-1;
143			int jmax=tj.exons.Count()-1;
144			if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
145			int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
146			if (fuzzSpan) {
147			for (int j=0;j<=jmax;j++) {
148			//must NOT overlap the introns
149			if ((j>0 && ti.start<tj.exons[j]->start)
150			\|\| (j<jmax && ti.end>tj.exons[j]->end))
151			return false;
152			if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
153			return true;
154			}
155			} else {
156			for (int j=0;j<=jmax;j++) {
157			//must NOT overlap the introns
158			if ((j>0 && ti.start<tj.exons[j]->start)
159			\|\| (j<jmax && ti.end>tj.exons[j]->end))
160			return false;
161			//strict containment
162			if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
163			return true;
164			}
165			}
166			return false;
167			}
168
169			GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) {
170			// matchAllIntrons==true: transcripts are considered "redundant" only if
171			// they have the exact same number of introns and same splice sites (or none)
172			// (single-exon transcripts can be also fully contained to be considered matching)
173			// matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
174			// as long as no intron-exon boundaries are violated; also, a single-exon
175			// transcript will be collapsed if it's contained in one of the exons of the other
176			// fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic
177			// span of the other
178			//
179			// fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
180			// (i.e. they may extend each-other in opposite directions)
181
182			//if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned)
183			if (ti.start>=tj.end \|\| tj.start>=ti.end \|\| tj.strand!=ti.strand) return NULL; //no span overlap at all
184			int imax=ti.exons.Count()-1;
185			int jmax=tj.exons.Count()-1;
186			GffObj* bigger=NULL;
187			GffObj* smaller=NULL;
188			if (matchAllIntrons) {
189			if (imax!=jmax) return false;
190			if (ti.covlen>tj.covlen) {
191			bigger=&ti;
192			if (!fuzzSpan && (ti.start>tj.start \|\| ti.end<tj.end)) return NULL;
193			}
194			else { //ti.covlen<=tj.covlen
195			bigger=&tj;
196			if (!fuzzSpan && (tj.start>ti.start \|\| tj.end<ti.end)) return NULL;
197			}
198			//check that all introns really match
199			for (int i=0;i<imax;i++) {
200			if (ti.exons[i]->end!=tj.exons[i]->end \|\|
201			ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
202			}
203			return bigger;
204			}
205			//--- matchAllIntrons==false: intron-chain containment is also considered redundancy
206			int maxlen=0;
207			int minlen=0;
208			if (ti.covlen>tj.covlen) {
209			if (tj.exons.Count()>ti.exons.Count()) {
210			//exon count override
211			bigger=&tj;
212			smaller=&ti;
213			}
214			else {
215			bigger=&ti;
216			smaller=&tj;
217			}
218			maxlen=ti.covlen;
219			minlen=tj.covlen;
220			}
221			else { //tj has more bases
222			if (ti.exons.Count()>tj.exons.Count()) {
223			//exon count override
224			bigger=&ti;
225			smaller=&tj;
226			}
227			else {
228			bigger=&tj;
229			smaller=&ti;
230			}
231			maxlen=tj.covlen;
232			minlen=ti.covlen;
233			}
234			if (imax==0 && jmax==0) {
235			//single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
236			if (fuzzSpan) {
237			return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
238			}
239			else {
240			return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
241			}
242			}
243			//containment is also considered redundancy
244			if (smaller->exons.Count()==1) {
245			//check if this single exon is contained in any of tj exons
246			//without violating any intron-exon boundaries
247			return (unsplContained(smaller, bigger, fuzzSpan) ? bigger : NULL);
248			}
249
250			//--from here on: both are multi-exon transcripts, imax>0 && jmax>0
251			if (ti.exons[imax]->start<tj.exons[0]->end \|\|
252			tj.exons[jmax]->start<ti.exons[0]->end )
253			return NULL; //intron chains do not overlap at all
254
255
256			//checking full intron chain containment
257			uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
258			int i=1; //exon idx to the right of the current intron of ti
259			int j=1; //exon idx to the right of the current intron of tj
260			//find the first intron overlap:
261			while (i<=imax && j<=jmax) {
262			eistart=ti.exons[i-1]->end;
263			eiend=ti.exons[i]->start;
264			ejstart=tj.exons[j-1]->end;
265			ejend=tj.exons[j]->start;
266			if (ejend<eistart) { j++; continue; }
267			if (eiend<ejstart) { i++; continue; }
268			//we found an intron overlap
269			break;
270			}
271			if (!fuzzSpan && (bigger->start>smaller->start \|\| bigger->end < smaller->end)) return NULL;
272			if ((i>1 && j>1) \|\| i>imax \|\| j>jmax) {
273			return NULL; //either no intron overlaps found at all
274			//or it's not the first intron for at least one of the transcripts
275			}
276			if (eistart!=ejstart \|\| eiend!=ejend) return NULL; //not an exact intron match
277			if (j>i) {
278			//i==1, ti's start must not conflict with the previous intron of tj
279			if (ti.start<tj.exons[j-1]->start) return NULL;
280			//so i's first intron starts AFTER j's first intron
281			// then j must contain i, so i's last intron must end with or before j's last intron
282			if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
283			//comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
284			}
285			else if (i>j) {
286			//j==1, tj's start must not conflict with the previous intron of ti
287			if (tj.start<ti.exons[i-1]->start) return NULL;
288			//so j's intron chain starts AFTER i's
289			// then i must contain j, so j's last intron must end with or before j's last intron
290			if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
291			//comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
292			}
293			//now check if the rest of the introns overlap, in the same sequence
294			i++;
295			j++;
296			while (i<=imax && j<=jmax) {
297			if (ti.exons[i-1]->end!=tj.exons[j-1]->end \|\|
298			ti.exons[i]->start!=tj.exons[j]->start) return NULL;
299			i++;
300			j++;
301			}
302			i--;
303			j--;
304			if (i==imax && j<jmax) {
305			// tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
306			if (ti.end>tj.exons[j]->end) return NULL;
307			}
308			else if (j==jmax && i<imax) {
309			if (tj.end>ti.exons[i]->end) return NULL;
310			}
311			return bigger;
312			}
313
314
315			int gseqCmpName(const pointer p1, const pointer p2) {
316			return strcmp(((GenomicSeqData)p1)->gseq_name, ((GenomicSeqData)p2)->gseq_name);
317			}
318
319
320	gpertea	59	void printLocus(GffLocus* loc, const char* pre) {
321	gpertea	26	if (pre!=NULL) fprintf(stderr, "%s", pre);
322			GMessage(" [%d-%d] : ", loc->start, loc->end);
323			GMessage("%s",loc->rnas[0]->getID());
324			for (int i=1;i<loc->rnas.Count();i++) {
325			GMessage(",%s",loc->rnas[i]->getID());
326			}
327			GMessage("\n");
328			}
329
330	gpertea	48	void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
331			//transfer CDS info to the container t if it's a larger protein
332			if (tfrom->CDstart==0) return;
333			if (t->CDstart) {
334			if (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start)
335			t->CDstart=tfrom->CDstart;
336			if (tfrom->CDend>t->CDend && tfrom->CDend<=t->end)
337			t->CDend=tfrom->CDend;
338			}
339			else { //no CDS info on container, just copy it from the contained
340	gpertea	56	t->addCDS(tfrom->CDstart, tfrom->CDend, tfrom->CDphase);
341	gpertea	48	}
342	gpertea	47	}
343
344	gpertea	59	void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant,
345			bool matchAllIntrons, bool fuzzSpan) {
346	gpertea	26	GTData* tdata=new GTData(t);
347			gdata->tdata.Add(tdata);
348			int tidx=-1;
349	gpertea	59	/*
350			if (debug) {
351			GMessage(">>Placing transcript %s\n", t->getID());
352			debugState=true;
353			}
354			else debugState=false;
355			*/
356	gpertea	26	if (t->exons.Count()>0)
357			tidx=gdata->rnas.Add(t); //added it in sorted order
358			else {
359			gdata->gfs.Add(t);
360			return; //nothing to do with these non-transcript objects
361			}
362			if (!doCluster) return;
363			if (gdata->loci.Count()==0) {
364			gdata->loci.Add(new GffLocus(t));
365			//GMessage(" <<make it first locus %d-%d \n",t->start, t->end);
366			return;
367			}
368	gpertea	59	/*
369	gpertea	26	//DEBUG: show available loci:
370	gpertea	59	if (debug) {
371			GMessage(" [%d loci already:\n", gdata->loci.Count());
372			for (int l=0;l<gdata->loci.Count();l++) {
373			printLocus(gdata->loci[l]);
374			}
375			}
376			*/
377	gpertea	26	int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
378			//GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
379			if (nidx==0) {
380			//cannot have any overlapping loci
381	gpertea	59	//if (debug) GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end);
382	gpertea	26	gdata->loci.Add(new GffLocus(t));
383			return;
384			}
385			if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
386			int lfound=0; //count of parent loci
387			GArray<int> mrgloci(false);
388			GList<GffLocus> tloci(true); //candidate parent loci to adopt this
389	gpertea	59	//if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1);
390	gpertea	26	for (int l=nidx-1;l>=0;l--) {
391			GffLocus& loc=*(gdata->loci[l]);
392			if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
393			if (t->start>loc.end) {
394			if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
395			continue;
396			}
397	gpertea	59	if (loc.start>t->end) {
398			//this should never be the case if nidx was found correctly
399			GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
400			continue;
401			}
402			/*
403			if (debug) {
404			GMessage(" !range overlap found with locus ");
405			printLocus(&loc);
406			}
407			*/
408	gpertea	26	if (loc.add_RNA(t)) {
409			//will add this transcript to loc
410			lfound++;
411			mrgloci.Add(l);
412			if (collapseRedundant) {
413			//compare to every single transcript in this locus
414			for (int ti=0;ti<loc.rnas.Count();ti++) {
415			if (loc.rnas[ti]==t) continue;
416			GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
417			//GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
418			GffObj* container=NULL;
419			if (odata->replaced_by==NULL &&
420			(container=redundantTranscripts(t, (loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
421			if (container==t) {
422			odata->replaced_by=t;
423	gpertea	48	preserveContainedCDS(t, loc.rnas[ti]);
424	gpertea	26	}
425			else {
426			tdata->replaced_by=loc.rnas[ti];
427	gpertea	48	preserveContainedCDS(loc.rnas[ti], t);
428	gpertea	26	}
429			}
430			}//for each transcript in the exon-overlapping locus
431			} //if doCollapseRedundant
432			} //overlapping locus
433	gpertea	59	} //for each existing locus
434	gpertea	26	if (lfound==0) {
435			//overlapping loci not found, create a locus with only this mRNA
436	gpertea	59	/* if (debug) {
437			GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end);
438			}
439			*/
440	gpertea	26	int addidx=gdata->loci.Add(new GffLocus(t));
441			if (addidx<0) {
442	gpertea	59	//should never be the case!
443	gpertea	26	GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
444			}
445			}
446	gpertea	59	else { //found at least one overlapping locus
447			lfound--;
448			int locidx=mrgloci[lfound];
449			GffLocus& loc=*(gdata->loci[locidx]);
450			//last locus index found is also the smallest index
451			if (lfound>0) {
452			//more than one loci found parenting this mRNA, merge loci
453			/* if (debug)
454			GMessage(" merging %d loci \n",lfound);
455			*/
456	gpertea	26	for (int l=0;l<lfound;l++) {
457	gpertea	59	int mlidx=mrgloci[l];
458			loc.addMerge(*(gdata->loci[mlidx]), t);
459			gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
460	gpertea	26	}
461	gpertea	59	}
462			int i=locidx;
463			while (i>0 && loc.start<gdata->loci[i-1]->start) {
464			//bubble down until it's in the proper order
465			i--;
466			gdata->loci.Swap(i,i+1);
467			}
468			}//found at least one overlapping locus
469	gpertea	26	}
470
471			void collectLocusData(GList<GenomicSeqData>& ref_data) {
472			int locus_num=0;
473			for (int g=0;g<ref_data.Count();g++) {
474			GenomicSeqData* gdata=ref_data[g];
475			for (int l=0;l<gdata->loci.Count();l++) {
476			GffLocus& loc=*(gdata->loci[l]);
477			GHash<int> gnames(true); //gene names in this locus
478			GHash<int> geneids(true); //Entrez GeneID: numbers
479			for (int i=0;i<loc.rnas.Count();i++) {
480			GffObj& t=*(loc.rnas[i]);
481			GStr gname(t.getGeneName());
482			if (!gname.is_empty()) {
483			gname.upper();
484			int* prevg=gnames.Find(gname.chars());
485			if (prevg!=NULL) (*prevg)++;
486			else gnames.Add(gname, new int(1));
487			}
488			//parse GeneID xrefs, if any:
489			GStr xrefs(t.getAttr("xrefs"));
490			if (!xrefs.is_empty()) {
491			xrefs.startTokenize(",");
492			GStr token;
493			while (xrefs.nextToken(token)) {
494			token.upper();
495			if (token.startsWith("GENEID:")) {
496			token.cut(0,token.index(':')+1);
497			int* prevg=geneids.Find(token.chars());
498			if (prevg!=NULL) (*prevg)++;
499			else geneids.Add(token, new int(1));
500			}
501			} //for each xref
502			} //xrefs parsing
503			}//for each transcript
504			locus_num++;
505			loc.locus_num=locus_num;
506			if (gnames.Count()>0) { //collect all gene names associated to this locus
507			gnames.startIterate();
508			int* gfreq=NULL;
509			char* key=NULL;
510			while ((gfreq=gnames.NextData(key))!=NULL) {
511			loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
512			}
513			} //added collected gene_names
514			if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus
515			geneids.startIterate();
516			int* gfreq=NULL;
517			char* key=NULL;
518			while ((gfreq=geneids.NextData(key))!=NULL) {
519			loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
520			}
521			}
522			} //for each locus
523			}//for each genomic sequence
524			}
525
526
527			void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
528			bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
529			GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
530			gffr->showWarnings(this->showWarnings);
531			// keepAttrs mergeCloseExons noExonAttr
532			gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
533			//int redundant=0; //redundant annotation discarded
534			if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
535			//int rna_deleted=0;
536			//add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
537			for (int k=0;k<gffr->gflst.Count();k++) {
538			GffObj* m=gffr->gflst[k];
539			if (strcmp(m->getFeatureName(), "locus")==0 &&
540			m->getAttr("transcripts")!=NULL) {
541			continue;
542			}
543
544			char* rloc=m->getAttr("locus");
545			if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
546			m->removeAttr("locus", rloc);
547			}
548			if (m->exons.Count()==0 && m->children.Count()==0) {
549			//a non-mRNA feature with no subfeatures
550			//add a dummy exon just to have the generic exon checking work
551			m->addExon(m->start,m->end);
552			}
553			GList<GffObj> gfadd(false,false);
554			if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
555			continue;
556			}
557			m->isUsed(true); //so the gffreader won't destroy it
558			int i=-1;
559			GenomicSeqData f(m->gseq_id);
560			GenomicSeqData* gdata=NULL;
561
562			if (seqdata.Found(&f,i)) gdata=seqdata[i];
563			else { //entry not created yet for this genomic seq
564			gdata=new GenomicSeqData(m->gseq_id);
565			seqdata.Add(gdata);
566			}
567			for (int k=0;k<gfadd.Count();k++) {
568			placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
569			}
570			placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
571			} //for each read gffObj
572			//if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
573			if (f!=stdin) { fclose(f); f=NULL; }
574			delete gffr;
575			}