ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 17
Committed: Mon Jul 18 20:57:05 2011 UTC (13 years, 1 month ago) by gpertea
File size: 35262 byte(s)
Log Message:
sync with local source

Line User Rev File contents
1 gpertea 17 #include "gff_utils.h"
2 gpertea 6 #include "GArgs.h"
3     #include <ctype.h>
4 gpertea 17 // don't care about cdb compression
5     //#ifdef ENABLE_COMPRESSION
6     //#undef ENABLE_COMPRESSION
7     //#endif
8     //#include "GCdbYank.h"
9 gpertea 6
10     #define USAGE "Usage:\n\
11 gpertea 17 gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12     [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end>] \n\
13     [-CTVNJMKQAFGRUVBHZWTOLE] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tr_cds.fa>]\n\
14     [-i <maxintron>] \n\
15     Filters and/or converts GFF3/GTF2 records.\n\
16     <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17 gpertea 6 \n\
18     Options:\n\
19 gpertea 17 -g full path to a multi-fasta file with the genomic sequences\n\
20     for all input mappings, OR a directory with single-fasta files\n\
21     (one per genomic sequence, with file names matching sequence names)\n\
22     -s <seq_info.fsize> is a tab-delimited file providing this info\n\
23 gpertea 6 for each of the mapped sequences:\n\
24     <seq-name> <seq-length> <seq-description>\n\
25 gpertea 17 (useful for -A option with mRNA/EST/protein mappings)\n\
26 gpertea 6 -i discard transcripts having an intron larger than <maxintron>\n\
27     -r only show transcripts crossing coordinate range <start>..<end>\n\
28 gpertea 17 (on chromosome/contig <chr>, strand <strand> if provided)\n\
29 gpertea 6 -R for -r option, discard all transcripts that are not fully \n\
30     contained within given range\n\
31     -U discard single-exon transcripts\n\
32 gpertea 17 -C coding only: discard mRNAs that have no CDS feature\n\
33     -F full GFF attribute preservation (all attributes are shown)\n\
34 gpertea 6 -G only parse additional exon attributes from the first exon\n\
35     and move them to the mRNA level (useful for GTF input)\n\
36 gpertea 17 -A use the description field from <seq_info.fsize> and add it\n\
37     as the value for a 'descr' attribute to the GFF record\n\
38 gpertea 6 \n\
39 gpertea 17 -O process also non-transcript GFF records (by default non-transcript\n\
40     records are ignored)\n\
41 gpertea 6 -V discard any mRNAs with CDS having in-frame stop codons\n\
42     -H for -V option, check and adjust the starting CDS phase\n\
43     if the original phase leads to a translation with an \n\
44     in-frame stop codon\n\
45     -B for -V option, single-exon transcripts are also checked on the\n\
46     opposite strand\n\
47 gpertea 17 -N discard multi-exon mRNAs that have any intron with a non-canonical\n\
48     splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49     -J discard any mRNAs that either lack initial START codon\n\
50 gpertea 6 or the terminal STOP codon, or have an in-frame stop codon\n\
51     (only print mRNAs with a fulll, valid CDS)\n\
52     \n\
53 gpertea 17 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54     transcripts (those with the same exact introns and fully contained)\n\
55     --cluster-only: same as --merge but without collapsing matching transcripts\n\
56     -K for -M option: also collapse shorter, fully contained transcripts\n\
57     with fewer introns than the container\n\
58     -Q for -M option, remove the containment restriction:\n\
59     (multi-exon transcripts will be collapsed if just their introns match,\n\
60     while single-exon transcripts can partially overlap (80%))\n\
61     \n\
62     -E expose (warn about) duplicate transcript IDs and other potential \n\
63     problems with the given GFF/GTF records\n\
64     -Z merge close exons into a single exon (for intron size<4)\n\
65     -w write a fasta file with spliced exons for each GFF transcript\n\
66     -x write a fasta file with spliced CDS for each GFF transcript\n\
67     -W for -w and -x options, also write for each fasta record the exon\n\
68 gpertea 6 coordinates projected onto the spliced sequence\n\
69 gpertea 17 -y write a protein fasta file with the translation of CDS for each record\n\
70     -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
71     -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
72     this format:\n\
73     <original_ref_ID> <new_ref_ID>\n\
74     GFF records on reference sequences that are not found among the\n\
75     <original_ref_ID> entries in this file will be filtered out\n\
76     -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
77     (use -o- for printing to stdout)\n\
78     -t use <trackname> in the second column of each GFF output line\n\
79     -T -o option will output GTF format instead of GFF3\n\
80     "
81 gpertea 6
82 gpertea 17
83     class SeqInfo { //populated from the -s option of gffread
84     public:
85     int len;
86     char* descr;
87     SeqInfo( int l, char* s) {
88     len=l;
89     if (s==NULL) {
90     descr=NULL;
91     } else {
92     descr=Gstrdup(s);
93     }
94     }
95     ~SeqInfo() {
96     GFREE(descr);
97     }
98     };
99    
100     class RefTran {
101     public:
102     char* new_name;
103     RefTran(char *ns) {
104     new_name=NULL;
105     if (ns!=NULL)
106     new_name=Gstrdup(ns);
107     }
108     ~RefTran() {
109     GFREE(new_name);
110     }
111     };
112    
113 gpertea 6 FILE* ffasta=NULL;
114     FILE* f_in=NULL;
115     FILE* f_out=NULL;
116     FILE* f_w=NULL; //fasta with spliced exons (transcripts)
117     FILE* f_x=NULL; //fasta with spliced CDS
118     FILE* f_y=NULL; //fasta with translated CDS
119     bool wCDSonly=false;
120    
121     bool validCDSonly=false; // translation with no in-frame STOP
122     bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
123     bool altPhases=false; //if original phase fails translation validation,
124     //try the other 2 phases until one makes it
125     bool mRNAOnly=true;
126     bool spliceCheck=false; //only known splice-sites
127    
128     bool fullCDSonly=false; // starts with START, ends with STOP codon
129     bool fullattr=false;
130 gpertea 17 //bool sortByLoc=false; // if the GFF output should be sorted by location
131     bool ensembl_convert=false; //-L, assisst in converting Ensembl GTF to GFF3
132    
133    
134     //GStr gseqpath;
135     //GStr gcdbfa;
136     //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
137     //GFaSeqGet* faseq=NULL;
138     //GCdbYank* gcdb=NULL;
139     //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
140 gpertea 6 bool fmtGTF=false;
141     bool addDescr=false;
142     //bool protmap=false;
143     bool multiExon=false;
144     bool writeExonSegs=false;
145     char* tracklabel=NULL;
146     int maxintron=999000000;
147     bool mergeCloseExons=false;
148     //range filter:
149     char* rfltGSeq=NULL;
150 gpertea 17 char rfltStrand=0;
151 gpertea 6 uint rfltStart=0;
152     uint rfltEnd=MAX_UINT;
153     bool rfltWithin=false; //check for full containment within given range
154     bool noExonAttr=false;
155    
156 gpertea 17 bool doCluster=false;
157     bool doCollapseRedundant=false;
158 gpertea 6
159 gpertea 17 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
160 gpertea 6
161     //hash with sequence info
162     GHash<SeqInfo> seqinfo;
163     GHash<int> isoCounter; //counts the valid isoforms
164 gpertea 17 GHash<RefTran> reftbl;
165     GHash<GeneInfo> gene_ids;
166     //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
167 gpertea 6
168     bool debugMode=false;
169     bool verbose=false;
170    
171 gpertea 17 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
172     GLineReader fr(f);
173     while (!fr.isEof()) {
174     char* line=fr.getLine();
175     if (line==NULL) break;
176     char* id=line;
177     char* lenstr=NULL;
178     char* text=NULL;
179     char* p=line;
180     while (*p!=0 && !isspace(*p)) p++;
181     if (*p==0) continue;
182     *p=0;p++;
183     while (*p==' ' || *p=='\t') p++;
184     if (*p==0) continue;
185     lenstr=p;
186     while (*p!=0 && !isspace(*p)) p++;
187     if (*p!=0) { *p=0;p++; }
188     while (*p==' ' || *p=='\t') p++;
189     if (*p!=0) text=p; //else text remains NULL
190     int len=0;
191     if (!parseInt(lenstr,len)) {
192     GMessage("Warning: could not parse sequence length: %s %s\n",
193     id, lenstr);
194     continue;
195     }
196     // --- here we have finished parsing the line
197     si.Add(id, new SeqInfo(len,text));
198     } //while lines
199     }
200    
201     void loadRefTable(FILE* f, GHash<RefTran>& rt) {
202     GLineReader fr(f);
203     char* line=NULL;
204     while ((line=fr.getLine())) {
205     char* orig_id=line;
206     char* p=line;
207     while (*p!=0 && !isspace(*p)) p++;
208     if (*p==0) continue;
209     *p=0;p++;//split the line here
210     while (*p==' ' || *p=='\t') p++;
211     if (*p==0) continue;
212     rt.Add(orig_id, new RefTran(p));
213     } //while lines
214     }
215    
216 gpertea 6 char* getSeqDescr(char* seqid) {
217     static char charbuf[128];
218     if (seqinfo.Count()==0) return NULL;
219     char* suf=rstrchr(seqid, '.');
220     if (suf!=NULL) *suf=0;
221     SeqInfo* seqd=seqinfo.Find(seqid);
222     if (suf!=NULL) *suf='.';
223     if (seqd!=NULL) {
224     GStr s(seqd->descr);
225     //cleanup some Uniref gunk
226     if (s[0]=='[') {
227     int r=s.index(']');
228     if (r>=0 && r<8 && isdigit(s[1]))
229     s.remove(0,r+1);
230     }
231     if (s.length()>80) {
232     int r=s.index(';');
233     if (r>5) s.cut(r);
234     }
235     if (s.length()>127) {
236     s.cut(127);
237     int r=s.rindex(' ');
238     if (r>0) s.cut(r);
239     }
240     strcpy(charbuf, s.chars());
241     return charbuf;
242     }
243     else return NULL;
244     }
245    
246     char* getSeqName(char* seqid) {
247     static char charbuf[128];
248     char* suf=rstrchr(seqid, '.');
249     if (suf!=NULL) *suf=0;
250     strcpy(charbuf, seqid);
251     if (suf!=NULL) *suf='.';
252     return charbuf;
253     }
254    
255 gpertea 17 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
256     if (gfasta.fastaPath==NULL) return NULL;
257     return gfasta.fetch(gffrec.gseq_id);
258 gpertea 6 }
259    
260    
261 gpertea 17 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
262 gpertea 6 //adj>0 => extedn CDS, adj<0 => shrink CDS
263     //when CDS is expanded, exons have to be checked too and
264     // expanded accordingly if they had the same boundary
265 gpertea 17 int realadj=0;
266     if (gffrec.strand=='-') {
267     if ((int)gffrec.CDstart>adj) {
268    
269     gffrec.CDstart-=adj;
270     realadj=adj;
271     if (adj<0) { //restore
272     if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
273     gffrec.exons.First()->start-=adj;
274     gffrec.start=gffrec.exons.First()->start;
275     gffrec.covlen+=adj;
276 gpertea 6 }
277 gpertea 17 }
278     else if (gffrec.exons.First()->start>=gffrec.CDstart) {
279     gffrec.exons.First()->start-=adj;
280     gffrec.start=gffrec.exons.First()->start;
281     gffrec.covlen+=adj;
282     }
283 gpertea 6 }
284     }
285     else {
286 gpertea 17 realadj=adj;
287     gffrec.CDend+=adj;
288     if (adj<0) {//restore
289     if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
290     gffrec.exons.Last()->end+=adj;
291     gffrec.end=gffrec.exons.Last()->end;
292     gffrec.covlen+=adj;
293     }
294     }
295     else if (gffrec.exons.Last()->end<=gffrec.CDend) {
296     gffrec.exons.Last()->end+=adj;
297     gffrec.end=gffrec.exons.Last()->end;
298     gffrec.covlen+=adj;
299 gpertea 6 }
300     }
301     if (seglst!=NULL) seglst->Last()->end+=adj;
302 gpertea 17 return realadj;
303 gpertea 6 }
304    
305 gpertea 17 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
306     //returns true if the transcript passed the filter
307     char* gname=gffrec.getGeneName();
308     if (gname==NULL) gname=gffrec.getGeneID();
309     GStr defline(gffrec.getID());
310     if (f_out && !fmtGTF) {
311     const char* tn=NULL;
312     if ((tn=gffrec.getAttr("transcript_name"))!=NULL) {
313     gffrec.addAttr("Name", tn);
314     gffrec.removeAttr("transcript_name");
315     }
316     }
317     if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
318     const char* tn=gffrec.getTrackName();
319     gffrec.addAttr("type", tn);
320     //bool is_gene=false;
321     bool is_pseudo=false;
322     if (strcmp(tn, "protein_coding")==0 || gffrec.hasCDS())
323     gffrec.setFeatureName("mRNA");
324     else {
325     if (strcmp(tn, "processed_transcript")==0)
326     gffrec.setFeatureName("proc_RNA");
327     else {
328     //is_gene=endsWith(tn, "gene");
329     is_pseudo=strifind(tn, "pseudo");
330     if (is_pseudo) {
331     gffrec.setFeatureName("pseudo_RNA");
332     }
333     else if (endsWith(tn, "RNA")) {
334     gffrec.setFeatureName(tn);
335     } else gffrec.setFeatureName("misc_RNA");
336     }
337     }
338 gpertea 6 }
339 gpertea 17 if (gname && strcmp(gname, gffrec.getID())!=0) {
340     int* isonum=isoCounter.Find(gname);
341     if (isonum==NULL) {
342     isonum=new int(1);
343     isoCounter.Add(gname,isonum);
344     }
345     else (*isonum)++;
346     defline.appendfmt(" gene=%s", gname);
347     }
348 gpertea 6 int seqlen=0;
349 gpertea 17
350 gpertea 6 const char* tlabel=tracklabel;
351 gpertea 17 if (tlabel==NULL) tlabel=gffrec.getTrackName();
352 gpertea 6 //defline.appendfmt(" track:%s",tlabel);
353     char* cdsnt = NULL;
354     char* cdsaa = NULL;
355     int aalen=0;
356 gpertea 17 for (int i=1;i<gffrec.exons.Count();i++) {
357     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
358     if (ilen>4000000)
359 gpertea 6 GMessage("Warning: very large intron (%d) for transcript %s\n",
360 gpertea 17 ilen, gffrec.getID());
361 gpertea 6 if (ilen>maxintron) {
362 gpertea 17 return false;
363 gpertea 6 }
364     }
365     GList<GSeg> seglst(false,true);
366 gpertea 17 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
367     if (spliceCheck && gffrec.exons.Count()>1) {
368 gpertea 6 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
369     if (faseq==NULL) GError("Error: no genomic sequence available!\n");
370 gpertea 17 int glen=gffrec.end-gffrec.start+1;
371     const char* gseq=faseq->subseq(gffrec.start, glen);
372     bool revcompl=(gffrec.strand=='-');
373 gpertea 6 bool ssValid=true;
374 gpertea 17 for (int e=1;e<gffrec.exons.Count();e++) {
375     const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
376     int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
377 gpertea 6 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
378     GSpliceSite donorSite(intron,intronlen, false, revcompl);
379     //GMessage("%c intron %d-%d : %s .. %s\n",
380 gpertea 17 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
381 gpertea 6 if (acceptorSite=="AG") { // GT-AG or GC-AG
382     if (!donorSite.canonicalDonor()) {
383     ssValid=false;break;
384     }
385     }
386     else if (acceptorSite=="AC") { //
387     if (donorSite!="AT") { ssValid=false; break; }
388     }
389     else { ssValid=false; break; }
390     }
391     //GFREE(gseq);
392     if (!ssValid) {
393     if (verbose)
394 gpertea 17 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
395     return false; //don't print this one!
396 gpertea 6 }
397     }
398    
399     bool trprint=true;
400 gpertea 17 int stopCodonAdjust=0;
401 gpertea 6 int mCDphase=0;
402 gpertea 17 bool hasStop=false;
403     if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
404     mCDphase = gffrec.CDphase-'0';
405 gpertea 6 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
406     if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
407     //if (protmap && fullCDSonly) {
408 gpertea 17 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
409    
410 gpertea 6 if (validCDSonly) { //make sure the stop codon is always included
411 gpertea 17 //adjust_stopcodon(gffrec,3);
412     stopCodonAdjust=adjust_stopcodon(gffrec,3);
413 gpertea 6 }
414     int strandNum=0;
415     int phaseNum=0;
416     CDS_CHECK:
417 gpertea 17 cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
418 gpertea 6 if (cdsnt==NULL) trprint=false;
419     if (validCDSonly) {
420     cdsaa=translateDNA(cdsnt, aalen, seqlen);
421     char* p=strchr(cdsaa,'.');
422 gpertea 17 hasStop=false;
423 gpertea 6 if (p!=NULL) {
424     if (p-cdsaa>=aalen-2) { //stop found as the last codon
425     *p='0';//remove it
426     hasStop=true;
427     if (aalen-2==p-cdsaa) {
428     //previous to last codon is the stop codon
429     //so correct the CDS stop accordingly
430 gpertea 17 adjust_stopcodon(gffrec,-3, &seglst);
431     stopCodonAdjust=0; //clear artificial stop adjustment
432 gpertea 6 seqlen-=3;
433     cdsnt[seqlen]=0;
434     }
435     aalen=p-cdsaa;
436     }
437     else {//stop found before the last codon
438     trprint=false;
439     }
440     }//stop codon found
441     if (trprint==false) { //failed CDS validity check
442     //in-frame stop codon found
443     if (altPhases && phaseNum<3) {
444     phaseNum++;
445 gpertea 17 gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
446 gpertea 6 GFREE(cdsaa);
447     goto CDS_CHECK;
448     }
449 gpertea 17 if (gffrec.exons.Count()==1 && bothStrands) {
450 gpertea 6 strandNum++;
451     phaseNum=0;
452     if (strandNum<2) {
453     GFREE(cdsaa);
454 gpertea 17 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
455 gpertea 6 goto CDS_CHECK; //repeat the CDS check for a different frame
456     }
457     }
458 gpertea 17 if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
459 gpertea 6 } //has in-frame STOP
460     if (fullCDSonly) {
461     if (!hasStop || cdsaa[0]!='M') trprint=false;
462     }
463     } // CDS check requested
464     } //translation or codon check/output was requested
465     if (!trprint) {
466     GFREE(cdsnt);
467     GFREE(cdsaa);
468 gpertea 17 return false;
469 gpertea 6 }
470 gpertea 17 if (stopCodonAdjust>0 && !hasStop) {
471     //restore stop codon location
472     adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
473     if (cdsnt!=NULL && seqlen>0) {
474     seqlen-=stopCodonAdjust;
475     cdsnt[seqlen]=0;
476     }
477     if (cdsaa!=NULL) aalen--;
478     }
479    
480 gpertea 6 if (f_y!=NULL) { //CDS translation fasta output requested
481     //char*
482     if (cdsaa==NULL) { //translate now if not done before
483     cdsaa=translateDNA(cdsnt, aalen, seqlen);
484     }
485 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
486 gpertea 6 //append all attributes found for each transcripts
487 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
488 gpertea 6 defline.append(" ");
489 gpertea 17 defline.append(gffrec.getAttrName(i));
490 gpertea 6 defline.append("=");
491 gpertea 17 defline.append(gffrec.getAttrValue(i));
492 gpertea 6 }
493     }
494     printFasta(f_y, defline, cdsaa, aalen);
495     }
496     if (f_x!=NULL) { //CDS only
497     if (writeExonSegs) {
498     defline.append(" loc:");
499 gpertea 17 defline.append(gffrec.getGSeqName());
500     defline.appendfmt("(%c)",gffrec.strand);
501 gpertea 6 //warning: not CDS coordinates are written here, but the exon ones
502 gpertea 17 defline+=(int)gffrec.start;
503 gpertea 6 defline+=(char)'-';
504 gpertea 17 defline+=(int)gffrec.end;
505 gpertea 6 // -- here these are CDS substring coordinates on the spliced sequence:
506     defline.append(" segs:");
507     for (int i=0;i<seglst.Count();i++) {
508     if (i>0) defline.append(",");
509     defline+=(int)seglst[i]->start;
510     defline.append("-");
511     defline+=(int)seglst[i]->end;
512     }
513     }
514 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
515 gpertea 6 //append all attributes found for each transcript
516 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
517 gpertea 6 defline.append(" ");
518 gpertea 17 defline.append(gffrec.getAttrName(i));
519 gpertea 6 defline.append("=");
520 gpertea 17 defline.append(gffrec.getAttrValue(i));
521 gpertea 6 }
522     }
523     printFasta(f_x, defline, cdsnt, seqlen);
524     }
525     GFREE(cdsnt);
526     GFREE(cdsaa);
527     if (f_w!=NULL) { //write spliced exons
528     uint cds_start=0;
529     uint cds_end=0;
530     seglst.Clear();
531 gpertea 17 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
532 gpertea 6 if (exont!=NULL) {
533 gpertea 17 if (gffrec.CDstart>0) {
534 gpertea 6 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
535     }
536     if (writeExonSegs) {
537     defline.append(" loc:");
538 gpertea 17 defline.append(gffrec.getGSeqName());
539 gpertea 6 defline+=(char)'|';
540 gpertea 17 defline+=(int)gffrec.start;
541 gpertea 6 defline+=(char)'-';
542 gpertea 17 defline+=(int)gffrec.end;
543 gpertea 6 defline+=(char)'|';
544 gpertea 17 defline+=(char)gffrec.strand;
545 gpertea 6 defline.append(" exons:");
546 gpertea 17 for (int i=0;i<gffrec.exons.Count();i++) {
547 gpertea 6 if (i>0) defline.append(",");
548 gpertea 17 defline+=(int)gffrec.exons[i]->start;
549 gpertea 6 defline.append("-");
550 gpertea 17 defline+=(int)gffrec.exons[i]->end;
551 gpertea 6 }
552     defline.append(" segs:");
553     for (int i=0;i<seglst.Count();i++) {
554     if (i>0) defline.append(",");
555     defline+=(int)seglst[i]->start;
556     defline.append("-");
557     defline+=(int)seglst[i]->end;
558     }
559     }
560 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
561 gpertea 6 //append all attributes found for each transcripts
562 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
563 gpertea 6 defline.append(" ");
564 gpertea 17 defline.append(gffrec.getAttrName(i));
565 gpertea 6 defline.append("=");
566 gpertea 17 defline.append(gffrec.getAttrValue(i));
567 gpertea 6 }
568     }
569     printFasta(f_w, defline, exont, seqlen);
570     GFREE(exont);
571     }
572     } //writing f_w (spliced exons)
573 gpertea 17 return true;
574 gpertea 6 }
575    
576     void openfw(FILE* &f, GArgs& args, char opt) {
577     GStr s=args.getOpt(opt);
578     if (!s.is_empty()) {
579     if (s=='-')
580     f=stdout;
581     else {
582     f=fopen(s,"w");
583     if (f==NULL) GError("Error creating file: %s\n", s.chars());
584     }
585     }
586     }
587    
588     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
589     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
590    
591 gpertea 17 void printGff3Header(FILE* f, GArgs& args) {
592     fprintf(f, "# ");
593     args.printCmdLine(f);
594     fprintf(f, "##gff-version 3\n");
595     //for (int i=0;i<gseqdata.Count();i++) {
596     //
597     //}
598     }
599    
600     bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
601     if (reftbl.Count()>0) {
602     GStr refname(gffrec->getRefName());
603     RefTran* rt=reftbl.Find(refname.chars());
604     if (rt==NULL && refname[-2]=='.' && isdigit(refname[-1])) {
605     //try removing the version
606     refname.cut(-2);
607     //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
608     rt=reftbl.Find(refname.chars());
609     }
610     if (rt) {
611     gffrec->setRefName(rt->new_name);
612     }
613     else return false; //discard, ref seq not in the given translation table
614     }
615     if (mRNAOnly && gffrec->isDiscarded()) {
616     //discard generic "gene" or "locus" features with no other detailed subfeatures
617     //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
618     return false;
619     }
620     /*
621     if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
622     //a non-mRNA feature with no subfeatures
623     //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
624     //--this could be a single "pseudogene" entry or another genomic region without exons
625     //
626     gffrec->addExon(gffrec->start,gffrec->end);
627     }
628     */
629     if (rfltGSeq!=NULL) { //filter by gseqName
630     if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
631     return false;
632     }
633     }
634     if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
635     return false;
636     }
637     //check coordinates
638     if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
639     if (rfltWithin) {
640     if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
641     return false;
642     }
643     }
644     else {
645     if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
646     return false;
647     }
648     }
649     }
650     if (multiExon && gffrec->exons.Count()<=1) {
651     return false;
652     }
653     if (wCDSonly && gffrec->CDstart==0) {
654     return false;
655     }
656     if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
657     //keep track of chr|gene_id data -- coordinate range
658     char* geneid=gffrec->getGeneID();
659     if (geneid!=NULL) {
660     GeneInfo* ginfo=gene_ids.Find(geneid);
661     if (ginfo==NULL) {//first time seeing this gene ID
662     GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
663     gene_ids.Add(geneid, geneinfo);
664     if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
665     }
666     else ginfo->update(gffrec);
667     }
668     }
669     return true;
670     }
671    
672    
673 gpertea 6 int main(int argc, char * const argv[]) {
674 gpertea 17 GArgs args(argc, argv,
675     "debug;merge;cluster-only;help;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
676     args.printError(USAGE, true);
677     if (args.getOpt('h') || args.getOpt("help")) {
678     GMessage("%s",USAGE);
679     exit(1);
680     }
681     debugMode=(args.getOpt("debug")!=NULL);
682 gpertea 6 mRNAOnly=(args.getOpt('O')==NULL);
683 gpertea 17 //sortByLoc=(args.getOpt('S')!=NULL);
684 gpertea 6 addDescr=(args.getOpt('A')!=NULL);
685     verbose=(args.getOpt('v')!=NULL);
686     wCDSonly=(args.getOpt('C')!=NULL);
687     validCDSonly=(args.getOpt('V')!=NULL);
688     altPhases=(args.getOpt('H')!=NULL);
689     fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
690     bothStrands=(args.getOpt('B')!=NULL);
691 gpertea 17 fullCDSonly=(args.getOpt('J')!=NULL);
692 gpertea 6 spliceCheck=(args.getOpt('N')!=NULL);
693 gpertea 17 bool matchAllIntrons=(args.getOpt('K')==NULL);
694     bool fuzzSpan=(args.getOpt('Q')!=NULL);
695     if (args.getOpt('M') || args.getOpt("merge")) {
696     doCluster=true;
697     doCollapseRedundant=true;
698     }
699     else {
700     if (!matchAllIntrons || fuzzSpan) {
701     GMessage("%s",USAGE);
702     GMessage("Error: -K or -Q options require -M/--merge option!\n");
703     exit(1);
704     }
705     }
706     if (args.getOpt("cluster-only")) {
707     doCluster=true;
708     doCollapseRedundant=false;
709     if (!matchAllIntrons || fuzzSpan) {
710     GMessage("%s",USAGE);
711     GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
712     exit(1);
713     }
714     }
715 gpertea 6 //protmap=(args.getOpt('P')!=NULL);
716     if (fullCDSonly) validCDSonly=true;
717 gpertea 17 if (verbose) {
718     fprintf(stderr, "Command line was:\n");
719     args.printCmdLine(stderr);
720     }
721    
722 gpertea 6 fullattr=(args.getOpt('F')!=NULL);
723     if (args.getOpt('G')==NULL)
724     noExonAttr=!fullattr;
725     else {
726     noExonAttr=true;
727     fullattr=true;
728     }
729 gpertea 17 ensembl_convert=(args.getOpt('L')!=NULL);
730     if (ensembl_convert) {
731     fullattr=true;
732     noExonAttr=false;
733     //sortByLoc=true;
734     }
735    
736 gpertea 6 mergeCloseExons=(args.getOpt('Z')!=NULL);
737     multiExon=(args.getOpt('U')!=NULL);
738     writeExonSegs=(args.getOpt('W')!=NULL);
739     tracklabel=args.getOpt('t');
740 gpertea 17 GFastaDb gfasta(args.getOpt('g'));
741     //if (gfasta.fastaPath!=NULL)
742     // sortByLoc=true; //enforce sorting by chromosome/contig
743 gpertea 6 GStr s=args.getOpt('i');
744     if (!s.is_empty()) maxintron=s.asInt();
745 gpertea 17
746     FILE* f_repl=NULL;
747     s=args.getOpt('d');
748     if (!s.is_empty()) {
749     if (s=="-") f_repl=stdout;
750     else {
751     f_repl=fopen(s.chars(), "w");
752     if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
753     }
754     }
755    
756 gpertea 6 rfltWithin=(args.getOpt('R')!=NULL);
757     s=args.getOpt('r');
758     if (!s.is_empty()) {
759     s.trim();
760 gpertea 17 if (s[0]=='+' || s[0]=='-') {
761     rfltStrand=s[0];
762     s.cut(0,1);
763     }
764 gpertea 6 int isep=s.index(':');
765 gpertea 17 if (isep>0) { //gseq name given
766     if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
767     isep--;
768     rfltStrand=s[isep];
769     s.cut(isep,1);
770     }
771     if (isep>0)
772     rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
773 gpertea 6 s.cut(0,isep+1);
774     }
775     GStr gsend;
776 gpertea 17 char slast=s[s.length()-1];
777     if (rfltStrand==0 && (slast=='+' || slast=='-')) {
778     s.chomp(slast);
779     rfltStrand=slast;
780     }
781 gpertea 6 if (s.index("..")>=0) gsend=s.split("..");
782 gpertea 17 else gsend=s.split('-');
783 gpertea 6 if (!s.is_empty()) rfltStart=(uint)s.asInt();
784 gpertea 17 if (!gsend.is_empty()) {
785     rfltEnd=(uint)gsend.asInt();
786     if (rfltEnd==0) rfltEnd=MAX_UINT;
787     }
788    
789 gpertea 6 } //gseq/range filtering
790     else {
791     if (rfltWithin)
792     GError("Error: option -R doesn't make sense without -r!\n");
793     }
794 gpertea 17 s=args.getOpt('m');
795     if (!s.is_empty()) {
796     FILE* ft=fopen(s,"r");
797     if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
798     loadRefTable(ft, reftbl);
799     fclose(ft);
800     }
801 gpertea 6 s=args.getOpt('s');
802     if (!s.is_empty()) {
803 gpertea 17 FILE* fsize=fopen(s,"r");
804     if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
805     loadSeqInfo(fsize, seqinfo);
806     fclose(fsize);
807     }
808    
809 gpertea 6 openfw(f_out, args, 'o');
810     //if (f_out==NULL) f_out=stdout;
811 gpertea 17 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
812 gpertea 6 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
813    
814     openfw(f_w, args, 'w');
815     openfw(f_x, args, 'x');
816     openfw(f_y, args, 'y');
817     if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
818     //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
819 gpertea 17
820     int numfiles = args.startNonOpt();
821     //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
822     int out_counter=0; //number of records printed
823     while (true) {
824     GStr infile;
825     if (numfiles) {
826     infile=args.nextNonOpt();
827     if (infile.is_empty()) break;
828     if (infile=="-") { f_in=stdin; infile="stdin"; }
829     else
830     if ((f_in=fopen(infile, "r"))==NULL)
831     GError("Error: cannot open input file %s!\n",infile.chars());
832 gpertea 6 }
833 gpertea 17 else
834     infile="-";
835     GffLoader gffloader(infile.chars());
836     gffloader.transcriptsOnly=mRNAOnly;
837     gffloader.fullAttributes=fullattr;
838     gffloader.noExonAttrs=noExonAttr;
839     gffloader.mergeCloseExons=mergeCloseExons;
840     gffloader.showWarnings=(args.getOpt('E')!=NULL);
841     gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
842     matchAllIntrons, fuzzSpan);
843     if (doCluster)
844     collectLocusData(g_data);
845     if (numfiles==0) break;
846     }
847    
848     GStr loctrack("gffcl");
849     if (tracklabel) loctrack=tracklabel;
850     g_data.setSorted(&gseqCmpName);
851     if (doCluster) {
852     //grouped in loci
853     for (int g=0;g<g_data.Count();g++) {
854     GenomicSeqData* gdata=g_data[g];
855     for (int l=0;l<gdata->loci.Count();l++) {
856     GffLocus& loc=*(gdata->loci[l]);
857     //check all non-replaced transcripts in this locus:
858     int numvalid=0;
859     int idxfirstvalid=-1;
860     for (int i=0;i<loc.rnas.Count();i++) {
861     GffObj& t=*(loc.rnas[i]);
862     GTData* tdata=(GTData*)(t.uptr);
863     if (tdata->replaced_by!=NULL) {
864     if (f_repl && (t.udata & 8)==0) {
865     //t.udata|=8;
866     fprintf(f_repl, "%s", t.getID());
867     GTData* rby=tdata;
868     while (rby->replaced_by!=NULL) {
869     fprintf(f_repl," => %s", rby->replaced_by->getID());
870     rby->rna->udata|=8;
871     rby=(GTData*)(rby->replaced_by->uptr);
872     }
873     fprintf(f_repl, "\n");
874     }
875     continue;
876     }
877     if (process_transcript(gfasta, t)) {
878     t.udata|=4; //tag it as valid
879     numvalid++;
880     if (idxfirstvalid<0) idxfirstvalid=i;
881     }
882 gpertea 6 }
883 gpertea 17
884     if (f_out && numvalid>0) {
885     GStr locname("RLOC_");
886     locname.appendfmt("%08d",loc.locus_num);
887     if (!fmtGTF) {
888     if (out_counter==0)
889     printGff3Header(f_out, args);
890     fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
891     loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
892     locname.chars(), locname.chars());
893     //const char* loc_gname=loc.getGeneName();
894     if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
895     fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
896     for (int i=1;i<loc.gene_names.Count();i++) {
897     fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
898     }
899     }
900     if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
901     fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
902     for (int i=1;i<loc.gene_ids.Count();i++) {
903     fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
904     }
905     }
906     fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
907     for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
908     fprintf(f_out, ",%s",loc.rnas[i]->getID());
909     }
910     fprintf(f_out, "\n");
911     }
912     //now print all valid, non-replaced transcripts in this locus:
913     for (int i=0;i<loc.rnas.Count();i++) {
914     GffObj& t=*(loc.rnas[i]);
915     GTData* tdata=(GTData*)(t.uptr);
916     if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
917     t.addAttr("locus", locname.chars());
918     out_counter++;
919     if (fmtGTF) t.printGtf(f_out, tracklabel);
920     else {
921     //print the parent first, if any
922     if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
923     GTData* pdata=(GTData*)(t.parent->uptr);
924     if (pdata->geneinfo!=NULL)
925     pdata->geneinfo->finalize();
926     t.parent->addAttr("locus", locname.chars());
927     t.parent->printGff(f_out, tracklabel);
928     t.parent->udata|=4;
929     }
930     t.printGff(f_out, tracklabel);
931     }
932     }
933     } //have valid transcripts to print
934     }//for each locus
935     if (f_out && !mRNAOnly) {
936     //final pass through the non-transcripts, in case any of them were not printed
937     //TODO: order broken, these should be interspersed among the rnas in the correct order!
938     for (int m=0;m<gdata->gfs.Count();m++) {
939     GffObj& t=*(gdata->gfs[m]);
940     if ((t.udata&4)==0) { //never printed
941     t.udata|=4;
942     if (fmtGTF) t.printGtf(f_out, tracklabel);
943     else t.printGff(f_out, tracklabel);
944     }
945     } //for each non-transcript
946 gpertea 6 }
947 gpertea 17 } //for each genomic sequence
948     }
949     else {
950     //not grouped into loci, print the rnas with their parents, if any
951     int numvalid=0;
952     for (int g=0;g<g_data.Count();g++) {
953     GenomicSeqData* gdata=g_data[g];
954     for (int m=0;m<gdata->rnas.Count();m++) {
955     GffObj& t=*(gdata->rnas[m]);
956     GTData* tdata=(GTData*)(t.uptr);
957     if (tdata->replaced_by!=NULL) continue;
958     if (process_transcript(gfasta, t)) {
959     t.udata|=4; //tag it as valid
960     numvalid++;
961     if (f_out) {
962     if (tdata->geneinfo) tdata->geneinfo->finalize();
963     out_counter++;
964     if (fmtGTF) t.printGtf(f_out, tracklabel);
965     else {
966     if (out_counter==1)
967     printGff3Header(f_out, args);
968     //print the parent first, if any
969     if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
970     GTData* pdata=(GTData*)(t.parent->uptr);
971     if (pdata->geneinfo!=NULL)
972     pdata->geneinfo->finalize();
973     t.parent->printGff(f_out, tracklabel);
974     t.parent->udata|=4;
975     }
976     t.printGff(f_out, tracklabel);
977     }
978     }//GFF/GTF output requested
979     } //valid transcript
980     } //for each rna
981     if (f_out && !mRNAOnly) {
982     //final pass through the non-transcripts, in case any of them were not printed
983     //TODO: order broken, these should be interspersed among the rnas in the correct order!
984     for (int m=0;m<gdata->gfs.Count();m++) {
985     GffObj& t=*(gdata->gfs[m]);
986     if ((t.udata&4)==0) { //never printed
987     t.udata|=4;
988     if (fmtGTF) t.printGtf(f_out, tracklabel);
989     else t.printGff(f_out, tracklabel);
990     }
991     } //for each non-transcript
992 gpertea 6 }
993 gpertea 17 } //for each genomic seq
994 gpertea 6 }
995 gpertea 17 if (f_repl && f_repl!=stdout) fclose(f_repl);
996 gpertea 6 seqinfo.Clear();
997 gpertea 17 //if (faseq!=NULL) delete faseq;
998     //if (gcdb!=NULL) delete gcdb;
999 gpertea 6 GFREE(rfltGSeq);
1000     FRCLOSE(f_in);
1001     FWCLOSE(f_out);
1002     FWCLOSE(f_w);
1003     FWCLOSE(f_x);
1004     FWCLOSE(f_y);
1005     }