ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 116
Committed: Mon Nov 7 21:25:56 2011 UTC (12 years, 11 months ago) by gpertea
File size: 35749 byte(s)
Log Message:
Ensembl conversion corrections

Line File contents
1 #include "gff_utils.h"
2 #include "GArgs.h"
3 #include <ctype.h>
4 // don't care about cdb compression
5 //#ifdef ENABLE_COMPRESSION
6 //#undef ENABLE_COMPRESSION
7 //#endif
8 //#include "GCdbYank.h"
9
10 #define USAGE "Usage:\n\
11 gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12 [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
13 [-CTVNJMKQAFGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
14 [-i <maxintron>] \n\
15 Filters and/or converts GFF3/GTF2 records.\n\
16 <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17 \n\
18 Options:\n\
19 -g full path to a multi-fasta file with the genomic sequences\n\
20 for all input mappings, OR a directory with single-fasta files\n\
21 (one per genomic sequence, with file names matching sequence names)\n\
22 -s <seq_info.fsize> is a tab-delimited file providing this info\n\
23 for each of the mapped sequences:\n\
24 <seq-name> <seq-length> <seq-description>\n\
25 (useful for -A option with mRNA/EST/protein mappings)\n\
26 -i discard transcripts having an intron larger than <maxintron>\n\
27 -r only show transcripts overlapping coordinate range <start>..<end>\n\
28 (on chromosome/contig <chr>, strand <strand> if provided)\n\
29 -R for -r option, discard all transcripts that are not fully \n\
30 contained within the given range\n\
31 -U discard single-exon transcripts\n\
32 -C coding only: discard mRNAs that have no CDS feature\n\
33 -F full GFF attribute preservation (all attributes are shown)\n\
34 -G only parse additional exon attributes from the first exon\n\
35 and move them to the mRNA level (useful for GTF input)\n\
36 -A use the description field from <seq_info.fsize> and add it\n\
37 as the value for a 'descr' attribute to the GFF record\n\
38 \n\
39 -O process also non-transcript GFF records (by default non-transcript\n\
40 records are ignored)\n\
41 -V discard any mRNAs with CDS having in-frame stop codons\n\
42 -H for -V option, check and adjust the starting CDS phase\n\
43 if the original phase leads to a translation with an \n\
44 in-frame stop codon\n\
45 -B for -V option, single-exon transcripts are also checked on the\n\
46 opposite strand\n\
47 -N discard multi-exon mRNAs that have any intron with a non-canonical\n\
48 splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49 -J discard any mRNAs that either lack initial START codon\n\
50 or the terminal STOP codon, or have an in-frame stop codon\n\
51 (only print mRNAs with a fulll, valid CDS)\n\
52 \n\
53 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54 transcripts (those with the same exact introns and fully contained)\n\
55 -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n\
56 --cluster-only: same as --merge but without collapsing matching transcripts\n\
57 -K for -M option: also collapse shorter, fully contained transcripts\n\
58 with fewer introns than the container\n\
59 -Q for -M option, remove the containment restriction:\n\
60 (multi-exon transcripts will be collapsed if just their introns match,\n\
61 while single-exon transcripts can partially overlap (80%))\n\
62 \n\
63 -E expose (warn about) duplicate transcript IDs and other potential \n\
64 problems with the given GFF/GTF records\n\
65 -Z merge close exons into a single exon (for intron size<4)\n\
66 -w write a fasta file with spliced exons for each GFF transcript\n\
67 -x write a fasta file with spliced CDS for each GFF transcript\n\
68 -W for -w and -x options, also write for each fasta record the exon\n\
69 coordinates projected onto the spliced sequence\n\
70 -y write a protein fasta file with the translation of CDS for each record\n\
71 -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
72 -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
73 this format:\n\
74 <original_ref_ID> <new_ref_ID>\n\
75 GFF records on reference sequences that are not found among the\n\
76 <original_ref_ID> entries in this file will be filtered out\n\
77 -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
78 (use -o- for printing to stdout)\n\
79 -t use <trackname> in the second column of each GFF output line\n\
80 -T -o option will output GTF format instead of GFF3\n\
81 "
82
83
84 class SeqInfo { //populated from the -s option of gffread
85 public:
86 int len;
87 char* descr;
88 SeqInfo( int l, char* s) {
89 len=l;
90 if (s==NULL) {
91 descr=NULL;
92 } else {
93 descr=Gstrdup(s);
94 }
95 }
96 ~SeqInfo() {
97 GFREE(descr);
98 }
99 };
100
101 class RefTran {
102 public:
103 char* new_name;
104 RefTran(char *ns) {
105 new_name=NULL;
106 if (ns!=NULL)
107 new_name=Gstrdup(ns);
108 }
109 ~RefTran() {
110 GFREE(new_name);
111 }
112 };
113
114 FILE* ffasta=NULL;
115 FILE* f_in=NULL;
116 FILE* f_out=NULL;
117 FILE* f_w=NULL; //fasta with spliced exons (transcripts)
118 FILE* f_x=NULL; //fasta with spliced CDS
119 FILE* f_y=NULL; //fasta with translated CDS
120 bool wCDSonly=false;
121
122 bool validCDSonly=false; // translation with no in-frame STOP
123 bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
124 bool altPhases=false; //if original phase fails translation validation,
125 //try the other 2 phases until one makes it
126 bool mRNAOnly=true;
127 bool spliceCheck=false; //only known splice-sites
128
129 bool fullCDSonly=false; // starts with START, ends with STOP codon
130 bool fullattr=false;
131 //bool sortByLoc=false; // if the GFF output should be sorted by location
132 bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3
133
134
135 //GStr gseqpath;
136 //GStr gcdbfa;
137 //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
138 //GFaSeqGet* faseq=NULL;
139 //GCdbYank* gcdb=NULL;
140 //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
141 bool fmtGTF=false;
142 bool addDescr=false;
143 //bool protmap=false;
144 bool multiExon=false;
145 bool writeExonSegs=false;
146 char* tracklabel=NULL;
147 int maxintron=999000000;
148 bool mergeCloseExons=false;
149 //range filter:
150 char* rfltGSeq=NULL;
151 char rfltStrand=0;
152 uint rfltStart=0;
153 uint rfltEnd=MAX_UINT;
154 bool rfltWithin=false; //check for full containment within given range
155 bool noExonAttr=false;
156
157 bool doCluster=false;
158 bool doCollapseRedundant=false;
159
160 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
161
162 //hash with sequence info
163 GHash<SeqInfo> seqinfo;
164 GHash<int> isoCounter; //counts the valid isoforms
165 GHash<RefTran> reftbl;
166 GHash<GeneInfo> gene_ids;
167 //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
168
169 bool debugMode=false;
170 bool verbose=false;
171
172 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
173 GLineReader fr(f);
174 while (!fr.isEof()) {
175 char* line=fr.getLine();
176 if (line==NULL) break;
177 char* id=line;
178 char* lenstr=NULL;
179 char* text=NULL;
180 char* p=line;
181 while (*p!=0 && !isspace(*p)) p++;
182 if (*p==0) continue;
183 *p=0;p++;
184 while (*p==' ' || *p=='\t') p++;
185 if (*p==0) continue;
186 lenstr=p;
187 while (*p!=0 && !isspace(*p)) p++;
188 if (*p!=0) { *p=0;p++; }
189 while (*p==' ' || *p=='\t') p++;
190 if (*p!=0) text=p; //else text remains NULL
191 int len=0;
192 if (!parseInt(lenstr,len)) {
193 GMessage("Warning: could not parse sequence length: %s %s\n",
194 id, lenstr);
195 continue;
196 }
197 // --- here we have finished parsing the line
198 si.Add(id, new SeqInfo(len,text));
199 } //while lines
200 }
201
202 void loadRefTable(FILE* f, GHash<RefTran>& rt) {
203 GLineReader fr(f);
204 char* line=NULL;
205 while ((line=fr.getLine())) {
206 char* orig_id=line;
207 char* p=line;
208 while (*p!=0 && !isspace(*p)) p++;
209 if (*p==0) continue;
210 *p=0;p++;//split the line here
211 while (*p==' ' || *p=='\t') p++;
212 if (*p==0) continue;
213 rt.Add(orig_id, new RefTran(p));
214 } //while lines
215 }
216
217 char* getSeqDescr(char* seqid) {
218 static char charbuf[128];
219 if (seqinfo.Count()==0) return NULL;
220 char* suf=rstrchr(seqid, '.');
221 if (suf!=NULL) *suf=0;
222 SeqInfo* seqd=seqinfo.Find(seqid);
223 if (suf!=NULL) *suf='.';
224 if (seqd!=NULL) {
225 GStr s(seqd->descr);
226 //cleanup some Uniref gunk
227 if (s[0]=='[') {
228 int r=s.index(']');
229 if (r>=0 && r<8 && isdigit(s[1]))
230 s.remove(0,r+1);
231 }
232 if (s.length()>80) {
233 int r=s.index(';');
234 if (r>5) s.cut(r);
235 }
236 if (s.length()>127) {
237 s.cut(127);
238 int r=s.rindex(' ');
239 if (r>0) s.cut(r);
240 }
241 strcpy(charbuf, s.chars());
242 return charbuf;
243 }
244 else return NULL;
245 }
246
247 char* getSeqName(char* seqid) {
248 static char charbuf[128];
249 char* suf=rstrchr(seqid, '.');
250 if (suf!=NULL) *suf=0;
251 strcpy(charbuf, seqid);
252 if (suf!=NULL) *suf='.';
253 return charbuf;
254 }
255
256 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
257 if (gfasta.fastaPath==NULL) return NULL;
258 return gfasta.fetch(gffrec.gseq_id);
259 }
260
261
262 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
263 //adj>0 => extedn CDS, adj<0 => shrink CDS
264 //when CDS is expanded, exons have to be checked too and
265 // expanded accordingly if they had the same boundary
266 int realadj=0;
267 if (gffrec.strand=='-') {
268 if ((int)gffrec.CDstart>adj) {
269
270 gffrec.CDstart-=adj;
271 realadj=adj;
272 if (adj<0) { //restore
273 if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
274 gffrec.exons.First()->start-=adj;
275 gffrec.start=gffrec.exons.First()->start;
276 gffrec.covlen+=adj;
277 }
278 }
279 else if (gffrec.exons.First()->start>=gffrec.CDstart) {
280 gffrec.exons.First()->start-=adj;
281 gffrec.start=gffrec.exons.First()->start;
282 gffrec.covlen+=adj;
283 }
284 }
285 }
286 else {
287 realadj=adj;
288 gffrec.CDend+=adj;
289 if (adj<0) {//restore
290 if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
291 gffrec.exons.Last()->end+=adj;
292 gffrec.end=gffrec.exons.Last()->end;
293 gffrec.covlen+=adj;
294 }
295 }
296 else if (gffrec.exons.Last()->end<=gffrec.CDend) {
297 gffrec.exons.Last()->end+=adj;
298 gffrec.end=gffrec.exons.Last()->end;
299 gffrec.covlen+=adj;
300 }
301 }
302 if (seglst!=NULL) seglst->Last()->end+=adj;
303 return realadj;
304 }
305
306 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
307 //returns true if the transcript passed the filter
308 char* gname=gffrec.getGeneName();
309 if (gname==NULL) gname=gffrec.getGeneID();
310 GStr defline(gffrec.getID());
311 if (f_out && !fmtGTF) {
312 const char* tname=NULL;
313 if ((tname=gffrec.getAttr("transcript_name"))!=NULL) {
314 gffrec.addAttr("Name", tname);
315 gffrec.removeAttr("transcript_name");
316 }
317 }
318 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
319 const char* biotype=gffrec.getAttr("gene_biotype");
320 if (biotype) {
321 gffrec.addAttr("type", biotype);
322 gffrec.removeAttr("gene_biotype");
323 }
324 else { //old Ensembl files lacking gene_biotype
325 gffrec.addAttr("type", gffrec.getTrackName());
326 }
327
328 //bool is_gene=false;
329 bool is_pseudo=false;
330 if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
331 gffrec.setFeatureName("mRNA");
332 else {
333 if (strcmp(biotype, "processed_transcript")==0)
334 gffrec.setFeatureName("proc_RNA");
335 else {
336 //is_gene=endsWith(biotype, "gene");
337 is_pseudo=strifind(biotype, "pseudo");
338 if (is_pseudo) {
339 gffrec.setFeatureName("pseudo_RNA");
340 }
341 else if (endsWith(biotype, "RNA")) {
342 gffrec.setFeatureName(biotype);
343 } else gffrec.setFeatureName("misc_RNA");
344 }
345 }
346 }
347 if (gname && strcmp(gname, gffrec.getID())!=0) {
348 int* isonum=isoCounter.Find(gname);
349 if (isonum==NULL) {
350 isonum=new int(1);
351 isoCounter.Add(gname,isonum);
352 }
353 else (*isonum)++;
354 defline.appendfmt(" gene=%s", gname);
355 }
356 int seqlen=0;
357
358 const char* tlabel=tracklabel;
359 if (tlabel==NULL) tlabel=gffrec.getTrackName();
360 //defline.appendfmt(" track:%s",tlabel);
361 char* cdsnt = NULL;
362 char* cdsaa = NULL;
363 int aalen=0;
364 for (int i=1;i<gffrec.exons.Count();i++) {
365 int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
366 if (ilen>4000000)
367 GMessage("Warning: very large intron (%d) for transcript %s\n",
368 ilen, gffrec.getID());
369 if (ilen>maxintron) {
370 return false;
371 }
372 }
373 GList<GSeg> seglst(false,true);
374 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
375 if (spliceCheck && gffrec.exons.Count()>1) {
376 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
377 if (faseq==NULL) GError("Error: no genomic sequence available!\n");
378 int glen=gffrec.end-gffrec.start+1;
379 const char* gseq=faseq->subseq(gffrec.start, glen);
380 bool revcompl=(gffrec.strand=='-');
381 bool ssValid=true;
382 for (int e=1;e<gffrec.exons.Count();e++) {
383 const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
384 int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
385 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
386 GSpliceSite donorSite(intron,intronlen, false, revcompl);
387 //GMessage("%c intron %d-%d : %s .. %s\n",
388 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
389 if (acceptorSite=="AG") { // GT-AG or GC-AG
390 if (!donorSite.canonicalDonor()) {
391 ssValid=false;break;
392 }
393 }
394 else if (acceptorSite=="AC") { //
395 if (donorSite!="AT") { ssValid=false; break; }
396 }
397 else { ssValid=false; break; }
398 }
399 //GFREE(gseq);
400 if (!ssValid) {
401 if (verbose)
402 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
403 return false; //don't print this one!
404 }
405 }
406
407 bool trprint=true;
408 int stopCodonAdjust=0;
409 int mCDphase=0;
410 bool hasStop=false;
411 if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
412 mCDphase = gffrec.CDphase-'0';
413 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
414 if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
415 //if (protmap && fullCDSonly) {
416 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
417
418 if (validCDSonly) { //make sure the stop codon is always included
419 //adjust_stopcodon(gffrec,3);
420 stopCodonAdjust=adjust_stopcodon(gffrec,3);
421 }
422 int strandNum=0;
423 int phaseNum=0;
424 CDS_CHECK:
425 cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
426 if (cdsnt==NULL) trprint=false;
427 if (validCDSonly) {
428 cdsaa=translateDNA(cdsnt, aalen, seqlen);
429 char* p=strchr(cdsaa,'.');
430 hasStop=false;
431 if (p!=NULL) {
432 if (p-cdsaa>=aalen-2) { //stop found as the last codon
433 *p='0';//remove it
434 hasStop=true;
435 if (aalen-2==p-cdsaa) {
436 //previous to last codon is the stop codon
437 //so correct the CDS stop accordingly
438 adjust_stopcodon(gffrec,-3, &seglst);
439 stopCodonAdjust=0; //clear artificial stop adjustment
440 seqlen-=3;
441 cdsnt[seqlen]=0;
442 }
443 aalen=p-cdsaa;
444 }
445 else {//stop found before the last codon
446 trprint=false;
447 }
448 }//stop codon found
449 if (trprint==false) { //failed CDS validity check
450 //in-frame stop codon found
451 if (altPhases && phaseNum<3) {
452 phaseNum++;
453 gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
454 GFREE(cdsaa);
455 goto CDS_CHECK;
456 }
457 if (gffrec.exons.Count()==1 && bothStrands) {
458 strandNum++;
459 phaseNum=0;
460 if (strandNum<2) {
461 GFREE(cdsaa);
462 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
463 goto CDS_CHECK; //repeat the CDS check for a different frame
464 }
465 }
466 if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
467 } //has in-frame STOP
468 if (fullCDSonly) {
469 if (!hasStop || cdsaa[0]!='M') trprint=false;
470 }
471 } // CDS check requested
472 } //translation or codon check/output was requested
473 if (!trprint) {
474 GFREE(cdsnt);
475 GFREE(cdsaa);
476 return false;
477 }
478 if (stopCodonAdjust>0 && !hasStop) {
479 //restore stop codon location
480 adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
481 if (cdsnt!=NULL && seqlen>0) {
482 seqlen-=stopCodonAdjust;
483 cdsnt[seqlen]=0;
484 }
485 if (cdsaa!=NULL) aalen--;
486 }
487
488 if (f_y!=NULL) { //CDS translation fasta output requested
489 //char*
490 if (cdsaa==NULL) { //translate now if not done before
491 cdsaa=translateDNA(cdsnt, aalen, seqlen);
492 }
493 if (fullattr && gffrec.attrs!=NULL) {
494 //append all attributes found for each transcripts
495 for (int i=0;i<gffrec.attrs->Count();i++) {
496 defline.append(" ");
497 defline.append(gffrec.getAttrName(i));
498 defline.append("=");
499 defline.append(gffrec.getAttrValue(i));
500 }
501 }
502 printFasta(f_y, defline, cdsaa, aalen);
503 }
504 if (f_x!=NULL) { //CDS only
505 if (writeExonSegs) {
506 defline.append(" loc:");
507 defline.append(gffrec.getGSeqName());
508 defline.appendfmt("(%c)",gffrec.strand);
509 //warning: not CDS coordinates are written here, but the exon ones
510 defline+=(int)gffrec.start;
511 defline+=(char)'-';
512 defline+=(int)gffrec.end;
513 // -- here these are CDS substring coordinates on the spliced sequence:
514 defline.append(" segs:");
515 for (int i=0;i<seglst.Count();i++) {
516 if (i>0) defline.append(",");
517 defline+=(int)seglst[i]->start;
518 defline.append("-");
519 defline+=(int)seglst[i]->end;
520 }
521 }
522 if (fullattr && gffrec.attrs!=NULL) {
523 //append all attributes found for each transcript
524 for (int i=0;i<gffrec.attrs->Count();i++) {
525 defline.append(" ");
526 defline.append(gffrec.getAttrName(i));
527 defline.append("=");
528 defline.append(gffrec.getAttrValue(i));
529 }
530 }
531 printFasta(f_x, defline, cdsnt, seqlen);
532 }
533 GFREE(cdsnt);
534 GFREE(cdsaa);
535 if (f_w!=NULL) { //write spliced exons
536 uint cds_start=0;
537 uint cds_end=0;
538 seglst.Clear();
539 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
540 if (exont!=NULL) {
541 if (gffrec.CDstart>0) {
542 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
543 }
544 if (writeExonSegs) {
545 defline.append(" loc:");
546 defline.append(gffrec.getGSeqName());
547 defline+=(char)'|';
548 defline+=(int)gffrec.start;
549 defline+=(char)'-';
550 defline+=(int)gffrec.end;
551 defline+=(char)'|';
552 defline+=(char)gffrec.strand;
553 defline.append(" exons:");
554 for (int i=0;i<gffrec.exons.Count();i++) {
555 if (i>0) defline.append(",");
556 defline+=(int)gffrec.exons[i]->start;
557 defline.append("-");
558 defline+=(int)gffrec.exons[i]->end;
559 }
560 defline.append(" segs:");
561 for (int i=0;i<seglst.Count();i++) {
562 if (i>0) defline.append(",");
563 defline+=(int)seglst[i]->start;
564 defline.append("-");
565 defline+=(int)seglst[i]->end;
566 }
567 }
568 if (fullattr && gffrec.attrs!=NULL) {
569 //append all attributes found for each transcripts
570 for (int i=0;i<gffrec.attrs->Count();i++) {
571 defline.append(" ");
572 defline.append(gffrec.getAttrName(i));
573 defline.append("=");
574 defline.append(gffrec.getAttrValue(i));
575 }
576 }
577 printFasta(f_w, defline, exont, seqlen);
578 GFREE(exont);
579 }
580 } //writing f_w (spliced exons)
581 return true;
582 }
583
584 void openfw(FILE* &f, GArgs& args, char opt) {
585 GStr s=args.getOpt(opt);
586 if (!s.is_empty()) {
587 if (s=='-')
588 f=stdout;
589 else {
590 f=fopen(s,"w");
591 if (f==NULL) GError("Error creating file: %s\n", s.chars());
592 }
593 }
594 }
595
596 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
597 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
598
599 void printGff3Header(FILE* f, GArgs& args) {
600 fprintf(f, "# ");
601 args.printCmdLine(f);
602 fprintf(f, "##gff-version 3\n");
603 //for (int i=0;i<gseqdata.Count();i++) {
604 //
605 //}
606 }
607
608 bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
609 if (reftbl.Count()>0) {
610 GStr refname(gffrec->getRefName());
611 RefTran* rt=reftbl.Find(refname.chars());
612 if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
613 //try removing the version suffix
614 refname.cut(-2);
615 //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
616 rt=reftbl.Find(refname.chars());
617 }
618 if (rt) {
619 gffrec->setRefName(rt->new_name);
620 }
621 else return false; //discard, ref seq not in the given translation table
622 }
623 if (mRNAOnly && gffrec->isDiscarded()) {
624 //discard generic "gene" or "locus" features with no other detailed subfeatures
625 //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
626 return false;
627 }
628 /*
629 if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
630 //a non-mRNA feature with no subfeatures
631 //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
632 //--this could be a single "pseudogene" entry or another genomic region without exons
633 //
634 gffrec->addExon(gffrec->start,gffrec->end);
635 }
636 */
637 if (rfltGSeq!=NULL) { //filter by gseqName
638 if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
639 return false;
640 }
641 }
642 if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
643 return false;
644 }
645 //check coordinates
646 if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
647 if (rfltWithin) {
648 if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
649 return false; //not within query range
650 }
651 }
652 else {
653 if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
654 return false;
655 }
656 }
657 }
658 if (multiExon && gffrec->exons.Count()<=1) {
659 return false;
660 }
661 if (wCDSonly && gffrec->CDstart==0) {
662 return false;
663 }
664 if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
665 //keep track of chr|gene_id data -- coordinate range
666 char* geneid=gffrec->getGeneID();
667 if (geneid!=NULL) {
668 GeneInfo* ginfo=gene_ids.Find(geneid);
669 if (ginfo==NULL) {//first time seeing this gene ID
670 GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
671 gene_ids.Add(geneid, geneinfo);
672 if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
673 }
674 else ginfo->update(gffrec);
675 }
676 }
677 return true;
678 }
679
680
681 int main(int argc, char * const argv[]) {
682 GArgs args(argc, argv,
683 "debug;merge;cluster-only;help;force-exons;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
684 args.printError(USAGE, true);
685 if (args.getOpt('h') || args.getOpt("help")) {
686 GMessage("%s",USAGE);
687 exit(1);
688 }
689 debugMode=(args.getOpt("debug")!=NULL);
690 bool forceExons=(args.getOpt("force-exons")!=NULL);
691 mRNAOnly=(args.getOpt('O')==NULL);
692 //sortByLoc=(args.getOpt('S')!=NULL);
693 addDescr=(args.getOpt('A')!=NULL);
694 verbose=(args.getOpt('v')!=NULL);
695 wCDSonly=(args.getOpt('C')!=NULL);
696 validCDSonly=(args.getOpt('V')!=NULL);
697 altPhases=(args.getOpt('H')!=NULL);
698 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
699 bothStrands=(args.getOpt('B')!=NULL);
700 fullCDSonly=(args.getOpt('J')!=NULL);
701 spliceCheck=(args.getOpt('N')!=NULL);
702 bool matchAllIntrons=(args.getOpt('K')==NULL);
703 bool fuzzSpan=(args.getOpt('Q')!=NULL);
704 if (args.getOpt('M') || args.getOpt("merge")) {
705 doCluster=true;
706 doCollapseRedundant=true;
707 }
708 else {
709 if (!matchAllIntrons || fuzzSpan) {
710 GMessage("%s",USAGE);
711 GMessage("Error: -K or -Q options require -M/--merge option!\n");
712 exit(1);
713 }
714 }
715 if (args.getOpt("cluster-only")) {
716 doCluster=true;
717 doCollapseRedundant=false;
718 if (!matchAllIntrons || fuzzSpan) {
719 GMessage("%s",USAGE);
720 GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
721 exit(1);
722 }
723 }
724 if (fullCDSonly) validCDSonly=true;
725 if (verbose) {
726 fprintf(stderr, "Command line was:\n");
727 args.printCmdLine(stderr);
728 }
729
730 fullattr=(args.getOpt('F')!=NULL);
731 if (args.getOpt('G')==NULL)
732 noExonAttr=!fullattr;
733 else {
734 noExonAttr=true;
735 fullattr=true;
736 }
737 ensembl_convert=(args.getOpt('L')!=NULL);
738 if (ensembl_convert) {
739 fullattr=true;
740 noExonAttr=false;
741 //sortByLoc=true;
742 }
743
744 mergeCloseExons=(args.getOpt('Z')!=NULL);
745 multiExon=(args.getOpt('U')!=NULL);
746 writeExonSegs=(args.getOpt('W')!=NULL);
747 tracklabel=args.getOpt('t');
748 GFastaDb gfasta(args.getOpt('g'));
749 //if (gfasta.fastaPath!=NULL)
750 // sortByLoc=true; //enforce sorting by chromosome/contig
751 GStr s=args.getOpt('i');
752 if (!s.is_empty()) maxintron=s.asInt();
753
754 FILE* f_repl=NULL;
755 s=args.getOpt('d');
756 if (!s.is_empty()) {
757 if (s=="-") f_repl=stdout;
758 else {
759 f_repl=fopen(s.chars(), "w");
760 if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
761 }
762 }
763
764 rfltWithin=(args.getOpt('R')!=NULL);
765 s=args.getOpt('r');
766 if (!s.is_empty()) {
767 s.trim();
768 if (s[0]=='+' || s[0]=='-') {
769 rfltStrand=s[0];
770 s.cut(0,1);
771 }
772 int isep=s.index(':');
773 if (isep>0) { //gseq name given
774 if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
775 isep--;
776 rfltStrand=s[isep];
777 s.cut(isep,1);
778 }
779 if (isep>0)
780 rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
781 s.cut(0,isep+1);
782 }
783 GStr gsend;
784 char slast=s[s.length()-1];
785 if (rfltStrand==0 && (slast=='+' || slast=='-')) {
786 s.chomp(slast);
787 rfltStrand=slast;
788 }
789 if (s.index("..")>=0) gsend=s.split("..");
790 else gsend=s.split('-');
791 if (!s.is_empty()) rfltStart=(uint)s.asInt();
792 if (!gsend.is_empty()) {
793 rfltEnd=(uint)gsend.asInt();
794 if (rfltEnd==0) rfltEnd=MAX_UINT;
795 }
796 } //gseq/range filtering
797 else {
798 if (rfltWithin)
799 GError("Error: option -R requires -r!\n");
800 //if (rfltWholeTranscript)
801 // GError("Error: option -P requires -r!\n");
802 }
803 s=args.getOpt('m');
804 if (!s.is_empty()) {
805 FILE* ft=fopen(s,"r");
806 if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
807 loadRefTable(ft, reftbl);
808 fclose(ft);
809 }
810 s=args.getOpt('s');
811 if (!s.is_empty()) {
812 FILE* fsize=fopen(s,"r");
813 if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
814 loadSeqInfo(fsize, seqinfo);
815 fclose(fsize);
816 }
817
818 openfw(f_out, args, 'o');
819 //if (f_out==NULL) f_out=stdout;
820 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
821 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
822
823 openfw(f_w, args, 'w');
824 openfw(f_x, args, 'x');
825 openfw(f_y, args, 'y');
826 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
827 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
828
829 int numfiles = args.startNonOpt();
830 //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
831 int out_counter=0; //number of records printed
832 while (true) {
833 GStr infile;
834 if (numfiles) {
835 infile=args.nextNonOpt();
836 if (infile.is_empty()) break;
837 if (infile=="-") { f_in=stdin; infile="stdin"; }
838 else
839 if ((f_in=fopen(infile, "r"))==NULL)
840 GError("Error: cannot open input file %s!\n",infile.chars());
841 }
842 else
843 infile="-";
844 GffLoader gffloader(infile.chars());
845 gffloader.transcriptsOnly=mRNAOnly;
846 gffloader.fullAttributes=fullattr;
847 gffloader.noExonAttrs=noExonAttr;
848 gffloader.mergeCloseExons=mergeCloseExons;
849 gffloader.showWarnings=(args.getOpt('E')!=NULL);
850 gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
851 matchAllIntrons, fuzzSpan, forceExons);
852 if (doCluster)
853 collectLocusData(g_data);
854 if (numfiles==0) break;
855 }
856
857 GStr loctrack("gffcl");
858 if (tracklabel) loctrack=tracklabel;
859 g_data.setSorted(&gseqCmpName);
860 if (doCluster) {
861 //grouped in loci
862 for (int g=0;g<g_data.Count();g++) {
863 GenomicSeqData* gdata=g_data[g];
864 for (int l=0;l<gdata->loci.Count();l++) {
865 GffLocus& loc=*(gdata->loci[l]);
866 //check all non-replaced transcripts in this locus:
867 int numvalid=0;
868 int idxfirstvalid=-1;
869 for (int i=0;i<loc.rnas.Count();i++) {
870 GffObj& t=*(loc.rnas[i]);
871 GTData* tdata=(GTData*)(t.uptr);
872 if (tdata->replaced_by!=NULL) {
873 if (f_repl && (t.udata & 8)==0) {
874 //t.udata|=8;
875 fprintf(f_repl, "%s", t.getID());
876 GTData* rby=tdata;
877 while (rby->replaced_by!=NULL) {
878 fprintf(f_repl," => %s", rby->replaced_by->getID());
879 rby->rna->udata|=8;
880 rby=(GTData*)(rby->replaced_by->uptr);
881 }
882 fprintf(f_repl, "\n");
883 }
884 continue;
885 }
886 if (process_transcript(gfasta, t)) {
887 t.udata|=4; //tag it as valid
888 numvalid++;
889 if (idxfirstvalid<0) idxfirstvalid=i;
890 }
891 }
892
893 if (f_out && numvalid>0) {
894 GStr locname("RLOC_");
895 locname.appendfmt("%08d",loc.locus_num);
896 if (!fmtGTF) {
897 if (out_counter==0)
898 printGff3Header(f_out, args);
899 fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
900 loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
901 locname.chars(), locname.chars());
902 //const char* loc_gname=loc.getGeneName();
903 if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
904 fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
905 for (int i=1;i<loc.gene_names.Count();i++) {
906 fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
907 }
908 }
909 if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
910 fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
911 for (int i=1;i<loc.gene_ids.Count();i++) {
912 fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
913 }
914 }
915 fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
916 for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
917 fprintf(f_out, ",%s",loc.rnas[i]->getID());
918 }
919 fprintf(f_out, "\n");
920 }
921 //now print all valid, non-replaced transcripts in this locus:
922 for (int i=0;i<loc.rnas.Count();i++) {
923 GffObj& t=*(loc.rnas[i]);
924 GTData* tdata=(GTData*)(t.uptr);
925 if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
926 t.addAttr("locus", locname.chars());
927 out_counter++;
928 if (fmtGTF) t.printGtf(f_out, tracklabel);
929 else {
930 //print the parent first, if any
931 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
932 GTData* pdata=(GTData*)(t.parent->uptr);
933 if (pdata->geneinfo!=NULL)
934 pdata->geneinfo->finalize();
935 t.parent->addAttr("locus", locname.chars());
936 t.parent->printGff(f_out, tracklabel);
937 t.parent->udata|=4;
938 }
939 t.printGff(f_out, tracklabel);
940 }
941 }
942 } //have valid transcripts to print
943 }//for each locus
944 if (f_out && !mRNAOnly) {
945 //final pass through the non-transcripts, in case any of them were not printed
946 //TODO: order broken, these should be interspersed among the rnas in the correct order!
947 for (int m=0;m<gdata->gfs.Count();m++) {
948 GffObj& t=*(gdata->gfs[m]);
949 if ((t.udata&4)==0) { //never printed
950 t.udata|=4;
951 if (fmtGTF) t.printGtf(f_out, tracklabel);
952 else t.printGff(f_out, tracklabel);
953 }
954 } //for each non-transcript
955 }
956 } //for each genomic sequence
957 }
958 else {
959 //not grouped into loci, print the rnas with their parents, if any
960 int numvalid=0;
961 for (int g=0;g<g_data.Count();g++) {
962 GenomicSeqData* gdata=g_data[g];
963 for (int m=0;m<gdata->rnas.Count();m++) {
964 GffObj& t=*(gdata->rnas[m]);
965 GTData* tdata=(GTData*)(t.uptr);
966 if (tdata->replaced_by!=NULL) continue;
967 if (process_transcript(gfasta, t)) {
968 t.udata|=4; //tag it as valid
969 numvalid++;
970 if (f_out) {
971 if (tdata->geneinfo) tdata->geneinfo->finalize();
972 out_counter++;
973 if (fmtGTF) t.printGtf(f_out, tracklabel);
974 else {
975 if (out_counter==1)
976 printGff3Header(f_out, args);
977 //print the parent first, if any
978 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
979 GTData* pdata=(GTData*)(t.parent->uptr);
980 if (pdata->geneinfo!=NULL)
981 pdata->geneinfo->finalize();
982 t.parent->printGff(f_out, tracklabel);
983 t.parent->udata|=4;
984 }
985 t.printGff(f_out, tracklabel);
986 }
987 }//GFF/GTF output requested
988 } //valid transcript
989 } //for each rna
990 if (f_out && !mRNAOnly) {
991 //final pass through the non-transcripts, in case any of them were not printed
992 //TODO: order broken, these should be interspersed among the rnas in the correct order!
993 for (int m=0;m<gdata->gfs.Count();m++) {
994 GffObj& t=*(gdata->gfs[m]);
995 if ((t.udata&4)==0) { //never printed
996 t.udata|=4;
997 if (fmtGTF) t.printGtf(f_out, tracklabel);
998 else t.printGff(f_out, tracklabel);
999 }
1000 } //for each non-transcript
1001 }
1002 } //for each genomic seq
1003 }
1004 if (f_repl && f_repl!=stdout) fclose(f_repl);
1005 seqinfo.Clear();
1006 //if (faseq!=NULL) delete faseq;
1007 //if (gcdb!=NULL) delete gcdb;
1008 GFREE(rfltGSeq);
1009 FRCLOSE(f_in);
1010 FWCLOSE(f_out);
1011 FWCLOSE(f_w);
1012 FWCLOSE(f_x);
1013 FWCLOSE(f_y);
1014 }