ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 17
Committed: Mon Jul 18 20:57:05 2011 UTC (13 years, 2 months ago) by gpertea
File size: 35262 byte(s)
Log Message:
sync with local source

Line File contents
1 #include "gff_utils.h"
2 #include "GArgs.h"
3 #include <ctype.h>
4 // don't care about cdb compression
5 //#ifdef ENABLE_COMPRESSION
6 //#undef ENABLE_COMPRESSION
7 //#endif
8 //#include "GCdbYank.h"
9
10 #define USAGE "Usage:\n\
11 gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12 [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end>] \n\
13 [-CTVNJMKQAFGRUVBHZWTOLE] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tr_cds.fa>]\n\
14 [-i <maxintron>] \n\
15 Filters and/or converts GFF3/GTF2 records.\n\
16 <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17 \n\
18 Options:\n\
19 -g full path to a multi-fasta file with the genomic sequences\n\
20 for all input mappings, OR a directory with single-fasta files\n\
21 (one per genomic sequence, with file names matching sequence names)\n\
22 -s <seq_info.fsize> is a tab-delimited file providing this info\n\
23 for each of the mapped sequences:\n\
24 <seq-name> <seq-length> <seq-description>\n\
25 (useful for -A option with mRNA/EST/protein mappings)\n\
26 -i discard transcripts having an intron larger than <maxintron>\n\
27 -r only show transcripts crossing coordinate range <start>..<end>\n\
28 (on chromosome/contig <chr>, strand <strand> if provided)\n\
29 -R for -r option, discard all transcripts that are not fully \n\
30 contained within given range\n\
31 -U discard single-exon transcripts\n\
32 -C coding only: discard mRNAs that have no CDS feature\n\
33 -F full GFF attribute preservation (all attributes are shown)\n\
34 -G only parse additional exon attributes from the first exon\n\
35 and move them to the mRNA level (useful for GTF input)\n\
36 -A use the description field from <seq_info.fsize> and add it\n\
37 as the value for a 'descr' attribute to the GFF record\n\
38 \n\
39 -O process also non-transcript GFF records (by default non-transcript\n\
40 records are ignored)\n\
41 -V discard any mRNAs with CDS having in-frame stop codons\n\
42 -H for -V option, check and adjust the starting CDS phase\n\
43 if the original phase leads to a translation with an \n\
44 in-frame stop codon\n\
45 -B for -V option, single-exon transcripts are also checked on the\n\
46 opposite strand\n\
47 -N discard multi-exon mRNAs that have any intron with a non-canonical\n\
48 splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49 -J discard any mRNAs that either lack initial START codon\n\
50 or the terminal STOP codon, or have an in-frame stop codon\n\
51 (only print mRNAs with a fulll, valid CDS)\n\
52 \n\
53 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54 transcripts (those with the same exact introns and fully contained)\n\
55 --cluster-only: same as --merge but without collapsing matching transcripts\n\
56 -K for -M option: also collapse shorter, fully contained transcripts\n\
57 with fewer introns than the container\n\
58 -Q for -M option, remove the containment restriction:\n\
59 (multi-exon transcripts will be collapsed if just their introns match,\n\
60 while single-exon transcripts can partially overlap (80%))\n\
61 \n\
62 -E expose (warn about) duplicate transcript IDs and other potential \n\
63 problems with the given GFF/GTF records\n\
64 -Z merge close exons into a single exon (for intron size<4)\n\
65 -w write a fasta file with spliced exons for each GFF transcript\n\
66 -x write a fasta file with spliced CDS for each GFF transcript\n\
67 -W for -w and -x options, also write for each fasta record the exon\n\
68 coordinates projected onto the spliced sequence\n\
69 -y write a protein fasta file with the translation of CDS for each record\n\
70 -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
71 -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
72 this format:\n\
73 <original_ref_ID> <new_ref_ID>\n\
74 GFF records on reference sequences that are not found among the\n\
75 <original_ref_ID> entries in this file will be filtered out\n\
76 -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
77 (use -o- for printing to stdout)\n\
78 -t use <trackname> in the second column of each GFF output line\n\
79 -T -o option will output GTF format instead of GFF3\n\
80 "
81
82
83 class SeqInfo { //populated from the -s option of gffread
84 public:
85 int len;
86 char* descr;
87 SeqInfo( int l, char* s) {
88 len=l;
89 if (s==NULL) {
90 descr=NULL;
91 } else {
92 descr=Gstrdup(s);
93 }
94 }
95 ~SeqInfo() {
96 GFREE(descr);
97 }
98 };
99
100 class RefTran {
101 public:
102 char* new_name;
103 RefTran(char *ns) {
104 new_name=NULL;
105 if (ns!=NULL)
106 new_name=Gstrdup(ns);
107 }
108 ~RefTran() {
109 GFREE(new_name);
110 }
111 };
112
113 FILE* ffasta=NULL;
114 FILE* f_in=NULL;
115 FILE* f_out=NULL;
116 FILE* f_w=NULL; //fasta with spliced exons (transcripts)
117 FILE* f_x=NULL; //fasta with spliced CDS
118 FILE* f_y=NULL; //fasta with translated CDS
119 bool wCDSonly=false;
120
121 bool validCDSonly=false; // translation with no in-frame STOP
122 bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
123 bool altPhases=false; //if original phase fails translation validation,
124 //try the other 2 phases until one makes it
125 bool mRNAOnly=true;
126 bool spliceCheck=false; //only known splice-sites
127
128 bool fullCDSonly=false; // starts with START, ends with STOP codon
129 bool fullattr=false;
130 //bool sortByLoc=false; // if the GFF output should be sorted by location
131 bool ensembl_convert=false; //-L, assisst in converting Ensembl GTF to GFF3
132
133
134 //GStr gseqpath;
135 //GStr gcdbfa;
136 //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
137 //GFaSeqGet* faseq=NULL;
138 //GCdbYank* gcdb=NULL;
139 //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
140 bool fmtGTF=false;
141 bool addDescr=false;
142 //bool protmap=false;
143 bool multiExon=false;
144 bool writeExonSegs=false;
145 char* tracklabel=NULL;
146 int maxintron=999000000;
147 bool mergeCloseExons=false;
148 //range filter:
149 char* rfltGSeq=NULL;
150 char rfltStrand=0;
151 uint rfltStart=0;
152 uint rfltEnd=MAX_UINT;
153 bool rfltWithin=false; //check for full containment within given range
154 bool noExonAttr=false;
155
156 bool doCluster=false;
157 bool doCollapseRedundant=false;
158
159 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
160
161 //hash with sequence info
162 GHash<SeqInfo> seqinfo;
163 GHash<int> isoCounter; //counts the valid isoforms
164 GHash<RefTran> reftbl;
165 GHash<GeneInfo> gene_ids;
166 //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
167
168 bool debugMode=false;
169 bool verbose=false;
170
171 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
172 GLineReader fr(f);
173 while (!fr.isEof()) {
174 char* line=fr.getLine();
175 if (line==NULL) break;
176 char* id=line;
177 char* lenstr=NULL;
178 char* text=NULL;
179 char* p=line;
180 while (*p!=0 && !isspace(*p)) p++;
181 if (*p==0) continue;
182 *p=0;p++;
183 while (*p==' ' || *p=='\t') p++;
184 if (*p==0) continue;
185 lenstr=p;
186 while (*p!=0 && !isspace(*p)) p++;
187 if (*p!=0) { *p=0;p++; }
188 while (*p==' ' || *p=='\t') p++;
189 if (*p!=0) text=p; //else text remains NULL
190 int len=0;
191 if (!parseInt(lenstr,len)) {
192 GMessage("Warning: could not parse sequence length: %s %s\n",
193 id, lenstr);
194 continue;
195 }
196 // --- here we have finished parsing the line
197 si.Add(id, new SeqInfo(len,text));
198 } //while lines
199 }
200
201 void loadRefTable(FILE* f, GHash<RefTran>& rt) {
202 GLineReader fr(f);
203 char* line=NULL;
204 while ((line=fr.getLine())) {
205 char* orig_id=line;
206 char* p=line;
207 while (*p!=0 && !isspace(*p)) p++;
208 if (*p==0) continue;
209 *p=0;p++;//split the line here
210 while (*p==' ' || *p=='\t') p++;
211 if (*p==0) continue;
212 rt.Add(orig_id, new RefTran(p));
213 } //while lines
214 }
215
216 char* getSeqDescr(char* seqid) {
217 static char charbuf[128];
218 if (seqinfo.Count()==0) return NULL;
219 char* suf=rstrchr(seqid, '.');
220 if (suf!=NULL) *suf=0;
221 SeqInfo* seqd=seqinfo.Find(seqid);
222 if (suf!=NULL) *suf='.';
223 if (seqd!=NULL) {
224 GStr s(seqd->descr);
225 //cleanup some Uniref gunk
226 if (s[0]=='[') {
227 int r=s.index(']');
228 if (r>=0 && r<8 && isdigit(s[1]))
229 s.remove(0,r+1);
230 }
231 if (s.length()>80) {
232 int r=s.index(';');
233 if (r>5) s.cut(r);
234 }
235 if (s.length()>127) {
236 s.cut(127);
237 int r=s.rindex(' ');
238 if (r>0) s.cut(r);
239 }
240 strcpy(charbuf, s.chars());
241 return charbuf;
242 }
243 else return NULL;
244 }
245
246 char* getSeqName(char* seqid) {
247 static char charbuf[128];
248 char* suf=rstrchr(seqid, '.');
249 if (suf!=NULL) *suf=0;
250 strcpy(charbuf, seqid);
251 if (suf!=NULL) *suf='.';
252 return charbuf;
253 }
254
255 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
256 if (gfasta.fastaPath==NULL) return NULL;
257 return gfasta.fetch(gffrec.gseq_id);
258 }
259
260
261 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
262 //adj>0 => extedn CDS, adj<0 => shrink CDS
263 //when CDS is expanded, exons have to be checked too and
264 // expanded accordingly if they had the same boundary
265 int realadj=0;
266 if (gffrec.strand=='-') {
267 if ((int)gffrec.CDstart>adj) {
268
269 gffrec.CDstart-=adj;
270 realadj=adj;
271 if (adj<0) { //restore
272 if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
273 gffrec.exons.First()->start-=adj;
274 gffrec.start=gffrec.exons.First()->start;
275 gffrec.covlen+=adj;
276 }
277 }
278 else if (gffrec.exons.First()->start>=gffrec.CDstart) {
279 gffrec.exons.First()->start-=adj;
280 gffrec.start=gffrec.exons.First()->start;
281 gffrec.covlen+=adj;
282 }
283 }
284 }
285 else {
286 realadj=adj;
287 gffrec.CDend+=adj;
288 if (adj<0) {//restore
289 if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
290 gffrec.exons.Last()->end+=adj;
291 gffrec.end=gffrec.exons.Last()->end;
292 gffrec.covlen+=adj;
293 }
294 }
295 else if (gffrec.exons.Last()->end<=gffrec.CDend) {
296 gffrec.exons.Last()->end+=adj;
297 gffrec.end=gffrec.exons.Last()->end;
298 gffrec.covlen+=adj;
299 }
300 }
301 if (seglst!=NULL) seglst->Last()->end+=adj;
302 return realadj;
303 }
304
305 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
306 //returns true if the transcript passed the filter
307 char* gname=gffrec.getGeneName();
308 if (gname==NULL) gname=gffrec.getGeneID();
309 GStr defline(gffrec.getID());
310 if (f_out && !fmtGTF) {
311 const char* tn=NULL;
312 if ((tn=gffrec.getAttr("transcript_name"))!=NULL) {
313 gffrec.addAttr("Name", tn);
314 gffrec.removeAttr("transcript_name");
315 }
316 }
317 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
318 const char* tn=gffrec.getTrackName();
319 gffrec.addAttr("type", tn);
320 //bool is_gene=false;
321 bool is_pseudo=false;
322 if (strcmp(tn, "protein_coding")==0 || gffrec.hasCDS())
323 gffrec.setFeatureName("mRNA");
324 else {
325 if (strcmp(tn, "processed_transcript")==0)
326 gffrec.setFeatureName("proc_RNA");
327 else {
328 //is_gene=endsWith(tn, "gene");
329 is_pseudo=strifind(tn, "pseudo");
330 if (is_pseudo) {
331 gffrec.setFeatureName("pseudo_RNA");
332 }
333 else if (endsWith(tn, "RNA")) {
334 gffrec.setFeatureName(tn);
335 } else gffrec.setFeatureName("misc_RNA");
336 }
337 }
338 }
339 if (gname && strcmp(gname, gffrec.getID())!=0) {
340 int* isonum=isoCounter.Find(gname);
341 if (isonum==NULL) {
342 isonum=new int(1);
343 isoCounter.Add(gname,isonum);
344 }
345 else (*isonum)++;
346 defline.appendfmt(" gene=%s", gname);
347 }
348 int seqlen=0;
349
350 const char* tlabel=tracklabel;
351 if (tlabel==NULL) tlabel=gffrec.getTrackName();
352 //defline.appendfmt(" track:%s",tlabel);
353 char* cdsnt = NULL;
354 char* cdsaa = NULL;
355 int aalen=0;
356 for (int i=1;i<gffrec.exons.Count();i++) {
357 int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
358 if (ilen>4000000)
359 GMessage("Warning: very large intron (%d) for transcript %s\n",
360 ilen, gffrec.getID());
361 if (ilen>maxintron) {
362 return false;
363 }
364 }
365 GList<GSeg> seglst(false,true);
366 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
367 if (spliceCheck && gffrec.exons.Count()>1) {
368 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
369 if (faseq==NULL) GError("Error: no genomic sequence available!\n");
370 int glen=gffrec.end-gffrec.start+1;
371 const char* gseq=faseq->subseq(gffrec.start, glen);
372 bool revcompl=(gffrec.strand=='-');
373 bool ssValid=true;
374 for (int e=1;e<gffrec.exons.Count();e++) {
375 const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
376 int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
377 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
378 GSpliceSite donorSite(intron,intronlen, false, revcompl);
379 //GMessage("%c intron %d-%d : %s .. %s\n",
380 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
381 if (acceptorSite=="AG") { // GT-AG or GC-AG
382 if (!donorSite.canonicalDonor()) {
383 ssValid=false;break;
384 }
385 }
386 else if (acceptorSite=="AC") { //
387 if (donorSite!="AT") { ssValid=false; break; }
388 }
389 else { ssValid=false; break; }
390 }
391 //GFREE(gseq);
392 if (!ssValid) {
393 if (verbose)
394 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
395 return false; //don't print this one!
396 }
397 }
398
399 bool trprint=true;
400 int stopCodonAdjust=0;
401 int mCDphase=0;
402 bool hasStop=false;
403 if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
404 mCDphase = gffrec.CDphase-'0';
405 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
406 if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
407 //if (protmap && fullCDSonly) {
408 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
409
410 if (validCDSonly) { //make sure the stop codon is always included
411 //adjust_stopcodon(gffrec,3);
412 stopCodonAdjust=adjust_stopcodon(gffrec,3);
413 }
414 int strandNum=0;
415 int phaseNum=0;
416 CDS_CHECK:
417 cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
418 if (cdsnt==NULL) trprint=false;
419 if (validCDSonly) {
420 cdsaa=translateDNA(cdsnt, aalen, seqlen);
421 char* p=strchr(cdsaa,'.');
422 hasStop=false;
423 if (p!=NULL) {
424 if (p-cdsaa>=aalen-2) { //stop found as the last codon
425 *p='0';//remove it
426 hasStop=true;
427 if (aalen-2==p-cdsaa) {
428 //previous to last codon is the stop codon
429 //so correct the CDS stop accordingly
430 adjust_stopcodon(gffrec,-3, &seglst);
431 stopCodonAdjust=0; //clear artificial stop adjustment
432 seqlen-=3;
433 cdsnt[seqlen]=0;
434 }
435 aalen=p-cdsaa;
436 }
437 else {//stop found before the last codon
438 trprint=false;
439 }
440 }//stop codon found
441 if (trprint==false) { //failed CDS validity check
442 //in-frame stop codon found
443 if (altPhases && phaseNum<3) {
444 phaseNum++;
445 gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
446 GFREE(cdsaa);
447 goto CDS_CHECK;
448 }
449 if (gffrec.exons.Count()==1 && bothStrands) {
450 strandNum++;
451 phaseNum=0;
452 if (strandNum<2) {
453 GFREE(cdsaa);
454 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
455 goto CDS_CHECK; //repeat the CDS check for a different frame
456 }
457 }
458 if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
459 } //has in-frame STOP
460 if (fullCDSonly) {
461 if (!hasStop || cdsaa[0]!='M') trprint=false;
462 }
463 } // CDS check requested
464 } //translation or codon check/output was requested
465 if (!trprint) {
466 GFREE(cdsnt);
467 GFREE(cdsaa);
468 return false;
469 }
470 if (stopCodonAdjust>0 && !hasStop) {
471 //restore stop codon location
472 adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
473 if (cdsnt!=NULL && seqlen>0) {
474 seqlen-=stopCodonAdjust;
475 cdsnt[seqlen]=0;
476 }
477 if (cdsaa!=NULL) aalen--;
478 }
479
480 if (f_y!=NULL) { //CDS translation fasta output requested
481 //char*
482 if (cdsaa==NULL) { //translate now if not done before
483 cdsaa=translateDNA(cdsnt, aalen, seqlen);
484 }
485 if (fullattr && gffrec.attrs!=NULL) {
486 //append all attributes found for each transcripts
487 for (int i=0;i<gffrec.attrs->Count();i++) {
488 defline.append(" ");
489 defline.append(gffrec.getAttrName(i));
490 defline.append("=");
491 defline.append(gffrec.getAttrValue(i));
492 }
493 }
494 printFasta(f_y, defline, cdsaa, aalen);
495 }
496 if (f_x!=NULL) { //CDS only
497 if (writeExonSegs) {
498 defline.append(" loc:");
499 defline.append(gffrec.getGSeqName());
500 defline.appendfmt("(%c)",gffrec.strand);
501 //warning: not CDS coordinates are written here, but the exon ones
502 defline+=(int)gffrec.start;
503 defline+=(char)'-';
504 defline+=(int)gffrec.end;
505 // -- here these are CDS substring coordinates on the spliced sequence:
506 defline.append(" segs:");
507 for (int i=0;i<seglst.Count();i++) {
508 if (i>0) defline.append(",");
509 defline+=(int)seglst[i]->start;
510 defline.append("-");
511 defline+=(int)seglst[i]->end;
512 }
513 }
514 if (fullattr && gffrec.attrs!=NULL) {
515 //append all attributes found for each transcript
516 for (int i=0;i<gffrec.attrs->Count();i++) {
517 defline.append(" ");
518 defline.append(gffrec.getAttrName(i));
519 defline.append("=");
520 defline.append(gffrec.getAttrValue(i));
521 }
522 }
523 printFasta(f_x, defline, cdsnt, seqlen);
524 }
525 GFREE(cdsnt);
526 GFREE(cdsaa);
527 if (f_w!=NULL) { //write spliced exons
528 uint cds_start=0;
529 uint cds_end=0;
530 seglst.Clear();
531 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
532 if (exont!=NULL) {
533 if (gffrec.CDstart>0) {
534 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
535 }
536 if (writeExonSegs) {
537 defline.append(" loc:");
538 defline.append(gffrec.getGSeqName());
539 defline+=(char)'|';
540 defline+=(int)gffrec.start;
541 defline+=(char)'-';
542 defline+=(int)gffrec.end;
543 defline+=(char)'|';
544 defline+=(char)gffrec.strand;
545 defline.append(" exons:");
546 for (int i=0;i<gffrec.exons.Count();i++) {
547 if (i>0) defline.append(",");
548 defline+=(int)gffrec.exons[i]->start;
549 defline.append("-");
550 defline+=(int)gffrec.exons[i]->end;
551 }
552 defline.append(" segs:");
553 for (int i=0;i<seglst.Count();i++) {
554 if (i>0) defline.append(",");
555 defline+=(int)seglst[i]->start;
556 defline.append("-");
557 defline+=(int)seglst[i]->end;
558 }
559 }
560 if (fullattr && gffrec.attrs!=NULL) {
561 //append all attributes found for each transcripts
562 for (int i=0;i<gffrec.attrs->Count();i++) {
563 defline.append(" ");
564 defline.append(gffrec.getAttrName(i));
565 defline.append("=");
566 defline.append(gffrec.getAttrValue(i));
567 }
568 }
569 printFasta(f_w, defline, exont, seqlen);
570 GFREE(exont);
571 }
572 } //writing f_w (spliced exons)
573 return true;
574 }
575
576 void openfw(FILE* &f, GArgs& args, char opt) {
577 GStr s=args.getOpt(opt);
578 if (!s.is_empty()) {
579 if (s=='-')
580 f=stdout;
581 else {
582 f=fopen(s,"w");
583 if (f==NULL) GError("Error creating file: %s\n", s.chars());
584 }
585 }
586 }
587
588 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
589 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
590
591 void printGff3Header(FILE* f, GArgs& args) {
592 fprintf(f, "# ");
593 args.printCmdLine(f);
594 fprintf(f, "##gff-version 3\n");
595 //for (int i=0;i<gseqdata.Count();i++) {
596 //
597 //}
598 }
599
600 bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
601 if (reftbl.Count()>0) {
602 GStr refname(gffrec->getRefName());
603 RefTran* rt=reftbl.Find(refname.chars());
604 if (rt==NULL && refname[-2]=='.' && isdigit(refname[-1])) {
605 //try removing the version
606 refname.cut(-2);
607 //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
608 rt=reftbl.Find(refname.chars());
609 }
610 if (rt) {
611 gffrec->setRefName(rt->new_name);
612 }
613 else return false; //discard, ref seq not in the given translation table
614 }
615 if (mRNAOnly && gffrec->isDiscarded()) {
616 //discard generic "gene" or "locus" features with no other detailed subfeatures
617 //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
618 return false;
619 }
620 /*
621 if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
622 //a non-mRNA feature with no subfeatures
623 //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
624 //--this could be a single "pseudogene" entry or another genomic region without exons
625 //
626 gffrec->addExon(gffrec->start,gffrec->end);
627 }
628 */
629 if (rfltGSeq!=NULL) { //filter by gseqName
630 if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
631 return false;
632 }
633 }
634 if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
635 return false;
636 }
637 //check coordinates
638 if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
639 if (rfltWithin) {
640 if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
641 return false;
642 }
643 }
644 else {
645 if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
646 return false;
647 }
648 }
649 }
650 if (multiExon && gffrec->exons.Count()<=1) {
651 return false;
652 }
653 if (wCDSonly && gffrec->CDstart==0) {
654 return false;
655 }
656 if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
657 //keep track of chr|gene_id data -- coordinate range
658 char* geneid=gffrec->getGeneID();
659 if (geneid!=NULL) {
660 GeneInfo* ginfo=gene_ids.Find(geneid);
661 if (ginfo==NULL) {//first time seeing this gene ID
662 GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
663 gene_ids.Add(geneid, geneinfo);
664 if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
665 }
666 else ginfo->update(gffrec);
667 }
668 }
669 return true;
670 }
671
672
673 int main(int argc, char * const argv[]) {
674 GArgs args(argc, argv,
675 "debug;merge;cluster-only;help;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
676 args.printError(USAGE, true);
677 if (args.getOpt('h') || args.getOpt("help")) {
678 GMessage("%s",USAGE);
679 exit(1);
680 }
681 debugMode=(args.getOpt("debug")!=NULL);
682 mRNAOnly=(args.getOpt('O')==NULL);
683 //sortByLoc=(args.getOpt('S')!=NULL);
684 addDescr=(args.getOpt('A')!=NULL);
685 verbose=(args.getOpt('v')!=NULL);
686 wCDSonly=(args.getOpt('C')!=NULL);
687 validCDSonly=(args.getOpt('V')!=NULL);
688 altPhases=(args.getOpt('H')!=NULL);
689 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
690 bothStrands=(args.getOpt('B')!=NULL);
691 fullCDSonly=(args.getOpt('J')!=NULL);
692 spliceCheck=(args.getOpt('N')!=NULL);
693 bool matchAllIntrons=(args.getOpt('K')==NULL);
694 bool fuzzSpan=(args.getOpt('Q')!=NULL);
695 if (args.getOpt('M') || args.getOpt("merge")) {
696 doCluster=true;
697 doCollapseRedundant=true;
698 }
699 else {
700 if (!matchAllIntrons || fuzzSpan) {
701 GMessage("%s",USAGE);
702 GMessage("Error: -K or -Q options require -M/--merge option!\n");
703 exit(1);
704 }
705 }
706 if (args.getOpt("cluster-only")) {
707 doCluster=true;
708 doCollapseRedundant=false;
709 if (!matchAllIntrons || fuzzSpan) {
710 GMessage("%s",USAGE);
711 GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
712 exit(1);
713 }
714 }
715 //protmap=(args.getOpt('P')!=NULL);
716 if (fullCDSonly) validCDSonly=true;
717 if (verbose) {
718 fprintf(stderr, "Command line was:\n");
719 args.printCmdLine(stderr);
720 }
721
722 fullattr=(args.getOpt('F')!=NULL);
723 if (args.getOpt('G')==NULL)
724 noExonAttr=!fullattr;
725 else {
726 noExonAttr=true;
727 fullattr=true;
728 }
729 ensembl_convert=(args.getOpt('L')!=NULL);
730 if (ensembl_convert) {
731 fullattr=true;
732 noExonAttr=false;
733 //sortByLoc=true;
734 }
735
736 mergeCloseExons=(args.getOpt('Z')!=NULL);
737 multiExon=(args.getOpt('U')!=NULL);
738 writeExonSegs=(args.getOpt('W')!=NULL);
739 tracklabel=args.getOpt('t');
740 GFastaDb gfasta(args.getOpt('g'));
741 //if (gfasta.fastaPath!=NULL)
742 // sortByLoc=true; //enforce sorting by chromosome/contig
743 GStr s=args.getOpt('i');
744 if (!s.is_empty()) maxintron=s.asInt();
745
746 FILE* f_repl=NULL;
747 s=args.getOpt('d');
748 if (!s.is_empty()) {
749 if (s=="-") f_repl=stdout;
750 else {
751 f_repl=fopen(s.chars(), "w");
752 if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
753 }
754 }
755
756 rfltWithin=(args.getOpt('R')!=NULL);
757 s=args.getOpt('r');
758 if (!s.is_empty()) {
759 s.trim();
760 if (s[0]=='+' || s[0]=='-') {
761 rfltStrand=s[0];
762 s.cut(0,1);
763 }
764 int isep=s.index(':');
765 if (isep>0) { //gseq name given
766 if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
767 isep--;
768 rfltStrand=s[isep];
769 s.cut(isep,1);
770 }
771 if (isep>0)
772 rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
773 s.cut(0,isep+1);
774 }
775 GStr gsend;
776 char slast=s[s.length()-1];
777 if (rfltStrand==0 && (slast=='+' || slast=='-')) {
778 s.chomp(slast);
779 rfltStrand=slast;
780 }
781 if (s.index("..")>=0) gsend=s.split("..");
782 else gsend=s.split('-');
783 if (!s.is_empty()) rfltStart=(uint)s.asInt();
784 if (!gsend.is_empty()) {
785 rfltEnd=(uint)gsend.asInt();
786 if (rfltEnd==0) rfltEnd=MAX_UINT;
787 }
788
789 } //gseq/range filtering
790 else {
791 if (rfltWithin)
792 GError("Error: option -R doesn't make sense without -r!\n");
793 }
794 s=args.getOpt('m');
795 if (!s.is_empty()) {
796 FILE* ft=fopen(s,"r");
797 if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
798 loadRefTable(ft, reftbl);
799 fclose(ft);
800 }
801 s=args.getOpt('s');
802 if (!s.is_empty()) {
803 FILE* fsize=fopen(s,"r");
804 if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
805 loadSeqInfo(fsize, seqinfo);
806 fclose(fsize);
807 }
808
809 openfw(f_out, args, 'o');
810 //if (f_out==NULL) f_out=stdout;
811 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
812 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
813
814 openfw(f_w, args, 'w');
815 openfw(f_x, args, 'x');
816 openfw(f_y, args, 'y');
817 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
818 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
819
820 int numfiles = args.startNonOpt();
821 //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
822 int out_counter=0; //number of records printed
823 while (true) {
824 GStr infile;
825 if (numfiles) {
826 infile=args.nextNonOpt();
827 if (infile.is_empty()) break;
828 if (infile=="-") { f_in=stdin; infile="stdin"; }
829 else
830 if ((f_in=fopen(infile, "r"))==NULL)
831 GError("Error: cannot open input file %s!\n",infile.chars());
832 }
833 else
834 infile="-";
835 GffLoader gffloader(infile.chars());
836 gffloader.transcriptsOnly=mRNAOnly;
837 gffloader.fullAttributes=fullattr;
838 gffloader.noExonAttrs=noExonAttr;
839 gffloader.mergeCloseExons=mergeCloseExons;
840 gffloader.showWarnings=(args.getOpt('E')!=NULL);
841 gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
842 matchAllIntrons, fuzzSpan);
843 if (doCluster)
844 collectLocusData(g_data);
845 if (numfiles==0) break;
846 }
847
848 GStr loctrack("gffcl");
849 if (tracklabel) loctrack=tracklabel;
850 g_data.setSorted(&gseqCmpName);
851 if (doCluster) {
852 //grouped in loci
853 for (int g=0;g<g_data.Count();g++) {
854 GenomicSeqData* gdata=g_data[g];
855 for (int l=0;l<gdata->loci.Count();l++) {
856 GffLocus& loc=*(gdata->loci[l]);
857 //check all non-replaced transcripts in this locus:
858 int numvalid=0;
859 int idxfirstvalid=-1;
860 for (int i=0;i<loc.rnas.Count();i++) {
861 GffObj& t=*(loc.rnas[i]);
862 GTData* tdata=(GTData*)(t.uptr);
863 if (tdata->replaced_by!=NULL) {
864 if (f_repl && (t.udata & 8)==0) {
865 //t.udata|=8;
866 fprintf(f_repl, "%s", t.getID());
867 GTData* rby=tdata;
868 while (rby->replaced_by!=NULL) {
869 fprintf(f_repl," => %s", rby->replaced_by->getID());
870 rby->rna->udata|=8;
871 rby=(GTData*)(rby->replaced_by->uptr);
872 }
873 fprintf(f_repl, "\n");
874 }
875 continue;
876 }
877 if (process_transcript(gfasta, t)) {
878 t.udata|=4; //tag it as valid
879 numvalid++;
880 if (idxfirstvalid<0) idxfirstvalid=i;
881 }
882 }
883
884 if (f_out && numvalid>0) {
885 GStr locname("RLOC_");
886 locname.appendfmt("%08d",loc.locus_num);
887 if (!fmtGTF) {
888 if (out_counter==0)
889 printGff3Header(f_out, args);
890 fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
891 loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
892 locname.chars(), locname.chars());
893 //const char* loc_gname=loc.getGeneName();
894 if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
895 fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
896 for (int i=1;i<loc.gene_names.Count();i++) {
897 fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
898 }
899 }
900 if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
901 fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
902 for (int i=1;i<loc.gene_ids.Count();i++) {
903 fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
904 }
905 }
906 fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
907 for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
908 fprintf(f_out, ",%s",loc.rnas[i]->getID());
909 }
910 fprintf(f_out, "\n");
911 }
912 //now print all valid, non-replaced transcripts in this locus:
913 for (int i=0;i<loc.rnas.Count();i++) {
914 GffObj& t=*(loc.rnas[i]);
915 GTData* tdata=(GTData*)(t.uptr);
916 if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
917 t.addAttr("locus", locname.chars());
918 out_counter++;
919 if (fmtGTF) t.printGtf(f_out, tracklabel);
920 else {
921 //print the parent first, if any
922 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
923 GTData* pdata=(GTData*)(t.parent->uptr);
924 if (pdata->geneinfo!=NULL)
925 pdata->geneinfo->finalize();
926 t.parent->addAttr("locus", locname.chars());
927 t.parent->printGff(f_out, tracklabel);
928 t.parent->udata|=4;
929 }
930 t.printGff(f_out, tracklabel);
931 }
932 }
933 } //have valid transcripts to print
934 }//for each locus
935 if (f_out && !mRNAOnly) {
936 //final pass through the non-transcripts, in case any of them were not printed
937 //TODO: order broken, these should be interspersed among the rnas in the correct order!
938 for (int m=0;m<gdata->gfs.Count();m++) {
939 GffObj& t=*(gdata->gfs[m]);
940 if ((t.udata&4)==0) { //never printed
941 t.udata|=4;
942 if (fmtGTF) t.printGtf(f_out, tracklabel);
943 else t.printGff(f_out, tracklabel);
944 }
945 } //for each non-transcript
946 }
947 } //for each genomic sequence
948 }
949 else {
950 //not grouped into loci, print the rnas with their parents, if any
951 int numvalid=0;
952 for (int g=0;g<g_data.Count();g++) {
953 GenomicSeqData* gdata=g_data[g];
954 for (int m=0;m<gdata->rnas.Count();m++) {
955 GffObj& t=*(gdata->rnas[m]);
956 GTData* tdata=(GTData*)(t.uptr);
957 if (tdata->replaced_by!=NULL) continue;
958 if (process_transcript(gfasta, t)) {
959 t.udata|=4; //tag it as valid
960 numvalid++;
961 if (f_out) {
962 if (tdata->geneinfo) tdata->geneinfo->finalize();
963 out_counter++;
964 if (fmtGTF) t.printGtf(f_out, tracklabel);
965 else {
966 if (out_counter==1)
967 printGff3Header(f_out, args);
968 //print the parent first, if any
969 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
970 GTData* pdata=(GTData*)(t.parent->uptr);
971 if (pdata->geneinfo!=NULL)
972 pdata->geneinfo->finalize();
973 t.parent->printGff(f_out, tracklabel);
974 t.parent->udata|=4;
975 }
976 t.printGff(f_out, tracklabel);
977 }
978 }//GFF/GTF output requested
979 } //valid transcript
980 } //for each rna
981 if (f_out && !mRNAOnly) {
982 //final pass through the non-transcripts, in case any of them were not printed
983 //TODO: order broken, these should be interspersed among the rnas in the correct order!
984 for (int m=0;m<gdata->gfs.Count();m++) {
985 GffObj& t=*(gdata->gfs[m]);
986 if ((t.udata&4)==0) { //never printed
987 t.udata|=4;
988 if (fmtGTF) t.printGtf(f_out, tracklabel);
989 else t.printGff(f_out, tracklabel);
990 }
991 } //for each non-transcript
992 }
993 } //for each genomic seq
994 }
995 if (f_repl && f_repl!=stdout) fclose(f_repl);
996 seqinfo.Clear();
997 //if (faseq!=NULL) delete faseq;
998 //if (gcdb!=NULL) delete gcdb;
999 GFREE(rfltGSeq);
1000 FRCLOSE(f_in);
1001 FWCLOSE(f_out);
1002 FWCLOSE(f_w);
1003 FWCLOSE(f_x);
1004 FWCLOSE(f_y);
1005 }