ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
(Generate patch)
# Line 1 | Line 1
1 + #include "gff_utils.h"
2   #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include "GFaSeqGet.h"
6 #include "gff.h"
3   #include <ctype.h>
4 < // do not care about cdb compression
5 < #ifdef ENABLE_COMPRESSION
6 < #undef ENABLE_COMPRESSION
7 < #endif
8 < #include "GCdbYank.h"
4 > // don't care about cdb compression
5 > //#ifdef ENABLE_COMPRESSION
6 > //#undef ENABLE_COMPRESSION
7 > //#endif
8 > //#include "GCdbYank.h"
9  
10   #define USAGE "Usage:\n\
11 < gffread <input_gff..> [-g <genomic_seq_fasta> | <dir>][-s <seq_info.fsize>]\n\
12 < [-o <outfile.gff3>] [-t <tname>] [-r [<chr>:]<start>..<end>] [-i <maxintron>]\n\
13 < [-CTVNMAFGRUVBHSZWTO] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tran_cds.fa>]\n\
14 < \n\
15 < Filters and/or converts GFF/GTF records.\n\
11 > gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12 > [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end>] \n\
13 > [-CTVNJMKQAFGRUVBHZWTOLE] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tr_cds.fa>]\n\
14 > [-i <maxintron>] \n\
15 > Filters and/or converts GFF3/GTF2 records.\n\
16 > <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17   \n\
18   Options:\n\
19 <  -g  full path to one fasta file with the genomic sequence\n\
20 <      for all input mappings, OR a path to the directory where\n\
21 <      genomic sequences can be found as single-fasta files\n\
22 <  -s  for mRNA/EST/protein mappings a tab-delimited file provides this info\n\
19 >  -g  full path to a multi-fasta file with the genomic sequences\n\
20 >      for all input mappings, OR a directory with single-fasta files\n\
21 >      (one per genomic sequence, with file names matching sequence names)\n\
22 >  -s  <seq_info.fsize> is a tab-delimited file providing this info\n\
23        for each of the mapped sequences:\n\
24        <seq-name> <seq-length> <seq-description>\n\
25 +      (useful for -A option with mRNA/EST/protein mappings)\n\
26    -i  discard transcripts having an intron larger than <maxintron>\n\
27    -r  only show transcripts crossing coordinate range <start>..<end>\n\
28 <      (on chromosome/contig <chr> if provided)\n\
28 >      (on chromosome/contig <chr>, strand <strand> if provided)\n\
29    -R  for -r option, discard all transcripts that are not fully \n\
30        contained within given range\n\
31    -U  discard single-exon transcripts\n\
32 <  -C  discard mRNAs that have no CDS feature\n\
33 <  -F  keep all attributes from last column of GFF/GTF\n\
32 >  -C  coding only: discard mRNAs that have no CDS feature\n\
33 >  -F  full GFF attribute preservation (all attributes are shown)\n\
34    -G  only parse additional exon attributes from the first exon\n\
35        and move them to the mRNA level (useful for GTF input)\n\
36 <  -A  use the description field from <seq_info.fsize> and add it as\n\
37 <      a descr=.. attribute to the mRNA and/or Gene record\n\
36 >  -A  use the description field from <seq_info.fsize> and add it\n\
37 >      as the value for a 'descr' attribute to the GFF record\n\
38    \n\
39 <  -O  process other (non-mRNA) GFF/GTF records (default is discard non-mRNA\n\
40 <      data). \n\
43 <      Note: non-mRNA records must have only one subfeature per parent feature\n\
39 >  -O  process also non-transcript GFF records (by default non-transcript\n\
40 >      records are ignored)\n\
41    -V  discard any mRNAs with CDS having in-frame stop codons\n\
42    -H  for -V option, check and adjust the starting CDS phase\n\
43        if the original phase leads to a translation with an \n\
44        in-frame stop codon\n\
45    -B  for -V option, single-exon transcripts are also checked on the\n\
46        opposite strand\n\
47 <  -N  only show multi-exon mRNAs if all their introns have the \n\
48 <      typical splice site consensus ( GT-AG, GC-AG or AT-AC )\n\
49 <  -M  discard any mRNAs that either lack initial START codon\n\
47 >  -N  discard multi-exon mRNAs that have any intron with a non-canonical\n\
48 >      splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49 >  -J  discard any mRNAs that either lack initial START codon\n\
50        or the terminal STOP codon, or have an in-frame stop codon\n\
51        (only print mRNAs with a fulll, valid CDS)\n\
52   \n\
53 <  -S  sort output GFF records by genomic sequence and start coordinate\n\
54 <      note that this option is automatically enabled by -g option\n\
55 <  -Z  merge close exons into a single exon (intron size<4)\n\
56 <  -t  use <trackname> in the second column of each GFF/GTF output line\n\
57 <  -w  write a fasta file with spliced exons for each mapping\n\
58 <  -x  write a fasta file with spliced CDS for each mapping\n\
59 <  -W  for -w and -x options, also write for each fasta record the exon \n\
53 >  -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54 >       transcripts (those with the same exact introns and fully contained)\n\
55 >  --cluster-only: same as --merge but without collapsing matching transcripts\n\
56 >  -K  for -M option: also collapse shorter, fully contained transcripts\n\
57 >      with fewer introns than the container\n\
58 >  -Q  for -M option, remove the containment restriction:\n\
59 >      (multi-exon transcripts will be collapsed if just their introns match,\n\
60 >      while single-exon transcripts can partially overlap (80%))\n\
61 > \n\
62 >  -E  expose (warn about) duplicate transcript IDs and other potential \n\
63 >      problems with the given GFF/GTF records\n\
64 >  -Z  merge close exons into a single exon (for intron size<4)\n\
65 >  -w  write a fasta file with spliced exons for each GFF transcript\n\
66 >  -x  write a fasta file with spliced CDS for each GFF transcript\n\
67 >  -W  for -w and -x options, also write for each fasta record the exon\n\
68        coordinates projected onto the spliced sequence\n\
69 <  -y  write a protein fasta file with the CDS translation for each mapping\n\
70 <  -o  the output gff3 file with the 'filtered' entries\n\
71 <  -T  output GTF format instead of GFF3\n\
72 <  \n\
73 < "
69 >  -y  write a protein fasta file with the translation of CDS for each record\n\
70 >  -L  Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
71 >  -m  <chr_replace> is a reference (genomic) sequence replacement table with\n\
72 >      this format:\n\
73 >      <original_ref_ID> <new_ref_ID>\n\
74 >      GFF records on reference sequences that are not found among the\n\
75 >      <original_ref_ID> entries in this file will be filtered out\n\
76 >  -o  the \"filtered\" GFF records will be written to <outfile.gff>\n\
77 >      (use -o- for printing to stdout)\n\
78 >  -t  use <trackname> in the second column of each GFF output line\n\
79 >  -T  -o option will output GTF format instead of GFF3\n\
80 > "
81 >
82 >
83 > class SeqInfo { //populated from the -s option of gffread
84 > public:
85 >  int len;
86 >  char* descr;
87 >  SeqInfo( int l, char* s) {
88 >   len=l;
89 >   if (s==NULL) {
90 >     descr=NULL;
91 >     }   else {
92 >     descr=Gstrdup(s);
93 >     }
94 >   }
95 >  ~SeqInfo() {
96 >   GFREE(descr);
97 >   }
98 > };
99 >
100 > class RefTran {
101 > public:
102 >   char* new_name;
103 >   RefTran(char *ns) {
104 >      new_name=NULL;
105 >      if (ns!=NULL)
106 >         new_name=Gstrdup(ns);
107 >      }
108 >   ~RefTran() {
109 >      GFREE(new_name);
110 >      }
111 > };
112  
113   FILE* ffasta=NULL;
114   FILE* f_in=NULL;
# Line 84 | Line 127
127  
128   bool fullCDSonly=false; // starts with START, ends with STOP codon
129   bool fullattr=false;
130 < bool sortByLoc=false; // if the GFF output should be sorted by location
131 < GStr gseqpath;
132 < bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
130 > //bool sortByLoc=false; // if the GFF output should be sorted by location
131 > bool ensembl_convert=false; //-L, assisst in converting Ensembl GTF to GFF3
132 >
133 >
134 > //GStr gseqpath;
135 > //GStr gcdbfa;
136 > //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
137 > //GFaSeqGet* faseq=NULL;
138 > //GCdbYank* gcdb=NULL;
139 > //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
140   bool fmtGTF=false;
91 int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
92 GFaSeqGet* faseq=NULL;
93 GCdbYank* gcdb=NULL;
94 GStr gcdbfa;
141   bool addDescr=false;
142   //bool protmap=false;
143   bool multiExon=false;
# Line 101 | Line 147
147   bool mergeCloseExons=false;
148   //range filter:
149   char* rfltGSeq=NULL;
150 + char rfltStrand=0;
151   uint rfltStart=0;
152   uint rfltEnd=MAX_UINT;
153   bool rfltWithin=false; //check for full containment within given range
154   bool noExonAttr=false;
108 class SeqInfo {
109 public:
110  int len;
111  char* descr;
112  SeqInfo( int l, char* s) {
113   len=l;
114   if (s==NULL) {
115     descr=NULL;
116     }   else {
117     descr=Gstrdup(s);
118     }
119   }
120  ~SeqInfo() {
121   GFREE(descr);
122   }
123 };
155  
156 + bool doCluster=false;
157 + bool doCollapseRedundant=false;
158  
159 < class GSpliceSite {
127 < public:
128 <  char nt[3];
129 <  GSpliceSite(const char* c, bool revc=false) {
130 <    nt[2]=0;
131 <    if (c==NULL) {
132 <      nt[0]=0;
133 <      nt[1]=0;
134 <      return;
135 <      }
136 <    if (revc) {
137 <      nt[0]=toupper(ntComplement(c[1]));
138 <      nt[1]=toupper(ntComplement(c[0]));
139 <      }
140 <    else {
141 <      nt[0]=toupper(c[0]);
142 <      nt[1]=toupper(c[1]);
143 <      }
144 <    }
145 <
146 <  GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
147 <    nt[2]=0;
148 <    if (intron==NULL || intronlen==0)
149 <       GError("Error: invalid intron or intron len for GSpliceSite()!\n");
150 <    const char* c=intron;
151 <    if (revc) {
152 <      if (!getAcceptor) c+=intronlen-2;
153 <      nt[0]=toupper(ntComplement(c[1]));
154 <      nt[1]=toupper(ntComplement(c[0]));
155 <      }
156 <    else { //on forward strand
157 <      if (getAcceptor) c+=intronlen-2;
158 <      nt[0]=toupper(c[0]);
159 <      nt[1]=toupper(c[1]);
160 <      }//forward strand
161 <    }
162 <
163 <  GSpliceSite(const char n1, const char n2) {
164 <    nt[2]=0;
165 <    nt[0]=toupper(n1);
166 <    nt[1]=toupper(n2);
167 <    }
168 <  bool canonicalDonor() {
169 <    return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
170 <    }
171 <  bool operator==(GSpliceSite& c) {
172 <    return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
173 <    }
174 <  bool operator==(GSpliceSite* c) {
175 <    return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
176 <    }
177 <  bool operator==(const char* c) {
178 <    //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
179 <    //assumes given const nucleotides are uppercase already!
180 <    return (nt[0]==c[0] && nt[1]==c[1]);
181 <    }
182 <  bool operator!=(const char* c) {
183 <    //assumes given const nucleotides are uppercase already!
184 <    return (nt[0]!=c[0] || nt[1]!=c[1]);
185 <    }
186 < };
159 > GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
160  
161   //hash with sequence info
162   GHash<SeqInfo> seqinfo;
163   GHash<int> isoCounter; //counts the valid isoforms
164 + GHash<RefTran> reftbl;
165 + GHash<GeneInfo> gene_ids;
166 +  //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
167  
168   bool debugMode=false;
169   bool verbose=false;
170  
171 + void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
172 +  GLineReader fr(f);
173 +  while (!fr.isEof()) {
174 +      char* line=fr.getLine();
175 +      if (line==NULL) break;
176 +      char* id=line;
177 +      char* lenstr=NULL;
178 +      char* text=NULL;
179 +      char* p=line;
180 +      while (*p!=0 && !isspace(*p)) p++;
181 +      if (*p==0) continue;
182 +      *p=0;p++;
183 +      while (*p==' ' || *p=='\t') p++;
184 +      if (*p==0) continue;
185 +      lenstr=p;
186 +      while (*p!=0 && !isspace(*p)) p++;
187 +      if (*p!=0) { *p=0;p++; }
188 +      while (*p==' ' || *p=='\t') p++;
189 +      if (*p!=0) text=p; //else text remains NULL
190 +      int len=0;
191 +      if (!parseInt(lenstr,len)) {
192 +         GMessage("Warning: could not parse sequence length: %s %s\n",
193 +                  id, lenstr);
194 +         continue;
195 +         }
196 +      // --- here we have finished parsing the line
197 +      si.Add(id, new SeqInfo(len,text));
198 +      } //while lines
199 + }
200 +
201 + void loadRefTable(FILE* f, GHash<RefTran>& rt) {
202 +  GLineReader fr(f);
203 +  char* line=NULL;
204 +  while ((line=fr.getLine())) {
205 +      char* orig_id=line;
206 +      char* p=line;
207 +      while (*p!=0 && !isspace(*p)) p++;
208 +      if (*p==0) continue;
209 +      *p=0;p++;//split the line here
210 +      while (*p==' ' || *p=='\t') p++;
211 +      if (*p==0) continue;
212 +      rt.Add(orig_id, new RefTran(p));
213 +      } //while lines
214 + }
215 +
216   char* getSeqDescr(char* seqid) {
217   static char charbuf[128];
218   if (seqinfo.Count()==0) return NULL;
# Line 231 | Line 252
252    return charbuf;
253   }
254  
255 < void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1) {
256 < if (seq==NULL) return;
257 < int len=(seqlen>0)?seqlen:strlen(seq);
237 < if (len<=0) return;
238 < if (!defline.is_empty())
239 <     fprintf(f, ">%s\n",defline.chars());
240 < int ilen=0;
241 < for (int i=0; i < len; i++, ilen++) {
242 <   if (ilen == 70) {
243 <     fputc('\n', f);
244 <     ilen = 0;
245 <     }
246 <   putc(seq[i], f);
247 <   } //for
248 < fputc('\n', f);
249 < }
250 <
251 < void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
252 <  GLineReader fr(f);
253 <  while (!fr.isEof()) {
254 <      char* line=fr.getLine();
255 <      if (line==NULL) break;
256 <      char* id=line;
257 <      char* lenstr=NULL;
258 <      char* text=NULL;
259 <      char* p=line;
260 <      while (*p!=0 && !isspace(*p)) p++;
261 <      if (*p==0) continue;
262 <      *p=0;p++;
263 <      while (*p==' ' || *p=='\t') p++;
264 <      if (*p==0) continue;
265 <      lenstr=p;
266 <      while (*p!=0 && !isspace(*p)) p++;
267 <      if (*p!=0) { *p=0;p++; }
268 <      while (*p==' ' || *p=='\t') p++;
269 <      if (*p!=0) text=p; //else text remains NULL
270 <      int len=0;
271 <      if (!parseInt(lenstr,len)) {
272 <         GMessage("Warning: could not parse sequence length: %s %s\n",
273 <                  id, lenstr);
274 <         continue;
275 <         }
276 <      // --- here we have finished parsing the line
277 <      si.Add(id, new SeqInfo(len,text));
278 <      } //while lines
255 > GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
256 >  if (gfasta.fastaPath==NULL) return NULL;
257 >  return gfasta.fetch(gffrec.gseq_id);
258   }
259  
260  
261 < GFaSeqGet* openFastaSeq(const char* gseqname) {
283 <  //this is only called when either we have a cidx or a directory in gseqpath
284 <  GFaSeqGet* r=NULL;
285 <  if (gcdb!=NULL) {
286 <     off_t rpos=gcdb->getRecordPos(gseqname);
287 <     if (rpos<0) // genomic sequence not found
288 <        GError("Error: cannot find genomic sequence '%s' in %s\n",gseqname, gseqpath.chars());
289 <     return new GFaSeqGet(gcdbfa.chars(),rpos, false);// WARNING: does not validate FASTA line-len uniformity!
290 <     }
291 <   else {
292 <     GStr s(gseqpath);
293 <     s.append('/');
294 <     s.append(gseqname);
295 <     if (debugMode) GMessage("[D] >> opening fasta file for genomic sequence %s..\n",gseqname);
296 <     if (fileExists(s.chars())==2) {
297 <         r=new GFaSeqGet(s.chars(),false); // WARNING: does not validate FASTA line-len uniformity!
298 <         r->loadall();
299 <         if (debugMode) GMessage("[D]    << loaded.\n");
300 <         return r;
301 <         }
302 <     s.append(".fa");
303 <     if (fileExists(s.chars())==2) {
304 <         r=new GFaSeqGet(s.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
305 <         r->loadall();
306 <         if (debugMode) GMessage("[D]    << loaded.\n");
307 <         return r;
308 <         }
309 <     s.append("sta");
310 <     if (fileExists(s.chars())==2) {
311 <         return new GFaSeqGet(s.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
312 <         r->loadall();
313 <         if (debugMode) GMessage("[D]    << loaded.\n");
314 <         return r;
315 <         }
316 <     GError("Error: cannot load genomic sequence %s from directory %s !\n",
317 <          gseqname, gseqpath.chars());
318 <     }
319 < return NULL;
320 < }
321 <
322 <
323 < GFaSeqGet* fastaSeqGet(GffObj& mrna) {
324 < if (gseqpath.is_empty()) return NULL;
325 < if (multiGSeq && mrna.gseq_id!=gseq_id) {
326 <     if (faseq!=NULL) {
327 <         delete faseq;
328 <         faseq=NULL;
329 <         }
330 <     faseq=openFastaSeq(mrna.getGSeqName());
331 <     gseq_id=mrna.gseq_id;
332 <     }
333 < return faseq;
334 < }
335 <
336 < void adjust_stopcodon(GffObj& mrna, int adj, GList<GSeg>* seglst=NULL) {
261 > int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
262   //adj>0 => extedn CDS,  adj<0 => shrink CDS
263   //when CDS is expanded, exons have to be checked too and
264   // expanded accordingly if they had the same boundary
265 <  if (mrna.strand=='-') {
266 <       if ((int)mrna.CDstart>adj) {
267 <           mrna.CDstart-=adj;
268 <             if (adj>0 && mrna.exons.First()->start>=mrna.CDstart) {
269 <                 mrna.exons.First()->start-=adj;
270 <                 mrna.start=mrna.exons.First()->start;
271 <                 mrna.covlen+=adj;
265 >  int realadj=0;
266 >  if (gffrec.strand=='-') {
267 >       if ((int)gffrec.CDstart>adj) {
268 >
269 >           gffrec.CDstart-=adj;
270 >           realadj=adj;
271 >           if (adj<0) { //restore
272 >              if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
273 >                 gffrec.exons.First()->start-=adj;
274 >                 gffrec.start=gffrec.exons.First()->start;
275 >                 gffrec.covlen+=adj;
276 >                 }
277 >              }
278 >           else if (gffrec.exons.First()->start>=gffrec.CDstart) {
279 >                 gffrec.exons.First()->start-=adj;
280 >                 gffrec.start=gffrec.exons.First()->start;
281 >                 gffrec.covlen+=adj;
282                   }
283               }
284            }
285          else {
286 <         mrna.CDend+=adj;
287 <         if (adj>0 && mrna.exons.Last()->end<=mrna.CDend) {
288 <             mrna.exons.Last()->end+=adj;
289 <             mrna.end=mrna.exons.Last()->end;
290 <             mrna.covlen+=adj;
286 >         realadj=adj;
287 >         gffrec.CDend+=adj;
288 >         if (adj<0) {//restore
289 >           if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
290 >                        gffrec.exons.Last()->end+=adj;
291 >                        gffrec.end=gffrec.exons.Last()->end;
292 >                        gffrec.covlen+=adj;
293 >                        }
294 >          }
295 >         else if (gffrec.exons.Last()->end<=gffrec.CDend) {
296 >             gffrec.exons.Last()->end+=adj;
297 >             gffrec.end=gffrec.exons.Last()->end;
298 >             gffrec.covlen+=adj;
299               }
300           }
301    if (seglst!=NULL) seglst->Last()->end+=adj;
302 +  return realadj;
303   }
304  
305 < void process_mRNA(GffObj& mrna) {
306 < GStr gene(mrna.getGene());
307 < GStr defline(mrna.getID());
308 < if (!gene.is_empty() && strcmp(gene.chars(),mrna.getID())!=0) {
309 <   int* isonum=isoCounter.Find(gene.chars());
310 <   if (isonum==NULL) {
311 <      isonum=new int(1);
312 <      isoCounter.Add(mrna.getGene(),isonum);
305 > bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
306 > //returns true if the transcript passed the filter
307 > char* gname=gffrec.getGeneName();
308 > if (gname==NULL) gname=gffrec.getGeneID();
309 > GStr defline(gffrec.getID());
310 > if (f_out && !fmtGTF) {
311 >     const char* tn=NULL;
312 >     if ((tn=gffrec.getAttr("transcript_name"))!=NULL) {
313 >        gffrec.addAttr("Name", tn);
314 >        gffrec.removeAttr("transcript_name");
315 >        }
316 >     }
317 > if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
318 >      const char* tn=gffrec.getTrackName();
319 >      gffrec.addAttr("type", tn);
320 >      //bool is_gene=false;
321 >      bool is_pseudo=false;
322 >      if (strcmp(tn, "protein_coding")==0 || gffrec.hasCDS())
323 >                gffrec.setFeatureName("mRNA");
324 >       else {
325 >          if (strcmp(tn, "processed_transcript")==0)
326 >              gffrec.setFeatureName("proc_RNA");
327 >            else {
328 >              //is_gene=endsWith(tn, "gene");
329 >              is_pseudo=strifind(tn, "pseudo");
330 >              if (is_pseudo) {
331 >                   gffrec.setFeatureName("pseudo_RNA");
332 >                   }
333 >                else if (endsWith(tn, "RNA")) {
334 >                   gffrec.setFeatureName(tn);
335 >                   } else gffrec.setFeatureName("misc_RNA");
336 >              }
337 >          }
338        }
339 <     else (*isonum)++;
340 <  defline.appendfmt(" gene=%s", gene.chars());
341 <  }
339 > if (gname && strcmp(gname, gffrec.getID())!=0) {
340 >   int* isonum=isoCounter.Find(gname);
341 >   if  (isonum==NULL) {
342 >       isonum=new int(1);
343 >       isoCounter.Add(gname,isonum);
344 >       }
345 >      else (*isonum)++;
346 >   defline.appendfmt(" gene=%s", gname);
347 >   }
348    int seqlen=0;
349 +
350    const char* tlabel=tracklabel;
351 <  if (tlabel==NULL) tlabel=mrna.getTrackName();
351 >  if (tlabel==NULL) tlabel=gffrec.getTrackName();
352    //defline.appendfmt(" track:%s",tlabel);
353    char* cdsnt = NULL;
354    char* cdsaa = NULL;
355    int aalen=0;
356 <  for (int i=1;i<mrna.exons.Count();i++) {
357 <     int ilen=mrna.exons[i]->start-mrna.exons[i-1]->end-1;
358 <     if (ilen>2000000)
356 >  for (int i=1;i<gffrec.exons.Count();i++) {
357 >     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
358 >     if (ilen>4000000)
359              GMessage("Warning: very large intron (%d) for transcript %s\n",
360 <                           ilen, mrna.getID());
360 >                           ilen, gffrec.getID());
361       if (ilen>maxintron) {
362 <         return;
362 >         return false;
363           }
364       }
365    GList<GSeg> seglst(false,true);
366 <  fastaSeqGet(mrna);
367 <  if (spliceCheck && mrna.exons.Count()>1) {
366 >  GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
367 >  if (spliceCheck && gffrec.exons.Count()>1) {
368      //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
369      if (faseq==NULL) GError("Error: no genomic sequence available!\n");
370 <    int glen=mrna.end-mrna.start+1;
371 <    const char* gseq=faseq->subseq(mrna.start, glen);
372 <    bool revcompl=(mrna.strand=='-');
370 >    int glen=gffrec.end-gffrec.start+1;
371 >    const char* gseq=faseq->subseq(gffrec.start, glen);
372 >    bool revcompl=(gffrec.strand=='-');
373      bool ssValid=true;
374 <    for (int e=1;e<mrna.exons.Count();e++) {
375 <      const char* intron=gseq+mrna.exons[e-1]->end+1-mrna.start;
376 <      //debug
401 <      //uint istart=mrna.exons[e-1]->end+1;
402 <      //uint iend=mrna.exons[e]->start-1;
403 <      int intronlen=mrna.exons[e]->start-mrna.exons[e-1]->end-1;
374 >    for (int e=1;e<gffrec.exons.Count();e++) {
375 >      const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
376 >      int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
377        GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
378        GSpliceSite    donorSite(intron,intronlen, false, revcompl);
379        //GMessage("%c intron %d-%d : %s .. %s\n",
380 <      //           mrna.strand, istart, iend, donorSite.nt, acceptorSite.nt);
380 >      //           gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
381        if (acceptorSite=="AG") { // GT-AG or GC-AG
382           if (!donorSite.canonicalDonor()) {
383              ssValid=false;break;
# Line 418 | Line 391
391      //GFREE(gseq);
392      if (!ssValid) {
393        if (verbose)
394 <         GMessage("Invalid splice sites found for '%s'\n",mrna.getID());
395 <      return; //don't print this one!
394 >         GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
395 >      return false; //don't print this one!
396        }
397      }
398  
399    bool trprint=true;
400 +  int stopCodonAdjust=0;
401    int mCDphase=0;
402 <  if (mrna.CDphase=='1' || mrna.CDphase=='2')
403 <      mCDphase = mrna.CDphase-'0';
402 >  bool hasStop=false;
403 >  if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
404 >      mCDphase = gffrec.CDphase-'0';
405    if (f_y!=NULL || f_x!=NULL || validCDSonly) {
406      if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
407      //if (protmap && fullCDSonly) {
408 <    //if (protmap && (fullCDSonly ||  (mrna.qlen>0 && mrna.qend==mrna.qlen))) {
408 >    //if (protmap && (fullCDSonly ||  (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
409 >    
410      if (validCDSonly) { //make sure the stop codon is always included
411 <      adjust_stopcodon(mrna,3);
411 >      //adjust_stopcodon(gffrec,3);
412 >      stopCodonAdjust=adjust_stopcodon(gffrec,3);
413        }
414      int strandNum=0;
415      int phaseNum=0;
416    CDS_CHECK:
417 <    cdsnt=mrna.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
417 >    cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
418      if (cdsnt==NULL) trprint=false;
419      if (validCDSonly) {
420         cdsaa=translateDNA(cdsnt, aalen, seqlen);
421         char* p=strchr(cdsaa,'.');
422 <       bool hasStop=false;
422 >       hasStop=false;
423         if (p!=NULL) {
424              if (p-cdsaa>=aalen-2) { //stop found as the last codon
425                      *p='0';//remove it
# Line 450 | Line 427
427                      if (aalen-2==p-cdsaa) {
428                        //previous to last codon is the stop codon
429                        //so correct the CDS stop accordingly
430 <                      adjust_stopcodon(mrna,-3, &seglst);
430 >                      adjust_stopcodon(gffrec,-3, &seglst);
431 >                      stopCodonAdjust=0; //clear artificial stop adjustment
432                        seqlen-=3;
433                        cdsnt[seqlen]=0;
434                        }
# Line 464 | Line 442
442           //in-frame stop codon found
443           if (altPhases && phaseNum<3) {
444              phaseNum++;
445 <            mrna.CDphase = '0'+((mCDphase+phaseNum)%3);
445 >            gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
446              GFREE(cdsaa);
447              goto CDS_CHECK;
448              }
449 <         if (mrna.exons.Count()==1 && bothStrands) {
449 >         if (gffrec.exons.Count()==1 && bothStrands) {
450              strandNum++;
451              phaseNum=0;
452              if (strandNum<2) {
453                 GFREE(cdsaa);
454 <               mrna.strand = (mrna.strand=='-') ? '+':'-';
454 >               gffrec.strand = (gffrec.strand=='-') ? '+':'-';
455                 goto CDS_CHECK; //repeat the CDS check for a different frame
456                 }
457              }
458 <         if (verbose) GMessage("In-frame STOP found for '%s'\n",mrna.getID());
458 >         if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
459           } //has in-frame STOP
460         if (fullCDSonly) {
461             if (!hasStop || cdsaa[0]!='M') trprint=false;
# Line 487 | Line 465
465    if (!trprint) {
466      GFREE(cdsnt);
467      GFREE(cdsaa);
468 <    return;
468 >    return false;
469      }
470 <  if (f_out!=NULL) {
471 <     if (fmtGTF) mrna.printGtf(f_out,tracklabel);
472 <            else mrna.printGff(f_out, tracklabel);
473 <     }
470 >  if (stopCodonAdjust>0 && !hasStop) {
471 >          //restore stop codon location
472 >          adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
473 >          if (cdsnt!=NULL && seqlen>0) {
474 >             seqlen-=stopCodonAdjust;
475 >             cdsnt[seqlen]=0;
476 >             }
477 >          if (cdsaa!=NULL) aalen--;
478 >          }
479 >
480    if (f_y!=NULL) { //CDS translation fasta output requested
481           //char*
482           if (cdsaa==NULL) { //translate now if not done before
483             cdsaa=translateDNA(cdsnt, aalen, seqlen);
484             }
485 <         if (fullattr && mrna.attrs!=NULL) {
485 >         if (fullattr && gffrec.attrs!=NULL) {
486               //append all attributes found for each transcripts
487 <              for (int i=0;i<mrna.attrs->Count();i++) {
487 >              for (int i=0;i<gffrec.attrs->Count();i++) {
488                  defline.append(" ");
489 <                defline.append(mrna.getAttrName(i));
489 >                defline.append(gffrec.getAttrName(i));
490                  defline.append("=");
491 <                defline.append(mrna.getAttrValue(i));
491 >                defline.append(gffrec.getAttrValue(i));
492                  }
493                }
494           printFasta(f_y, defline, cdsaa, aalen);
# Line 512 | Line 496
496     if (f_x!=NULL) { //CDS only
497           if (writeExonSegs) {
498                defline.append(" loc:");
499 <              defline.append(mrna.getGSeqName());
500 <              defline.appendfmt("(%c)",mrna.strand);
499 >              defline.append(gffrec.getGSeqName());
500 >              defline.appendfmt("(%c)",gffrec.strand);
501                //warning: not CDS coordinates are written here, but the exon ones
502 <              defline+=(int)mrna.start;
502 >              defline+=(int)gffrec.start;
503                defline+=(char)'-';
504 <              defline+=(int)mrna.end;
504 >              defline+=(int)gffrec.end;
505                // -- here these are CDS substring coordinates on the spliced sequence:
506                defline.append(" segs:");
507                for (int i=0;i<seglst.Count();i++) {
# Line 527 | Line 511
511                    defline+=(int)seglst[i]->end;
512                    }
513                }
514 <         if (fullattr && mrna.attrs!=NULL) {
514 >         if (fullattr && gffrec.attrs!=NULL) {
515               //append all attributes found for each transcript
516 <              for (int i=0;i<mrna.attrs->Count();i++) {
516 >              for (int i=0;i<gffrec.attrs->Count();i++) {
517                  defline.append(" ");
518 <                defline.append(mrna.getAttrName(i));
518 >                defline.append(gffrec.getAttrName(i));
519                  defline.append("=");
520 <                defline.append(mrna.getAttrValue(i));
520 >                defline.append(gffrec.getAttrValue(i));
521                  }
522                }
523           printFasta(f_x, defline, cdsnt, seqlen);
# Line 544 | Line 528
528      uint cds_start=0;
529      uint cds_end=0;
530      seglst.Clear();
531 <    char* exont=mrna.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
531 >    char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
532      if (exont!=NULL) {
533 <    if (mrna.CDstart>0) {
533 >    if (gffrec.CDstart>0) {
534          defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
535          }
536        if (writeExonSegs) {
537          defline.append(" loc:");
538 <        defline.append(mrna.getGSeqName());
538 >        defline.append(gffrec.getGSeqName());
539          defline+=(char)'|';
540 <        defline+=(int)mrna.start;
540 >        defline+=(int)gffrec.start;
541          defline+=(char)'-';
542 <        defline+=(int)mrna.end;
542 >        defline+=(int)gffrec.end;
543          defline+=(char)'|';
544 <        defline+=(char)mrna.strand;
544 >        defline+=(char)gffrec.strand;
545          defline.append(" exons:");
546 <        for (int i=0;i<mrna.exons.Count();i++) {
546 >        for (int i=0;i<gffrec.exons.Count();i++) {
547                  if (i>0) defline.append(",");
548 <                defline+=(int)mrna.exons[i]->start;
548 >                defline+=(int)gffrec.exons[i]->start;
549                  defline.append("-");
550 <                defline+=(int)mrna.exons[i]->end;
550 >                defline+=(int)gffrec.exons[i]->end;
551                  }
552          defline.append(" segs:");
553          for (int i=0;i<seglst.Count();i++) {
# Line 573 | Line 557
557              defline+=(int)seglst[i]->end;
558              }
559          }
560 <      if (fullattr && mrna.attrs!=NULL) {
560 >      if (fullattr && gffrec.attrs!=NULL) {
561         //append all attributes found for each transcripts
562 <        for (int i=0;i<mrna.attrs->Count();i++) {
562 >        for (int i=0;i<gffrec.attrs->Count();i++) {
563            defline.append(" ");
564 <          defline.append(mrna.getAttrName(i));
564 >          defline.append(gffrec.getAttrName(i));
565            defline.append("=");
566 <          defline.append(mrna.getAttrValue(i));
566 >          defline.append(gffrec.getAttrValue(i));
567            }
568          }
569        printFasta(f_w, defline, exont, seqlen);
570        GFREE(exont);
571        }
572      } //writing f_w (spliced exons)
573 < return;
573 > return true;
574   }
575  
576   void openfw(FILE* &f, GArgs& args, char opt) {
# Line 604 | Line 588
588   #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
589   #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
590  
591 + void printGff3Header(FILE* f, GArgs& args) {
592 +  fprintf(f, "# ");
593 +  args.printCmdLine(f);
594 +  fprintf(f, "##gff-version 3\n");
595 +  //for (int i=0;i<gseqdata.Count();i++) {
596 +  //
597 +  //}
598 +  }
599 +
600 + bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
601 +  if (reftbl.Count()>0) {
602 +        GStr refname(gffrec->getRefName());
603 +        RefTran* rt=reftbl.Find(refname.chars());
604 +        if (rt==NULL && refname[-2]=='.' && isdigit(refname[-1])) {
605 +           //try removing the version
606 +           refname.cut(-2);
607 +           //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
608 +           rt=reftbl.Find(refname.chars());
609 +           }
610 +        if (rt) {
611 +          gffrec->setRefName(rt->new_name);
612 +          }
613 +         else return false; //discard, ref seq not in the given translation table
614 +        }
615 +      if (mRNAOnly && gffrec->isDiscarded()) {
616 +       //discard generic "gene" or "locus" features with no other detailed subfeatures
617 +       //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
618 +       return false;
619 +       }
620 +      /*
621 +      if (gffrec->exons.Count()==0  && gffrec->children.Count()==0)) {
622 +        //a non-mRNA feature with no subfeatures
623 +        //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
624 +        //--this could be a single "pseudogene" entry or another genomic region without exons
625 +        //
626 +        gffrec->addExon(gffrec->start,gffrec->end);
627 +        }
628 +      */  
629 +     if (rfltGSeq!=NULL) { //filter by gseqName
630 +        if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
631 +          return false;
632 +          }
633 +        }
634 +     if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
635 +        return false;
636 +        }
637 +     //check coordinates
638 +     if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
639 +       if (rfltWithin) {
640 +         if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
641 +            return false;
642 +            }
643 +         }
644 +       else {
645 +         if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
646 +           return false;
647 +           }
648 +         }
649 +       }
650 +     if (multiExon && gffrec->exons.Count()<=1) {
651 +         return false;
652 +         }
653 +   if (wCDSonly && gffrec->CDstart==0) {
654 +         return false;
655 +         }
656 +   if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
657 +       //keep track of chr|gene_id data -- coordinate range
658 +       char* geneid=gffrec->getGeneID();
659 +       if (geneid!=NULL) {
660 +         GeneInfo* ginfo=gene_ids.Find(geneid);
661 +         if (ginfo==NULL) {//first time seeing this gene ID
662 +                   GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
663 +                   gene_ids.Add(geneid, geneinfo);
664 +                   if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
665 +                   }
666 +             else ginfo->update(gffrec);
667 +         }
668 +       }
669 +  return true;
670 + }
671 +
672 +
673   int main(int argc, char * const argv[]) {
674 < GArgs args(argc, argv, "vOUNHWCVMNSXTDAPRZFGg:i:r:s:t:a:b:o:w:x:y:MINCOV=MINPID=");
675 < int e;
676 < if ((e=args.isError())>0)
677 <    GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
678 < if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
679 < debugMode=(args.getOpt('D')!=NULL);
674 > GArgs args(argc, argv,
675 >   "debug;merge;cluster-only;help;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
676 > args.printError(USAGE, true);
677 > if (args.getOpt('h') || args.getOpt("help")) {
678 >    GMessage("%s",USAGE);
679 >    exit(1);
680 >    }
681 > debugMode=(args.getOpt("debug")!=NULL);
682   mRNAOnly=(args.getOpt('O')==NULL);
683 < sortByLoc=(args.getOpt('S')!=NULL);
683 > //sortByLoc=(args.getOpt('S')!=NULL);
684   addDescr=(args.getOpt('A')!=NULL);
685   verbose=(args.getOpt('v')!=NULL);
686   wCDSonly=(args.getOpt('C')!=NULL);
# Line 620 | Line 688
688   altPhases=(args.getOpt('H')!=NULL);
689   fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
690   bothStrands=(args.getOpt('B')!=NULL);
691 < fullCDSonly=(args.getOpt('M')!=NULL);
691 > fullCDSonly=(args.getOpt('J')!=NULL);
692   spliceCheck=(args.getOpt('N')!=NULL);
693 + bool matchAllIntrons=(args.getOpt('K')==NULL);
694 + bool fuzzSpan=(args.getOpt('Q')!=NULL);
695 + if (args.getOpt('M') || args.getOpt("merge")) {
696 +    doCluster=true;
697 +    doCollapseRedundant=true;
698 +    }
699 +   else {
700 +    if (!matchAllIntrons || fuzzSpan) {
701 +      GMessage("%s",USAGE);
702 +      GMessage("Error: -K or -Q options require -M/--merge option!\n");
703 +      exit(1);
704 +      }
705 +    }
706 + if (args.getOpt("cluster-only")) {
707 +    doCluster=true;
708 +    doCollapseRedundant=false;
709 +    if (!matchAllIntrons || fuzzSpan) {
710 +      GMessage("%s",USAGE);
711 +      GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
712 +      exit(1);
713 +      }
714 +    }
715   //protmap=(args.getOpt('P')!=NULL);
716   if (fullCDSonly) validCDSonly=true;
717 + if (verbose) {
718 +     fprintf(stderr, "Command line was:\n");
719 +     args.printCmdLine(stderr);
720 +     }
721 +
722   fullattr=(args.getOpt('F')!=NULL);
723   if (args.getOpt('G')==NULL)
724      noExonAttr=!fullattr;
# Line 631 | Line 726
726       noExonAttr=true;
727       fullattr=true;
728       }
729 + ensembl_convert=(args.getOpt('L')!=NULL);
730 + if (ensembl_convert) {
731 +    fullattr=true;
732 +    noExonAttr=false;
733 +    //sortByLoc=true;
734 +    }
735 +    
736   mergeCloseExons=(args.getOpt('Z')!=NULL);
737   multiExon=(args.getOpt('U')!=NULL);
738   writeExonSegs=(args.getOpt('W')!=NULL);
739   tracklabel=args.getOpt('t');
740 < gseqpath=args.getOpt('g');
741 < if (!gseqpath.is_empty()) {
742 <   sortByLoc=true;
641 <   int c=fileExists(gseqpath.chars());
642 <   if (c==1)
643 <       multiGSeq=true;//directory given
644 <   else if (c==2) { //single file given
645 <         if (gseqpath.rindex(".cidx")==gseqpath.length()-5) {
646 <            //cdbyank index given
647 <            gcdb=new GCdbYank(gseqpath.chars());
648 <            if (fileExists(gcdb->getDbName())) {
649 <               gcdbfa=gcdb->getDbName();
650 <               } else {
651 <               gcdbfa=gseqpath;
652 <               gcdbfa.chomp(".cidx");
653 <               if (!fileExists(gcdbfa.chars()))
654 <                   GError("Error: cannot locate the fasta file for index %s !\n",gseqpath.chars());
655 <               }
656 <            multiGSeq=true;
657 <            }
658 <         else { //must be just the one single fasta file
659 <           multiGSeq=false;
660 <           faseq=new GFaSeqGet(gseqpath.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
661 <           }
662 <         }//fasta file given
663 <     else GError("Error: cannot find genomic sequence path %s !\n",gseqpath.chars());
664 <   } // -g option
740 > GFastaDb gfasta(args.getOpt('g'));
741 > //if (gfasta.fastaPath!=NULL)
742 > //    sortByLoc=true; //enforce sorting by chromosome/contig
743   GStr s=args.getOpt('i');
744   if (!s.is_empty()) maxintron=s.asInt();
745 +
746 + FILE* f_repl=NULL;
747 + s=args.getOpt('d');
748 + if (!s.is_empty()) {
749 +   if (s=="-") f_repl=stdout;
750 +     else {
751 +       f_repl=fopen(s.chars(), "w");
752 +       if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
753 +       }
754 +   }
755 +
756   rfltWithin=(args.getOpt('R')!=NULL);
757   s=args.getOpt('r');
758   if (!s.is_empty()) {
759     s.trim();
760 +   if (s[0]=='+' || s[0]=='-') {
761 +     rfltStrand=s[0];
762 +     s.cut(0,1);
763 +     }
764     int isep=s.index(':');
765 <   if (isep>=0) { //gseq name given
766 <      if (isep>0) rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
765 >   if (isep>0) { //gseq name given
766 >      if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
767 >        isep--;
768 >        rfltStrand=s[isep];
769 >        s.cut(isep,1);
770 >        }
771 >      if (isep>0)
772 >          rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
773        s.cut(0,isep+1);
774        }
775     GStr gsend;
776 +   char slast=s[s.length()-1];
777 +   if (rfltStrand==0 && (slast=='+' || slast=='-')) {
778 +      s.chomp(slast);
779 +      rfltStrand=slast;
780 +      }
781     if (s.index("..")>=0) gsend=s.split("..");
782 <                   else gsend=s.split('-');
782 >                    else gsend=s.split('-');
783     if (!s.is_empty()) rfltStart=(uint)s.asInt();
784 <   if (!gsend.is_empty()) rfltEnd=(uint)gsend.asInt();
784 >   if (!gsend.is_empty()) {
785 >      rfltEnd=(uint)gsend.asInt();
786 >      if (rfltEnd==0) rfltEnd=MAX_UINT;
787 >      }
788 >  
789     } //gseq/range filtering
790   else {
791     if (rfltWithin)
792       GError("Error: option -R doesn't make sense without -r!\n");
793     }
794 + s=args.getOpt('m');
795 + if (!s.is_empty()) {
796 +   FILE* ft=fopen(s,"r");
797 +   if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
798 +   loadRefTable(ft, reftbl);
799 +   fclose(ft);
800 +   }
801   s=args.getOpt('s');
802   if (!s.is_empty()) {
803 <  FILE* fsize=fopen(s,"r");
804 <  if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
805 <  loadSeqInfo(fsize, seqinfo);
806 < }
807 < /*
808 < openfw(fgtfok, args, 'a');
694 < openfw(fgtfbad, args, 'b');
695 < */
803 >   FILE* fsize=fopen(s,"r");
804 >   if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
805 >   loadSeqInfo(fsize, seqinfo);
806 >   fclose(fsize);
807 >   }
808 >
809   openfw(f_out, args, 'o');
810   //if (f_out==NULL) f_out=stdout;
811 < if (gseqpath.is_empty() && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
811 > if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
812    GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
813  
814   openfw(f_w, args, 'w');
# Line 703 | Line 816
816   openfw(f_y, args, 'y');
817   if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
818   //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
819 < GStr infile;
820 < if (args.startNonOpt()) {
821 <        infile=args.nextNonOpt();
822 <        //GMessage("Given file: %s\n",infile.chars());
823 <        }
824 < if (!infile.is_empty()) {
825 <    f_in=fopen(infile, "r");
826 <    if (f_in==NULL)
827 <       GError("Cannot open input file %s!\n",infile.chars());
828 <    }
829 <  else
830 <    f_in=stdin;
831 <
719 < //GffReader* gfreader=new GffReader(f_in,true);
720 < GffReader* gfreader=new GffReader(f_in, mRNAOnly, sortByLoc);
721 < gfreader->readAll(fullattr, mergeCloseExons, noExonAttr);
722 < if (debugMode) GMessage("[D] gff data loaded, now processing each entry\n");
723 < //GList<GffObj> mrnas(true,true,false);
724 < for (int m=0;m<gfreader->gflst.Count();m++) {
725 <   GffObj* mrna=gfreader->gflst[m];
726 <   if (mrna->exons.Count()==0) {
727 <      //a non-mRNA feature with no subfeatures
728 <      //yet just so we get some sequence functions working, add the dummy "exon" here
729 <      mrna->addExon(mrna->start,mrna->end);
730 <      }
731 <   if (rfltGSeq!=NULL) { //filter by gseqName
732 <      if (strcmp(mrna->getGSeqName(),rfltGSeq)!=0) {
733 <        delete mrna;
734 <        gfreader->gflst.Forget(m);
735 <        continue;
736 <        }
737 <      }
738 <   //check coordinates
739 <   if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
740 <     if (rfltWithin) {
741 <       if (mrna->start<rfltStart || mrna->end>rfltEnd) {
742 <          delete mrna;
743 <          gfreader->gflst.Forget(m);
744 <          continue;
819 >
820 > int numfiles = args.startNonOpt();
821 > //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
822 > int out_counter=0; //number of records printed
823 > while (true) {
824 >   GStr infile;
825 >   if (numfiles) {
826 >          infile=args.nextNonOpt();
827 >          if (infile.is_empty()) break;
828 >          if (infile=="-") { f_in=stdin; infile="stdin"; }
829 >               else
830 >                 if ((f_in=fopen(infile, "r"))==NULL)
831 >                    GError("Error: cannot open input file %s!\n",infile.chars());
832            }
833 <       }
834 <     else {
835 <       if (mrna->start>rfltEnd || mrna->end<rfltStart) {
836 <         delete mrna;
837 <         gfreader->gflst.Forget(m);
838 <         continue;
833 >        else
834 >          infile="-";
835 >   GffLoader gffloader(infile.chars());
836 >   gffloader.transcriptsOnly=mRNAOnly;
837 >   gffloader.fullAttributes=fullattr;
838 >   gffloader.noExonAttrs=noExonAttr;
839 >   gffloader.mergeCloseExons=mergeCloseExons;
840 >   gffloader.showWarnings=(args.getOpt('E')!=NULL);
841 >   gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
842 >                             matchAllIntrons, fuzzSpan);
843 >   if (doCluster)
844 >     collectLocusData(g_data);
845 >   if (numfiles==0) break;
846 >   }
847 >  
848 > GStr loctrack("gffcl");
849 > if (tracklabel) loctrack=tracklabel;
850 > g_data.setSorted(&gseqCmpName);
851 > if (doCluster) {
852 >   //grouped in loci
853 >   for (int g=0;g<g_data.Count();g++) {
854 >     GenomicSeqData* gdata=g_data[g];
855 >     for (int l=0;l<gdata->loci.Count();l++) {
856 >       GffLocus& loc=*(gdata->loci[l]);
857 >       //check all non-replaced transcripts in this locus:
858 >       int numvalid=0;
859 >       int idxfirstvalid=-1;
860 >       for (int i=0;i<loc.rnas.Count();i++) {
861 >         GffObj& t=*(loc.rnas[i]);
862 >         GTData* tdata=(GTData*)(t.uptr);
863 >         if (tdata->replaced_by!=NULL) {
864 >            if (f_repl && (t.udata & 8)==0) {
865 >               //t.udata|=8;
866 >               fprintf(f_repl, "%s", t.getID());
867 >               GTData* rby=tdata;
868 >               while (rby->replaced_by!=NULL) {
869 >                  fprintf(f_repl," => %s", rby->replaced_by->getID());
870 >                  rby->rna->udata|=8;
871 >                  rby=(GTData*)(rby->replaced_by->uptr);
872 >                  }
873 >               fprintf(f_repl, "\n");
874 >               }
875 >            continue;
876 >            }
877 >         if (process_transcript(gfasta, t)) {
878 >             t.udata|=4; //tag it as valid
879 >             numvalid++;
880 >             if (idxfirstvalid<0) idxfirstvalid=i;
881 >             }
882           }
883 +      
884 +       if (f_out && numvalid>0) {
885 +         GStr locname("RLOC_");
886 +         locname.appendfmt("%08d",loc.locus_num);
887 +         if (!fmtGTF) {
888 +           if (out_counter==0)
889 +              printGff3Header(f_out, args);
890 +           fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
891 +                    loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
892 +                     locname.chars(), locname.chars());
893 +           //const char* loc_gname=loc.getGeneName();
894 +           if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
895 +              fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
896 +              for (int i=1;i<loc.gene_names.Count();i++) {
897 +                fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
898 +                }
899 +              }
900 +           if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
901 +              fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
902 +              for (int i=1;i<loc.gene_ids.Count();i++) {
903 +                fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
904 +                }
905 +              }
906 +           fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
907 +           for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
908 +              fprintf(f_out, ",%s",loc.rnas[i]->getID());
909 +              }
910 +           fprintf(f_out, "\n");
911 +           }
912 +         //now print all valid, non-replaced transcripts in this locus:
913 +         for (int i=0;i<loc.rnas.Count();i++) {
914 +           GffObj& t=*(loc.rnas[i]);
915 +           GTData* tdata=(GTData*)(t.uptr);
916 +           if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
917 +           t.addAttr("locus", locname.chars());
918 +           out_counter++;
919 +           if (fmtGTF) t.printGtf(f_out, tracklabel);
920 +               else {
921 +                //print the parent first, if any
922 +                if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
923 +                    GTData* pdata=(GTData*)(t.parent->uptr);
924 +                    if (pdata->geneinfo!=NULL)
925 +                         pdata->geneinfo->finalize();
926 +                    t.parent->addAttr("locus", locname.chars());
927 +                    t.parent->printGff(f_out, tracklabel);
928 +                    t.parent->udata|=4;
929 +                    }
930 +                t.printGff(f_out, tracklabel);
931 +                }
932 +            }
933 +          } //have valid transcripts to print
934 +       }//for each locus
935 +     if (f_out && !mRNAOnly) {
936 +       //final pass through the non-transcripts, in case any of them were not printed
937 +       //TODO: order broken, these should be interspersed among the rnas in the correct order!
938 +       for (int m=0;m<gdata->gfs.Count();m++) {
939 +         GffObj& t=*(gdata->gfs[m]);
940 +         if ((t.udata&4)==0) { //never printed
941 +           t.udata|=4;
942 +           if (fmtGTF) t.printGtf(f_out, tracklabel);
943 +              else t.printGff(f_out, tracklabel);
944 +           }
945 +         } //for each non-transcript
946         }
947 <     }
948 <   if (multiExon && mrna->exons.Count()==1) {
949 <       delete mrna;
950 <       gfreader->gflst.Forget(m);
951 <       continue;
952 <       }
953 <   if (wCDSonly && mrna->CDstart==0) {
954 <       delete mrna;
955 <       gfreader->gflst.Forget(m);
956 <       continue;
957 <       }
958 <   /* if (validCDSonly && mrna->hasErrors) {
959 <       delete mrna;
960 <       gfreader->gflst.Forget(m);
961 <       continue;
947 >     } //for each genomic sequence
948 >   }
949 >  else {
950 >   //not grouped into loci, print the rnas with their parents, if any
951 >   int numvalid=0;
952 >   for (int g=0;g<g_data.Count();g++) {
953 >     GenomicSeqData* gdata=g_data[g];
954 >     for (int m=0;m<gdata->rnas.Count();m++) {
955 >        GffObj& t=*(gdata->rnas[m]);
956 >        GTData* tdata=(GTData*)(t.uptr);
957 >        if (tdata->replaced_by!=NULL) continue;
958 >        if (process_transcript(gfasta, t)) {
959 >           t.udata|=4; //tag it as valid
960 >           numvalid++;
961 >           if (f_out) {
962 >             if (tdata->geneinfo) tdata->geneinfo->finalize();
963 >             out_counter++;
964 >             if (fmtGTF) t.printGtf(f_out, tracklabel);
965 >               else {
966 >                if (out_counter==1)
967 >                  printGff3Header(f_out, args);
968 >                //print the parent first, if any
969 >                if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
970 >                    GTData* pdata=(GTData*)(t.parent->uptr);
971 >                    if (pdata->geneinfo!=NULL)
972 >                         pdata->geneinfo->finalize();
973 >                    t.parent->printGff(f_out, tracklabel);
974 >                    t.parent->udata|=4;
975 >                    }
976 >                t.printGff(f_out, tracklabel);
977 >                }
978 >             }//GFF/GTF output requested
979 >           } //valid transcript
980 >        } //for each rna
981 >     if (f_out && !mRNAOnly) {
982 >       //final pass through the non-transcripts, in case any of them were not printed
983 >       //TODO: order broken, these should be interspersed among the rnas in the correct order!
984 >       for (int m=0;m<gdata->gfs.Count();m++) {
985 >         GffObj& t=*(gdata->gfs[m]);
986 >         if ((t.udata&4)==0) { //never printed
987 >           t.udata|=4;
988 >           if (fmtGTF) t.printGtf(f_out, tracklabel);
989 >              else t.printGff(f_out, tracklabel);
990 >           }
991 >         } //for each non-transcript
992         }
993 <   */
771 <   process_mRNA(*mrna);
772 <   delete mrna;
773 <   gfreader->gflst.Forget(m);
993 >     } //for each genomic seq
994     }
995 < // M_END:
776 < delete gfreader;
995 > if (f_repl && f_repl!=stdout) fclose(f_repl);
996   seqinfo.Clear();
997 < if (faseq!=NULL) delete faseq;
998 < if (gcdb!=NULL) delete gcdb;
997 > //if (faseq!=NULL) delete faseq;
998 > //if (gcdb!=NULL) delete gcdb;
999   GFREE(rfltGSeq);
1000   FRCLOSE(f_in);
1001   FWCLOSE(f_out);

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines