ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gffmanip
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 18716 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     gffmanip [-v] [-o outrecords.txt] [{-r|-c|-e} <rangedata>] [-C] [-t <track>]\
8     [-i [<attr>:]<ids.txt>] [-a <attr>=<value>[,...]] [-f <feature1>[,..]] \
9     [-k <attrlist>] [-l [{-T|-L|-l <attrlist>}] [<gff\/gtf_file>]
10    
11     Filtering options:
12     -C output only those transcripts having CDS features
13     -f output only the features matching any of the strings in the given
14     comma-delimited list (e.g. -f 'mRNA,exon,CDS')
15     -v show records that do NOT match the given filtering options (like grep's -v)
16     -r show only records overlapping any of the intervals given in <rangedata>,
17     which has the format:
18     [<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]]
19     -c same as -r, but only shows records fully contained in the interval(s) given
20     -e outputs records whose exon\/CDS segments overlap the interval(s) given
21     -i only shows records whose ID match any of the entries in the file <ids.txt>;
22     unless another attribute is given as a prefix (<attr>:), the ID attribute
23     is used for GFF3 input, and transcript_id for GTF
24     -a only shows records having the <attr> attribute with the value <value>;
25     multiple attribute\/value pairs can be given (comma delimited) and a record
26     will be shown if there is at least one attribute match
27    
28     Output options:
29     -K keep all attributes ; by default only the core GFF\/GTF attributes are
30     shown (transcript_id, gene_id, gene_name, ID, Parent, Name, gene_name)
31     -k keep only the specified non-core attributes in the output records
32     -T output GTF format (default is GFF3)
33     -L output only the list of transcript IDs found in the file
34     (and matching the filtering options)
35     -l output the value of all attributes in <attrlist> for each record
36     (tab delimited if more than one attribute is given)
37     -t replace the 2nd column with the given <track> text
38     /;
39     umask 0002;
40     getopts('vTKACLf:r:l:c:e:i:a:t:k:o:') || die($usage."\n");
41     die("$usage\n") unless @ARGV>0;
42     die("Only one input file is expected (or '-' for stdin)!\n") unless @ARGV==1;
43     my $input_gff=$ARGV[0];
44     my $outfile=$Getopt::Std::opt_o;
45     my %ignoredFeatures;
46     @ignoredFeatures{qw(intron cds_start cds_stop start_codon stop_codon start stop cdsstart cdsstop)}=();
47     my %oattrs; #hash with attrs to print
48     @oattrs{qw(ID transcript_id Parent gene_id gene gene_name)}=();
49     my %attrflt; # attribute=>value filter
50     my %idflt; # hash with IDs to keep
51     my %atab; # hash with attribute names whose values will be listed as tab delimited
52     my @atab_cols;
53     if ($outfile) {
54     open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
55     select(OUTF);
56     }
57     # --
58     my ($fltinvert, $printGTF, $out_track)=
59     ($Getopt::Std::opt_v, $Getopt::Std::opt_T, $Getopt::Std::opt_t);
60     my $only_ifCDS=$Getopt::Std::opt_v;
61     my ($idfile, $avflt, $featlist, $oattrlist)=
62     ($Getopt::Std::opt_i, $Getopt::Std::opt_a, $Getopt::Std::opt_f, $Getopt::Std::opt_k);
63     my $keepAll=$Getopt::Std::opt_K || $Getopt::Std::opt_A || (lc($oattrlist) eq 'a' || lc($oattrlist) eq 'all');
64     my ($r_range, $c_range, $e_range)=
65     ($Getopt::Std::opt_r, $Getopt::Std::opt_c, $Getopt::Std::opt_e);
66     my $check_range;
67     if ($r_range) {
68     $check_range=$r_range;
69     die("Error: options -r, -c and -e are mutually exclusive!\n") if ($c_range || $e_range);
70     }
71     elsif ($c_range) {
72     $check_range=$c_range;
73     die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $e_range);
74     }
75     elsif ($e_range) {
76     $check_range=$e_range;
77     die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $c_range);
78     }
79     my ($flt_chr, $flt_strand);
80     my @flt_intv;
81     if ($check_range) {
82     ($flt_chr, my $ck_rlst)=split(/\:/,$check_range);
83     #die("$usage Incorrect format for the interval list!\n") unless $flt_chr && $ck_rlst;
84     my $flt_strand=substr($flt_chr,0,1);
85     if ($flt_strand eq '-' || $flt_strand eq '+') {
86     substr($flt_chr,0,1)='';
87     }
88     else {
89     $flt_strand=undef;
90     my $e=chop($flt_chr);
91     if ($e eq '-' || $e eq '+') {
92     $flt_strand=$e;
93     }
94     else { $flt_chr.=$e; }
95     } #no strand
96     my @flt_intv;
97     if ($ck_rlst) {
98     my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$ck_rlst));
99     foreach my $d (@rdata) {
100     ($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1];
101     }
102     @flt_intv = sort { $a->[0] <=> $b->[0] } @rdata;
103     }
104     }
105     #my $range=$flt_chr.':'.$ck_rex[0]->[0].'-'.$ex[-1]->[1];
106    
107     $oattrlist='' if $keepAll;
108     my $tab_attrs=$Getopt::Std::opt_l;
109     $tab_attrs='ID' if $Getopt::Std::opt_L;
110     if ($tab_attrs) {
111     @atab_cols=split(/\,/, $tab_attrs);
112     @atab{@atab_cols}=();
113     }
114     my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id
115     if ($idfile) {
116     unless (-f $idfile) {
117     my ($a,$f)=(split(/\:/,$idfile));
118     ($idfile, $idfileattr)=($f,$a) if $f;
119     }
120     my $idf;
121     if ($idfile eq '-') {
122     open($idf, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n");
123     }
124     else {
125     open($idf, $idfile) || die("Error: cannot open $idfile $!\n");
126     }
127     while (<$idf>) {
128     my ($id)=(m/(\S+)/);
129     $idflt{$id}=1 if $id;
130     }
131     close($idf);
132     }
133     if ($avflt) {
134     my @avl=split(/\,/, $avflt);
135     foreach my $avpair (@avl) {
136     my ($attr, $value)=split(/\s*=\s*/,$avpair,2);
137     $value=~s/[" ]+$//;$value=~s/^[" ]+//;
138     $attrflt{$attr}=$value;
139     }
140     }
141     my %featflt; # feature list filter -- if $featlist was given
142     if ($featlist) {
143     #add to the list of attributes to be kept
144     my @fl=split(/\,/, $featlist);
145     @featflt{@fl}=(); # this is actually an input filter
146     }
147     if ($oattrlist) {
148     #add to the list of attributes to be kept
149     my @al=split(/\,/, $oattrlist);
150     @oattrs{@al}=();
151     }
152     # ----
153     my %gffrecs; # recID => [ chr, strand, feat_type, \%attrs, fstart, fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ]
154     # 0 1 2 3 4 5 6 7 8 9 10 , 11, 12
155     # recID has the prefix '<chr>|' which should be removed before output
156     my $gffh;
157     if ($input_gff eq '-') {
158     open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN $!\n");
159     }
160     else {
161     open($gffh, $input_gff) || die("Error opening file $input_gff $!\n");
162     }
163    
164     loadGff($gffh, \%gffrecs);
165     my @sorted_recs=sort sortByLoc keys(%gffrecs);
166     processGffRecs(\%gffrecs, \@sorted_recs);
167    
168     # --
169     if ($outfile) {
170     select(STDOUT);
171     close(OUTF);
172     }
173    
174     #************ Subroutines **************
175     sub sortByLoc {
176     my $da=$gffrecs{$a};
177     my $db=$gffrecs{$b};
178     if ($$da[0] eq $$db[0]) {
179     return ($$da[4]==$$db[4]) ? $$da[5] <=> $$db[5] : $$da[4] <=> $$db[4] ;
180     }
181     else { return $$da[0] cmp $$db[0] ; }
182     }
183    
184     sub checkOvlExons {
185     my ($a, $b, $rx)=@_;
186     return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain
187     foreach my $x (@$rx) {
188     return 1 if ($a<=$$x[1] && $b>=$$x[0]);
189     return 0 if $b<$$x[0];
190     }
191     }
192    
193     sub checkWithinExons {
194     my ($a, $b, $rx)=@_; #checks if interval $a-$b is contained in any @$rx interval
195     return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain
196     foreach my $x (@$rx) {
197     return 1 if ($a>=$$x[0] && $b<=$$x[1]);
198     return 0 if $b<$$x[0];
199     }
200     }
201    
202    
203     sub loadGff {
204     my ($gffhandle, $recs)=@_;
205     while (<$gffhandle>) {
206     next if m/^\s*#/;
207     chomp;
208     my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
209     next unless $fstart>1 && $lnum;
210     $track=$out_track if $out_track;
211     next if exists($ignoredFeatures{lc($f)});
212     $f='exon' if $f=~m/exon/i;
213     $f='CDS' if $f=~m/^cds$/i;
214     my $rejected;
215     $rejected=1 if ($featlist && not exists($featflt{$f}));
216     #next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
217     my $gff3_ID;
218     my $gff3_Parent;
219     my ($gname,$tdescr);
220     my %attrs;
221     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
222     #$lnum=~s/"([^"]+)\;([^"]+)"/"$1.$2"/g; #protect ; within text between quotes
223     my @av=split(/\s*\;\s*/,$lnum);
224     ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
225     ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
226     my $isGFF3 = ($gff3_ID || $gff3_Parent);
227     if ($isGFF3) { # GFF format
228     $gff3_ID=~tr/"//d; #"
229     $gff3_Parent=~tr/"//d; #"
230     $gff3_Parent='' if ($f =~m/RNA/); # we really don't care about parent for RNA features
231     if ($gff3_ID && !$gff3_Parent) { #top level feature
232     foreach my $a (@av) {
233     my ($attr, $value)=split(/\s*=\s*/,$a,2);
234     $value=~s/[" ]+$//;$value=~s/^[" ]+//;
235     $attrs{$attr}=$value;
236     }
237     if ($f=~m/RNA/i || $f=~/gene/) {
238     # try to parse the description, if any
239    
240     if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
241     $tdescr=$1;
242     }
243     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
244     $tdescr=$1;
245     }
246     if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
247     $gname=$1;
248     }
249     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
250     $gname=$1;
251     }
252     $tdescr='' if ($tdescr eq $gname);
253     $gname='' if $gname eq $gff3_ID;
254     }
255     die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs->{"$chr|$gff3_ID"}));
256     my $recID="$chr|$gff3_ID";
257     $recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore];
258     next;
259     } # parent/top-level feature
260     } #GFF
261     else { #GTF format
262     if ($f eq 'transcript') { # GTF with parent 'transcript' feature
263     foreach my $a (@av) {
264     my ($attr, $value)=split(/\s+"/,$a,2); #"
265     $value=~s/[" ]+$//;
266     if ($attr eq 'transcript_id') {
267     $attr='ID';
268     }
269     elsif ($attr eq 'gene_id') {
270     $attr='Name';
271     }
272     $attrs{$attr}=$value;
273     }
274     my $recID=$attrs{'ID'};
275     die("Error: cannot find transcript_id for GTF 'transcript' line:\n$_\n") unless $recID;
276     die("Error: duplicate feature $recID on $chr\n") if (exists($recs->{"$chr|$recID"}));
277     $recID=$chr.'|'.$recID;
278     $recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore ];
279     next;
280     } # parent 'transcript' feature in GTF
281     }
282     # -------------- exon/CDS line here:
283     next if ($featlist && !exists($featflt{$f}));
284     my $recID;
285     ($gname, $tdescr)=();
286     if ($isGFF3) {
287     $recID=$gff3_Parent;
288     }
289     elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
290     $recID=$1;
291     $recID=~tr/"//d; #"
292     }
293     elsif ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
294     $recID=$chr.'.jsm.'.$lnum;
295     $gff3_Parent=$recID;
296     $isGFF3=1;
297     $f='CDS';
298     }
299     else {
300     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
301     }
302     if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
303     $gname=$1;
304     $gname=~tr/"//d; #"
305     }
306     $tdescr='' if index($recID, $tdescr)>=0;
307     $gname='' if index($recID, $gname)>=0;
308     $recID=$chr.'|'.$recID;
309     my $ld = $recs->{$recID};
310     if ($ld) { #existing entry
311     my $i=($f eq 'CDS') ? 7 : 6;
312     my ($lstart, $lend)=($$ld[4], $$ld[5]);
313     $$ld[4]=$fstart if $fstart<$lstart;
314     $$ld[5]=$fend if $fend>$lend;
315     push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame]);
316     if ($f ne 'CDS') {
317     if ($$ld[11] && $$ld[11] ne $f) {
318     die("Error: multiple non-CDS subfeatures found for $recID ($$ld[11], $f)\n");
319     }
320     $$ld[11]=$f;
321     }
322     }
323     else { # first time seeing this locus/gene
324     # get the attributes from this first exon line
325     if ($gff3_Parent) { #has GFF3 Parent
326     foreach my $a (@av) {
327     my ($attr, $value)=split(/\s*=\s*/,$a,2); #"
328     $attr='ID' if $attr eq 'Parent';
329     $value=~s/[" ]+$//;$value=~s/^[" ]+//;
330     }
331     }
332     else { # GTF
333     foreach my $a (@av) {
334     my ($attr, $value)=split(/\s+"/,$a,2); #"
335     $value=~s/[" ]+$//;
336     if ($attr eq 'transcript_id') {
337     $attr='ID';
338     }
339     elsif ($attr eq 'gene_id') {
340     $attr='Name';
341     }
342     $attrs{$attr}=$value unless $attr=~m/^exon/;;
343     }
344     }
345     $recs->{$recID} = ($f eq 'CDS') ?
346     [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [[$fstart, $fend, $fscore, $frame]], $isGFF3, $rejected, $track, '','.' ] :
347     [$chr, $strand, $f, {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame]], [], $isGFF3, $rejected, $track, $f,'.' ] ;
348     # 0 1 2 3 4 5 6(exons) 7 (CDS) 8 9 10
349     }
350     }
351     close($gffh);
352     }
353    
354     sub processGffRecs {
355     #return if keys(%recs)==0;
356     my ($recs, $rlist)=@_;
357     my @recs_keys;
358     unless ($rlist) {
359     @recs_keys=keys(%$recs);
360     $rlist=\@recs_keys;
361     }
362     foreach my $recid (@$rlist) {
363     my $td=$$recs{$recid};
364     # 0 1 2 3 4 5 6 7 8 9 10 11 12
365     my ($chr, $strand, $feature, $attrs, $fstart, $fend, $er, $cr, $isGFF3, $rej, $track, $subf, $fscore) = @$td;
366     next if ($rej && !$fltinvert);
367     next if ($fltinvert && $featlist && !$rej);
368     # my ($mstart,$mend)=($fstart, $fend);
369     my $CDSonly=0; # set to 1 if only CDS segments were given
370     my $hasCDS=0;
371     my @ex;
372     my @cds;
373     #some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
374     if (@$er<1 && @$cr>0) {
375     @ex = sort { $a->[0] <=> $b->[0] } @$cr;
376     @cds=@ex;
377     $CDSonly=1;
378     $hasCDS=1;
379     }
380     else {
381     @ex = sort { $a->[0] <=> $b->[0] } @$er;
382     if (@$cr>0) { # sort cds segments too
383     @cds= sort { $a->[0] <=> $b->[0] } @$cr;
384     $hasCDS=1;
385     }
386     }
387     # --------------
388     # get the more accurate version of the start-end coords for the feature
389     my $covlen=0;
390     # map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
391     my ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
392     my $gffid=$recid;
393     substr($gffid, 0, length($chr)+1)='';
394     die("Error: gffid ($gffid) not matching attrs\{ID\}(".$attrs-{'ID'}.")!\n")
395     unless ($gffid eq $attrs->{'ID'});
396     my $doprint=1;
397     #check all the filters
398     $doprint=0 if ($only_ifCDS && $hasCDS==0);
399     if ($doprint && $idfile) {
400     if ($idfileattr) {
401     $doprint=0 unless exists($idflt{$attrs->{$idfileattr}});
402     }
403     else {
404     $doprint=0 unless exists($idflt{$gffid});
405     }
406     }
407     if ($doprint && $avflt) {
408     my $avfound=0;
409     foreach my $a (keys(%attrflt)) {
410     if ($attrflt{$a} eq $attrs->{$a}) {
411     $avfound=1;
412     last;
413     }
414     }
415     $doprint=0 unless $avfound;
416     }
417    
418    
419     if ($doprint && $flt_chr && $flt_chr ne $chr) {
420     $doprint=0;
421     }
422     if ($doprint) {
423     if ($flt_strand && $flt_strand ne $strand) {
424     $doprint=0;
425     }
426     }
427     if ($doprint && @flt_intv>0) {
428     if ($r_range) { #t span overlap any range intervals
429     $doprint=1 if checkOvlExons($fstart, $fend, \@flt_intv);
430     }
431     elsif ($c_range) { #t span contained in any of the intervals
432     $doprint=1 if checkWithinExons($fstart, $fend, \@flt_intv);
433     }
434     elsif ($e_range) { # each exon checked for overlap of any interval
435     for my $ed (@ex) {
436     if (checkOvlExons($$ed[0], $$ed[1], \@flt_intv)) {
437     $doprint=1;
438     last;
439     }
440     }
441     }
442     }
443    
444     $doprint = ! $doprint if $fltinvert;
445     next unless $doprint;
446     #filter passed, print output
447     if ($tab_attrs) {
448     my @od;
449     foreach my $a (@atab_cols) {
450     if (uc($a) eq 'ID') {
451     push(@od, $gffid);
452     next;
453     }
454     push(@od, $attrs->{$a});
455     }
456     print join("\t",@od)."\n";
457     next;
458     }
459     #the core attributes ID, Name must be there already
460     unless (exists($attrs->{Name})) {
461     foreach my $name (qw(gene_name gene geneID geneId geneid locus loc ID)) {
462     if (exists($attrs->{$name})) {
463     $attrs->{Name}=$attrs->{$name};
464     last;
465     }
466     }
467     }
468     my ($tid, $tname)= (delete($attrs->{ID}), delete($attrs->{Name}));
469     my ($gene_name, $gene, $locus);
470     unless ($keepAll) {
471     if (exists($attrs->{gene_name})) {
472     $gene_name=delete($attrs->{gene_name});
473     $gene_name=undef if ($gene_name eq $tname && !exists($oattrs{gene_name}));
474     }
475     if (exists($attrs->{gene})) {
476     $gene=delete($attrs->{gene});
477     $gene=undef if ($gene eq $tname || $gene eq $gene_name) && !exists($oattrs{gene});
478     }
479     if (exists($attrs->{locus})) {
480     $locus=delete($attrs->{locus});
481     $locus=undef if ($locus eq $gene_name || $locus eq $gene_name) && !exists($oattrs{locus});
482     }
483     }
484     my $tattrs;
485     if ($printGTF) { #print GTF
486     $tattrs='transcript_id "'.$tid.'"; gene_id "'.$tname.'";';
487     $tattrs.=' gene_name "'.$gene_name.'";' if $gene_name;
488     $tattrs.=' gene "'.$gene.'";' if $gene;
489     $tattrs.=' locus "'.$locus.'";' if $locus;
490     if ($keepAll) {
491     foreach my $attr (keys(%$attrs)) {
492     $tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
493     }
494     }
495     else { # only a subset of attributes will be shown
496     foreach my $attr (keys(%$attrs)) {
497     next unless exists($oattrs{$attr});
498     $tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
499     }
500     }
501     }
502     else { #print GFF3
503     $tattrs='Parent='.$tid;
504     my $pattrs="ID=$tid;Name=$tname";
505     $pattrs.='gene_name='.$gene_name.';' if $gene_name;
506     $pattrs.='gene='.$gene.';' if $gene;
507     $pattrs.='locus='.$locus.';' if $locus;
508     if ($keepAll) {
509     foreach my $attr (keys(%$attrs)) {
510     my $val=$attrs->{$attr};
511     $pattrs.="$attr=$val;";
512     }
513     }
514     else { # only a subset of attributes will be shown
515     foreach my $attr (keys(%$attrs)) {
516     next unless exists($oattrs{$attr});
517     my $val=$attrs->{$attr};
518     $pattrs.="$attr=$val;";
519     }
520     }
521     print join("\t",$chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', $pattrs)."\n";
522     }
523     if ($CDSonly==0) { #write exons only when found in the input
524     foreach my $ed (@ex) {
525     print join("\t",$chr, $track, $subf, $$ed[0], $$ed[1], $$ed[2], $strand, $$ed[3], $tattrs)."\n";
526     }
527     }
528     foreach my $cd (@cds) {
529     print join("\t",$chr, $track, 'CDS', $$cd[0], $$cd[1], $$cd[2], $strand, $$cd[3], $tattrs)."\n";
530     }
531     } #for each stored transcript
532     }

Properties

Name Value
svn:executable *