[ViewVC] Annotation of: gclib/scripts/gffmanip

#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

# Help text shown when the command line cannot be parsed or no input is given.
# The text lives inside q/.../, so every literal slash must be written as \/ .
my $usage = q/Usage:
 gffmanip [-v] [-o outrecords.txt] [{-r|-c|-e} <rangedata>] [-C] [-t <track>]\
  [-i [<attr>:]<ids.txt>] [-a <attr>=<value>[,...]] [-f <feature1>[,..]] \
  [-K] [-k <attrlist>] [{-T|-L|-l <attrlist>}] [<gff\/gtf_file>]

Filtering options:
 -C output only those transcripts having CDS features
 -f output only the features matching any of the strings in the given
    comma-delimited list (e.g. -f 'mRNA,exon,CDS')
 -v show records that do NOT match the given filtering options (like grep's -v)
 -r show only records overlapping any of the intervals given in <rangedata>,
    which has the format:
       [<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]]
 -c same as -r, but only shows records fully contained in the interval(s) given
 -e outputs records whose exon\/CDS segments overlap the interval(s) given
 -i only shows records whose ID match any of the entries in the file <ids.txt>;
    unless another attribute is given as a prefix (<attr>:), the ID attribute
    is used for GFF3 input, and transcript_id for GTF
 -a only shows records having the <attr> attribute with the value <value>;
    multiple attribute\/value pairs can be given (comma delimited) and a record
    will be shown if there is at least one attribute match

Output options:
 -o write the output records to the given file instead of stdout
 -K keep all attributes ; by default only the core GFF\/GTF attributes are
    shown (transcript_id, gene_id, gene_name, ID, Parent, Name, gene)
 -k keep only the specified non-core attributes in the output records
 -T output GTF format (default is GFF3)
 -L output only the list of transcript IDs found in the file
    (and matching the filtering options)
 -l output the value of all attributes in <attrlist> for each record
    (tab delimited if more than one attribute is given)
 -t replace the 2nd column with the given <track> text
/;
umask 0002;
getopts('vTKACLf:r:l:c:e:i:a:t:k:o:') || die($usage."\n");
die("$usage\n") unless @ARGV>0;
die("Only one input file is expected (or '-' for stdin)!\n") unless @ARGV==1;
my $input_gff=$ARGV[0];
my $outfile=$Getopt::Std::opt_o;
# feature types that loadGff() skips entirely (compared lowercased)
my %ignoredFeatures;
@ignoredFeatures{qw(intron cds_start cds_stop start_codon stop_codon start stop cdsstart cdsstop)}=();
my %oattrs; #hash with attrs to print
@oattrs{qw(ID transcript_id Parent gene_id gene gene_name)}=();
my %attrflt; # attribute=>value filter
my %idflt;   # hash with IDs to keep
my %atab;    # hash with attribute names whose values will be listed as tab delimited
my @atab_cols;
# all regular output goes through the currently selected handle, so -o only
# needs to select() the output file here and restore STDOUT at the end
my $outfh;
if ($outfile) {
  open($outfh, '>', $outfile) || die("Error creating output file $outfile: $!\n");
  select($outfh);
  }
# --
my ($fltinvert, $printGTF, $out_track)=
  ($Getopt::Std::opt_v, $Getopt::Std::opt_T, $Getopt::Std::opt_t);
# -C : only transcripts having CDS segments (this must NOT be tied to -v)
my $only_ifCDS=$Getopt::Std::opt_C;
my ($idfile, $avflt, $featlist, $oattrlist)=
  ($Getopt::Std::opt_i, $Getopt::Std::opt_a, $Getopt::Std::opt_f, $Getopt::Std::opt_k);
my $keepAll=$Getopt::Std::opt_K || $Getopt::Std::opt_A || (lc($oattrlist) eq 'a' || lc($oattrlist) eq 'all');
my ($r_range, $c_range, $e_range)=
   ($Getopt::Std::opt_r, $Getopt::Std::opt_c, $Getopt::Std::opt_e);
# at most one of -r, -c, -e may be given; whichever it is provides the range data
my @range_opts=grep { $_ } ($r_range, $c_range, $e_range);
die("Error: options -r, -c and -e are mutually exclusive!\n") if @range_opts>1;
my $check_range=$range_opts[0];
# range filter data used by processGffRecs(); these are deliberately assigned
# (not re-declared) below so the parsed values are visible to the subroutines
my ($flt_chr, $flt_strand);
my @flt_intv;
if ($check_range) {
   # <rangedata> format: [<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]]
   ($flt_chr, my $ck_rlst)=split(/\:/,$check_range);
   $flt_strand=substr($flt_chr,0,1);
   if ($flt_strand eq '-' || $flt_strand eq '+') {
      substr($flt_chr,0,1)=''; # strand given as a prefix
      }
     else {
      # the strand may also be given as a suffix of the chromosome name
      $flt_strand=undef;
      my $e=chop($flt_chr);
      if ($e eq '-' || $e eq '+') {
          $flt_strand=$e;
          }
        else { $flt_chr.=$e; } # no strand, restore the chopped character
      }
   if ($ck_rlst) {
     # each interval is <start>-<end> (or <start>..<end>); normalize so that
     # start<=end and keep the list sorted by start coordinate
     my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$ck_rlst));
     foreach my $d (@rdata) {
       ($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1];
       }
     @flt_intv = sort { $a->[0] <=> $b->[0] } @rdata;
     }
   }

$oattrlist='' if $keepAll;
my $tab_attrs=$Getopt::Std::opt_l;
$tab_attrs='ID' if $Getopt::Std::opt_L;
if ($tab_attrs) {
  @atab_cols=split(/\,/, $tab_attrs);
  @atab{@atab_cols}=();
  }
my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id
if ($idfile) {
  unless (-f $idfile) {
     # not an existing file name: try the <attr>:<file> form
     my ($prefix, $fname)=(split(/\:/,$idfile));
     ($idfile, $idfileattr)=($fname, $prefix) if $fname;
     }
  my $idf;
  if ($idfile eq '-') {
     open($idf, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n");
     }
    else {
     open($idf, '<', $idfile) || die("Error: cannot open $idfile $!\n");
     }
  # only the first whitespace-delimited token of each line is used as an ID
  while (my $line=<$idf>) {
    my ($id)=($line=~m/(\S+)/);
    $idflt{$id}=1 if $id;
    }
  close($idf);
  }
if ($avflt) {
  foreach my $avpair (split(/\,/, $avflt)) {
    my ($attr, $value)=split(/\s*=\s*/,$avpair,2);
    $value='' unless defined($value); # "-a attr" alone filters on an empty value
    $value=~s/[" ]+$//;$value=~s/^[" ]+//;
    $attrflt{$attr}=$value;
    }
  }
my %featflt; # feature list filter -- if $featlist was given
if ($featlist)  {
 my @fl=split(/\,/, $featlist);
 @featflt{@fl}=(); # this is actually an input filter
 }
if ($oattrlist) {
  #add to the list of attributes to be kept
  my @al=split(/\,/, $oattrlist);
  @oattrs{@al}=();
  }
# ----
my %gffrecs; # recID =>  [ chr, strand, feat_type,  \%attrs, fstart, fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ]
          #                0      1        2           3      4       5       6        7        8        9     10   ,   11,     12
          # recID has the prefix '<chr>|' which should be removed before output
my $gffh;
if ($input_gff eq '-') {
  open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN $!\n");
  }
  else {
  open($gffh, '<', $input_gff) ||  die("Error opening file $input_gff $!\n");
  }

loadGff($gffh, \%gffrecs);
my @sorted_recs=sort sortByLoc keys(%gffrecs);
processGffRecs(\%gffrecs, \@sorted_recs);

# --
if ($outfile) {
 select(STDOUT);
 # buffered write errors only surface at close time
 close($outfh) || die("Error closing output file $outfile: $!\n");
 }

#************ Subroutines **************
sub sortByLoc {
 # sort() comparator for record IDs: looks up both records in %gffrecs and
 # orders them by chromosome name (string comparison), then by feature start,
 # then by feature end (both numeric)
 my ($ra, $rb)=@gffrecs{$a, $b};
 return ($ra->[0] cmp $rb->[0])
     || ($ra->[4] <=> $rb->[4])
     || ($ra->[5] <=> $rb->[5]);
}

sub checkOvlExons {
 # Returns 1 if the interval $qstart-$qend overlaps at least one of the
 # intervals in @$rx, 0 otherwise.
 #   $qstart, $qend : query coordinates (inclusive)
 #   $rx            : ref to a list of [start, end, ...] intervals, sorted by
 #                    start coordinate
 my ($qstart, $qend, $rx)=@_;
 return 0 unless ($rx && @$rx); # nothing to overlap with
 # Only the start coordinates are known to be ordered, so the end of the last
 # interval is not necessarily the largest one (nested intervals); every
 # interval starting at or before $qend has to be examined.
 foreach my $x (@$rx) {
   return 0 if $qend<$x->[0]; # this and all later intervals start past the query
   return 1 if $qstart<=$x->[1];
   }
 return 0;
}

sub checkWithinExons {
 # Returns 1 if the interval $qstart-$qend is fully contained in at least one
 # of the intervals in @$rx, 0 otherwise.
 #   $qstart, $qend : query coordinates (inclusive)
 #   $rx            : ref to a list of [start, end, ...] intervals, sorted by
 #                    start coordinate
 my ($qstart, $qend, $rx)=@_;
 return 0 unless ($rx && @$rx); # nothing could contain the query
 # Only the start coordinates are known to be ordered, so the end of the last
 # interval is not necessarily the largest one (nested intervals); every
 # interval starting at or before $qend has to be examined.
 foreach my $x (@$rx) {
   return 0 if $qend<$x->[0]; # this and all later intervals start past the query
   return 1 if ($qstart>=$x->[0] && $qend<=$x->[1]);
   }
 return 0;
}


sub loadGff {
 # Reads GFF3 or GTF lines from $gffhandle and collects them in %$recs, one
 # entry per transcript/top-level feature, keyed by "<chr>|<ID>". The handle
 # is closed when the end of the input is reached.
 #   $gffhandle : open input filehandle
 #   $recs      : hash ref to fill; each value is an array ref holding
 #      [ chr, strand, feat_type, \%attrs, fstart, fend, [@exons], [@cds],
 #        isgff3, rejected, track, subfeat, fscore ]
 #      where @exons and @cds hold [start, end, score, frame] segments
 # Uses the file-level settings $out_track, %ignoredFeatures, $featlist and
 # %featflt. Dies on duplicate top-level features, on subfeature lines whose
 # transcript cannot be determined, and when a transcript has more than one
 # kind of non-CDS subfeature.
 my ($gffhandle, $recs)=@_;
 while (<$gffhandle>) {
   next if m/^\s*#/;
   chomp;
   my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
   # skip lines without a numeric start or without the attributes column;
   # coordinates are 1-based, so a feature starting at position 1 is valid
   next unless $fstart>=1 && $lnum;
   $track=$out_track if $out_track;
   next if exists($ignoredFeatures{lc($f)});
   $f='exon' if $f=~m/exon/i;
   $f='CDS' if $f=~m/^cds$/i;
   my $rejected;
   $rejected=1 if ($featlist && not exists($featflt{$f}));
   my %attrs;
   ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
   my @av=split(/\s*\;\s*/,$lnum);
   my ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
   my ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
   my $isGFF3 = ($gff3_ID || $gff3_Parent);
   if ($isGFF3) { # GFF format
      $gff3_ID=~tr/"//d; #"
      $gff3_Parent=~tr/"//d; #"
      $gff3_Parent='' if ($f =~m/RNA/); # we really don't care about parent for RNA features
      if ($gff3_ID && !$gff3_Parent) { #top level feature
         foreach my $avpair (@av) {
            my ($attr, $value)=split(/\s*=\s*/,$avpair,2);
            $value=~s/[" ]+$//;$value=~s/^[" ]+//;
            $attrs{$attr}=$value;
            }
         die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs->{"$chr|$gff3_ID"}));
         my $recID="$chr|$gff3_ID";
         $recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore];
         next;
         } # parent/top-level feature
      } #GFF
     else { #GTF format
      if ($f eq 'transcript') { # GTF with parent 'transcript' feature
        foreach my $avpair (@av) {
           my ($attr, $value)=split(/\s+"/,$avpair,2); #"
           $value=~s/[" ]+$//;
           # store the GTF identifiers under their GFF3 equivalents
           if ($attr eq 'transcript_id') {
              $attr='ID';
              }
            elsif ($attr eq 'gene_id') {
              $attr='Name';
              }
           $attrs{$attr}=$value;
           }
        my $recID=$attrs{'ID'};
        die("Error: cannot find transcript_id for GTF 'transcript' line:\n$_\n") unless $recID;
        die("Error: duplicate feature $recID on $chr\n") if (exists($recs->{"$chr|$recID"}));
        $recID=$chr.'|'.$recID;
        $recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore ];
        next;
        } # parent 'transcript' feature in GTF
      }
   # -------------- exon/CDS line here:
   next if ($featlist && !exists($featflt{$f}));
   my $recID;
   if ($isGFF3) {
      $recID=$gff3_Parent;
      }
    elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
     $recID=$1;
     $recID=~tr/"//d; #"
     }
    elsif ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
     # jigsaw output: the last column is just a numeric model ID
     $recID=$chr.'.jsm.'.$lnum;
     $gff3_Parent=$recID;
     $isGFF3=1;
     $f='CDS';
     }
    else {
     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
     }
   my $tid=$recID; # transcript ID without the '<chr>|' prefix
   $recID=$chr.'|'.$recID;
   my $ld = $recs->{$recID};
   if ($ld) { #existing entry
     my $i=($f eq 'CDS') ? 7 : 6;
     # extend the transcript span to include this segment
     my ($lstart, $lend)=($$ld[4], $$ld[5]);
     $$ld[4]=$fstart if $fstart<$lstart;
     $$ld[5]=$fend if $fend>$lend;
     push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame]);
     if ($f ne 'CDS') {
        if ($$ld[11] && $$ld[11] ne $f) {
           die("Error: multiple non-CDS subfeatures found for $recID ($$ld[11], $f)\n");
           }
        $$ld[11]=$f;
        }
     }
    else { # first time seeing this locus/gene
     # get the attributes from this first exon line
     if ($gff3_Parent) { #has GFF3 Parent
       foreach my $avpair (@av) {
           my ($attr, $value)=split(/\s*=\s*/,$avpair,2); #"
           next unless defined($value);
           # the subfeature's own ID and its Parent are not transcript
           # attributes; the transcript ID is set explicitly below
           next if ($attr eq 'ID' || $attr eq 'Parent');
           $value=~s/[" ]+$//;$value=~s/^[" ]+//;
           $attrs{$attr}=$value;
           }
       # processGffRecs() requires attrs{ID} to match the record key
       $attrs{'ID'}=$tid;
       }
      else { # GTF
       foreach my $avpair (@av) {
           my ($attr, $value)=split(/\s+"/,$avpair,2); #"
           $value=~s/[" ]+$//;
           if ($attr eq 'transcript_id') {
              $attr='ID';
              }
            elsif ($attr eq 'gene_id') {
              $attr='Name';
              }
           # exon-specific attributes (exon_id, exon_number..) do not
           # describe the transcript
           $attrs{$attr}=$value unless $attr=~m/^exon/;
           }
       }
     $recs->{$recID} = ($f eq 'CDS') ?
           [$chr, $strand, $f, {%attrs}, $fstart, $fend,          [],      [[$fstart, $fend, $fscore, $frame]], $isGFF3, $rejected, $track, '','.' ] :
           [$chr, $strand, $f, {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame]], [],               $isGFF3, $rejected, $track, $f,'.' ] ;
         #   0       1      2     3        4       5       6(exons)                          7 (CDS)              8          9        10
     }
 }
close($gffhandle);
}

sub processGffRecs {
 # Applies the command line filters to the records loaded by loadGff() and
 # prints the ones that pass to the currently selected output handle, as
 # GFF3 (default), GTF ($printGTF) or tab delimited attribute values
 # ($tab_attrs).
 #   $recs  : hash ref, recID => [ chr, strand, feat_type, \%attrs, fstart,
 #            fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ]
 #   $rlist : optional ref to the list of record IDs to process, in output
 #            order; defaults to all the keys of %$recs
 # Uses the file-level filter/output settings ($fltinvert, $featlist,
 # $only_ifCDS, $idfile, $idfileattr, %idflt, $avflt, %attrflt, $flt_chr,
 # $flt_strand, @flt_intv, $r_range, $c_range, $e_range, $tab_attrs,
 # @atab_cols, $keepAll, %oattrs, $printGTF).
 # Note: the ID and Name attributes (and, unless $keepAll, gene_name, gene and
 # locus) are removed from the attribute hash of every printed record.
 my ($recs, $rlist)=@_;
 $rlist=[keys(%$recs)] unless $rlist;
 foreach my $recid (@$rlist) {
   my $td=$$recs{$recid};
   #     0       1       2        3        4      5       6    7     8        9     10      11     12
   my ($chr, $strand, $feature, $attrs, $fstart, $fend,  $er, $cr, $isGFF3, $rej, $track, $subf, $fscore) = @$td;
   # feature type filter (-f), already evaluated at load time
   next if ($rej && !$fltinvert);
   next if ($fltinvert && $featlist && !$rej);
   my $CDSonly=0; # set to 1 if only CDS segments were given
   my $hasCDS=0;
   my @ex;
   my @cds;
   #some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
   if (@$er<1 && @$cr>0) {
     @ex = sort { $a->[0] <=> $b->[0] } @$cr;
     @cds=@ex;
     $CDSonly=1;
     $hasCDS=1;
     }
    else {
     @ex = sort { $a->[0] <=> $b->[0] } @$er;
     if (@$cr>0) { # sort cds segments too
       @cds= sort { $a->[0] <=> $b->[0] } @$cr;
       $hasCDS=1;
       }
     }
   # --------------
   # @ex may legitimately be empty here (top-level feature without any
   # subfeatures); it must not be indexed, as that would autovivify a bogus
   # empty segment which would then be printed
   my $gffid=$recid;
   substr($gffid, 0, length($chr)+1)=''; # remove the '<chr>|' prefix
   die("Error: gffid ($gffid) not matching attrs\{ID\}(".$attrs->{'ID'}.")!\n")
       unless ($gffid eq $attrs->{'ID'});
   my $doprint=1;
   #check all the filters
   $doprint=0 if ($only_ifCDS && $hasCDS==0);
   if ($doprint && $idfile) {
     if ($idfileattr) {
        $doprint=0 unless exists($idflt{$attrs->{$idfileattr}});
        }
      else {
        $doprint=0 unless exists($idflt{$gffid});
        }
     }
   if ($doprint && $avflt) {
     # a single matching attribute=value pair is enough
     my $avfound=0;
     foreach my $fattr (keys(%attrflt)) {
       if ($attrflt{$fattr} eq $attrs->{$fattr}) {
          $avfound=1;
          last;
          }
       }
     $doprint=0 unless $avfound;
     }
   if ($doprint && $flt_chr && $flt_chr ne $chr) {
     $doprint=0;
     }
   if ($doprint && $flt_strand && $flt_strand ne $strand) {
     $doprint=0;
     }
   if ($doprint && @flt_intv>0) {
     if ($r_range) { #transcript span must overlap any of the range intervals
        $doprint=0 unless checkOvlExons($fstart, $fend, \@flt_intv);
        }
      elsif ($c_range) { #transcript span must be contained in any of the intervals
        $doprint=0 unless checkWithinExons($fstart, $fend, \@flt_intv);
        }
      elsif ($e_range) { #at least one exon/CDS segment must overlap an interval
        $doprint=0;
        foreach my $ed (@ex) {
          if (checkOvlExons($$ed[0], $$ed[1], \@flt_intv)) {
             $doprint=1;
             last;
             }
          }
        }
     }

   $doprint = ! $doprint if $fltinvert;
   next unless $doprint;
   #filter passed, print output
   if ($tab_attrs) {
     my @od;
     foreach my $col (@atab_cols) {
       if (uc($col) eq 'ID') {
          push(@od, $gffid);
          next;
          }
       push(@od, $attrs->{$col});
       }
     print join("\t",@od)."\n";
     next;
     }
   #the core attributes ID, Name must be there already
   unless (exists($attrs->{Name})) {
     # fall back on the first gene-like attribute available
     foreach my $name (qw(gene_name gene geneID geneId geneid locus loc ID)) {
       if (exists($attrs->{$name})) {
           $attrs->{Name}=$attrs->{$name};
           last;
           }
       }
     }
   my ($tid, $tname)= (delete($attrs->{ID}), delete($attrs->{Name}));
   my ($gene_name, $gene, $locus);
   unless ($keepAll) {
     # pull out the gene-level attributes so they are printed first, in a
     # fixed order; redundant values are dropped unless explicitly requested
     if (exists($attrs->{gene_name})) {
          $gene_name=delete($attrs->{gene_name});
          $gene_name=undef if ($gene_name eq $tname && !exists($oattrs{gene_name}));
          }
     if (exists($attrs->{gene})) {
          $gene=delete($attrs->{gene});
          $gene=undef if ($gene eq $tname || $gene eq $gene_name) && !exists($oattrs{gene});
          }
     if (exists($attrs->{locus})) {
          $locus=delete($attrs->{locus});
          # NOTE(review): the original code compared $locus with $gene_name
          # twice; one of the two was probably meant to be $tname or $gene
          $locus=undef if ($locus eq $gene_name && !exists($oattrs{locus}));
          }
     }
   # remaining attributes to show, in a stable order
   my @shown_attrs=grep { $keepAll || exists($oattrs{$_}) } sort(keys(%$attrs));
   my $tattrs; # attributes column for the exon/CDS lines
   if ($printGTF) { #print GTF
     $tattrs='transcript_id "'.$tid.'"; gene_id "'.$tname.'";';
     $tattrs.=' gene_name "'.$gene_name.'";' if $gene_name;
     $tattrs.=' gene "'.$gene.'";' if $gene;
     $tattrs.=' locus "'.$locus.'";' if $locus;
     foreach my $attr (@shown_attrs) {
        $tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
        }
     }
    else { #print GFF3
     $tattrs='Parent='.$tid;
     # attributes of the parent (transcript) line, joined with ';'
     my @pattrs=("ID=$tid", "Name=$tname");
     push(@pattrs, 'gene_name='.$gene_name) if $gene_name;
     push(@pattrs, 'gene='.$gene) if $gene;
     push(@pattrs, 'locus='.$locus) if $locus;
     foreach my $attr (@shown_attrs) {
        push(@pattrs, $attr.'='.$attrs->{$attr});
        }
     print join("\t",$chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', join(';',@pattrs))."\n";
     }
   if ($CDSonly==0) { #write exons only when found in the input
     foreach my $ed (@ex) {
        print join("\t",$chr, $track, $subf, $$ed[0], $$ed[1], $$ed[2], $strand, $$ed[3], $tattrs)."\n";
        }
     }
   foreach my $cd (@cds) {
     print join("\t",$chr, $track, 'CDS', $$cd[0], $$cd[1], $$cd[2], $strand, $$cd[3], $tattrs)."\n";
     }
   } #for each stored transcript
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 6 months ago) by gpertea
File size:	18716 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	use FindBin;use lib $FindBin::Bin;
5
6	my $usage = q/Usage:
7	gffmanip [-v] [-o outrecords.txt] [{-r\|-c\|-e} <rangedata>] [-C] [-t <track>]\
8	[-i [<attr>:]<ids.txt>] [-a <attr>=<value>[,...]] [-f <feature1>[,..]] \
9	[-k <attrlist>] [-l [{-T\|-L\|-l <attrlist>}] [<gff\/gtf_file>]
10
11	Filtering options:
12	-C output only those transcripts having CDS features
13	-f output only the features matching any of the strings in the given
14	comma-delimited list (e.g. -f 'mRNA,exon,CDS')
15	-v show records that do NOT match the given filtering options (like grep's -v)
16	-r show only records overlapping any of the intervals given in <rangedata>,
17	which has the format:
18	[<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]]
19	-c same as -r, but only shows records fully contained in the interval(s) given
20	-e outputs records whose exon\/CDS segments overlap the interval(s) given
21	-i only shows records whose ID match any of the entries in the file <ids.txt>;
22	unless another attribute is given as a prefix (<attr>:), the ID attribute
23	is used for GFF3 input, and transcript_id for GTF
24	-a only shows records having the <attr> attribute with the value <value>;
25	multiple attribute\/value pairs can be given (comma delimited) and a record
26	will be shown if there is at least one attribute match
27
28	Output options:
29	-K keep all attributes ; by default only the core GFF\/GTF attributes are
30	shown (transcript_id, gene_id, gene_name, ID, Parent, Name, gene_name)
31	-k keep only the specified non-core attributes in the output records
32	-T output GTF format (default is GFF3)
33	-L output only the list of transcript IDs found in the file
34	(and matching the filtering options)
35	-l output the value of all attributes in <attrlist> for each record
36	(tab delimited if more than one attribute is given)
37	-t replace the 2nd column with the given <track> text
38	/;
39	umask 0002;
40	getopts('vTKACLf:r:l:c:e:i:a:t:k:o:') \|\| die($usage."\n");
41	die("$usage\n") unless @ARGV>0;
42	die("Only one input file is expected (or '-' for stdin)!\n") unless @ARGV==1;
43	my $input_gff=$ARGV[0];
44	my $outfile=$Getopt::Std::opt_o;
45	my %ignoredFeatures;
46	@ignoredFeatures{qw(intron cds_start cds_stop start_codon stop_codon start stop cdsstart cdsstop)}=();
47	my %oattrs; #hash with attrs to print
48	@oattrs{qw(ID transcript_id Parent gene_id gene gene_name)}=();
49	my %attrflt; # attribute=>value filter
50	my %idflt; # hash with IDs to keep
51	my %atab; # hash with attribute names whose values will be listed as tab delimited
52	my @atab_cols;
53	if ($outfile) {
54	open(OUTF, '>'.$outfile) \|\| die("Error creating output file $outfile\n");
55	select(OUTF);
56	}
57	# --
58	my ($fltinvert, $printGTF, $out_track)=
59	($Getopt::Std::opt_v, $Getopt::Std::opt_T, $Getopt::Std::opt_t);
60	my $only_ifCDS=$Getopt::Std::opt_v;
61	my ($idfile, $avflt, $featlist, $oattrlist)=
62	($Getopt::Std::opt_i, $Getopt::Std::opt_a, $Getopt::Std::opt_f, $Getopt::Std::opt_k);
63	my $keepAll=$Getopt::Std::opt_K \|\| $Getopt::Std::opt_A \|\| (lc($oattrlist) eq 'a' \|\| lc($oattrlist) eq 'all');
64	my ($r_range, $c_range, $e_range)=
65	($Getopt::Std::opt_r, $Getopt::Std::opt_c, $Getopt::Std::opt_e);
66	my $check_range;
67	if ($r_range) {
68	$check_range=$r_range;
69	die("Error: options -r, -c and -e are mutually exclusive!\n") if ($c_range \|\| $e_range);
70	}
71	elsif ($c_range) {
72	$check_range=$c_range;
73	die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range \|\| $e_range);
74	}
75	elsif ($e_range) {
76	$check_range=$e_range;
77	die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range \|\| $c_range);
78	}
79	my ($flt_chr, $flt_strand);
80	my @flt_intv;
81	if ($check_range) {
82	($flt_chr, my $ck_rlst)=split(/\:/,$check_range);
83	#die("$usage Incorrect format for the interval list!\n") unless $flt_chr && $ck_rlst;
84	my $flt_strand=substr($flt_chr,0,1);
85	if ($flt_strand eq '-' \|\| $flt_strand eq '+') {
86	substr($flt_chr,0,1)='';
87	}
88	else {
89	$flt_strand=undef;
90	my $e=chop($flt_chr);
91	if ($e eq '-' \|\| $e eq '+') {
92	$flt_strand=$e;
93	}
94	else { $flt_chr.=$e; }
95	} #no strand
96	my @flt_intv;
97	if ($ck_rlst) {
98	my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$ck_rlst));
99	foreach my $d (@rdata) {
100	($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1];
101	}
102	@flt_intv = sort { $a->[0] <=> $b->[0] } @rdata;
103	}
104	}
105	#my $range=$flt_chr.':'.$ck_rex[0]->[0].'-'.$ex[-1]->[1];
106
107	$oattrlist='' if $keepAll;
108	my $tab_attrs=$Getopt::Std::opt_l;
109	$tab_attrs='ID' if $Getopt::Std::opt_L;
110	if ($tab_attrs) {
111	@atab_cols=split(/\,/, $tab_attrs);
112	@atab{@atab_cols}=();
113	}
114	my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id
115	if ($idfile) {
116	unless (-f $idfile) {
117	my ($a,$f)=(split(/\:/,$idfile));
118	($idfile, $idfileattr)=($f,$a) if $f;
119	}
120	my $idf;
121	if ($idfile eq '-') {
122	open($idf, "<&=STDIN") \|\| die("Error: couldn't alias STDIN. $!\n");
123	}
124	else {
125	open($idf, $idfile) \|\| die("Error: cannot open $idfile $!\n");
126	}
127	while (<$idf>) {
128	my ($id)=(m/(\S+)/);
129	$idflt{$id}=1 if $id;
130	}
131	close($idf);
132	}
133	if ($avflt) {
134	my @avl=split(/\,/, $avflt);
135	foreach my $avpair (@avl) {
136	my ($attr, $value)=split(/\s=\s/,$avpair,2);
137	$value=~s/[" ]+$//;$value=~s/^[" ]+//;
138	$attrflt{$attr}=$value;
139	}
140	}
141	my %featflt; # feature list filter -- if $featlist was given
142	if ($featlist) {
143	#add to the list of attributes to be kept
144	my @fl=split(/\,/, $featlist);
145	@featflt{@fl}=(); # this is actually an input filter
146	}
147	if ($oattrlist) {
148	#add to the list of attributes to be kept
149	my @al=split(/\,/, $oattrlist);
150	@oattrs{@al}=();
151	}
152	# ----
153	my %gffrecs; # recID => [ chr, strand, feat_type, \%attrs, fstart, fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ]
154	# 0 1 2 3 4 5 6 7 8 9 10 , 11, 12
155	# recID has the prefix '<chr>\|' which should be removed before output
156	my $gffh;
157	if ($input_gff eq '-') {
158	open($gffh, "<&=STDIN") \|\| die("Error: couldn't alias STDIN $!\n");
159	}
160	else {
161	open($gffh, $input_gff) \|\| die("Error opening file $input_gff $!\n");
162	}
163
164	loadGff($gffh, \%gffrecs);
165	my @sorted_recs=sort sortByLoc keys(%gffrecs);
166	processGffRecs(\%gffrecs, \@sorted_recs);
167
168	# --
169	if ($outfile) {
170	select(STDOUT);
171	close(OUTF);
172	}
173
174	#********** Subroutines ************
175	sub sortByLoc {
176	my $da=$gffrecs{$a};
177	my $db=$gffrecs{$b};
178	if ($$da[0] eq $$db[0]) {
179	return ($$da[4]==$$db[4]) ? $$da[5] <=> $$db[5] : $$da[4] <=> $$db[4] ;
180	}
181	else { return $$da[0] cmp $$db[0] ; }
182	}
183
184	sub checkOvlExons {
185	my ($a, $b, $rx)=@_;
186	return 0 if ($a>$$rx[-1]->[1] \|\| $b<$$rx[0]->[0]); # not overlapping the whole exon chain
187	foreach my $x (@$rx) {
188	return 1 if ($a<=$$x[1] && $b>=$$x[0]);
189	return 0 if $b<$$x[0];
190	}
191	}
192
193	sub checkWithinExons {
194	my ($a, $b, $rx)=@_; #checks if interval $a-$b is contained in any @$rx interval
195	return 0 if ($a>$$rx[-1]->[1] \|\| $b<$$rx[0]->[0]); # not overlapping the whole exon chain
196	foreach my $x (@$rx) {
197	return 1 if ($a>=$$x[0] && $b<=$$x[1]);
198	return 0 if $b<$$x[0];
199	}
200	}
201
202
203	sub loadGff {
204	my ($gffhandle, $recs)=@_;
205	while (<$gffhandle>) {
206	next if m/^\s*#/;
207	chomp;
208	my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
209	next unless $fstart>1 && $lnum;
210	$track=$out_track if $out_track;
211	next if exists($ignoredFeatures{lc($f)});
212	$f='exon' if $f=~m/exon/i;
213	$f='CDS' if $f=~m/^cds$/i;
214	my $rejected;
215	$rejected=1 if ($featlist && not exists($featflt{$f}));
216	#next if $f eq 'gene' \|\| $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
217	my $gff3_ID;
218	my $gff3_Parent;
219	my ($gname,$tdescr);
220	my %attrs;
221	($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
222	#$lnum=~s/"([^"]+)\;([^"]+)"/"$1.$2"/g; #protect ; within text between quotes
223	my @av=split(/\s\;\s/,$lnum);
224	($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
225	($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
226	my $isGFF3 = ($gff3_ID \|\| $gff3_Parent);
227	if ($isGFF3) { # GFF format
228	$gff3_ID=~tr/"//d; #"
229	$gff3_Parent=~tr/"//d; #"
230	$gff3_Parent='' if ($f =~m/RNA/); # we really don't care about parent for RNA features
231	if ($gff3_ID && !$gff3_Parent) { #top level feature
232	foreach my $a (@av) {
233	my ($attr, $value)=split(/\s=\s/,$a,2);
234	$value=~s/[" ]+$//;$value=~s/^[" ]+//;
235	$attrs{$attr}=$value;
236	}
237	if ($f=~m/RNA/i \|\| $f=~/gene/) {
238	# try to parse the description, if any
239
240	if ($lnum=~m/\b(?:descr\|tophit\|info\|product)\s=\s"?([^;"]+)/i) {
241	$tdescr=$1;
242	}
243	elsif ($lnum=~m/Name\s=\s"?([^;"]+)/) {
244	$tdescr=$1;
245	}
246	if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
247	$gname=$1;
248	}
249	elsif ($lnum=~m/Name\s=\s"?([^;"]+)/) {
250	$gname=$1;
251	}
252	$tdescr='' if ($tdescr eq $gname);
253	$gname='' if $gname eq $gff3_ID;
254	}
255	die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs->{"$chr\|$gff3_ID"}));
256	my $recID="$chr\|$gff3_ID";
257	$recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore];
258	next;
259	} # parent/top-level feature
260	} #GFF
261	else { #GTF format
262	if ($f eq 'transcript') { # GTF with parent 'transcript' feature
263	foreach my $a (@av) {
264	my ($attr, $value)=split(/\s+"/,$a,2); #"
265	$value=~s/[" ]+$//;
266	if ($attr eq 'transcript_id') {
267	$attr='ID';
268	}
269	elsif ($attr eq 'gene_id') {
270	$attr='Name';
271	}
272	$attrs{$attr}=$value;
273	}
274	my $recID=$attrs{'ID'};
275	die("Error: cannot find transcript_id for GTF 'transcript' line:\n$_\n") unless $recID;
276	die("Error: duplicate feature $recID on $chr\n") if (exists($recs->{"$chr\|$recID"}));
277	$recID=$chr.'\|'.$recID;
278	$recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore ];
279	next;
280	} # parent 'transcript' feature in GTF
281	}
282	# -------------- exon/CDS line here:
283	next if ($featlist && !exists($featflt{$f}));
284	my $recID;
285	($gname, $tdescr)=();
286	if ($isGFF3) {
287	$recID=$gff3_Parent;
288	}
289	elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\\|\-]+)/) {
290	$recID=$1;
291	$recID=~tr/"//d; #"
292	}
293	elsif ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
294	$recID=$chr.'.jsm.'.$lnum;
295	$gff3_Parent=$recID;
296	$isGFF3=1;
297	$f='CDS';
298	}
299	else {
300	die("Error: cannot parse locus/transcript name from input line:\n$_\n");
301	}
302	if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\\|\-]+)/) {
303	$gname=$1;
304	$gname=~tr/"//d; #"
305	}
306	$tdescr='' if index($recID, $tdescr)>=0;
307	$gname='' if index($recID, $gname)>=0;
308	$recID=$chr.'\|'.$recID;
309	my $ld = $recs->{$recID};
310	if ($ld) { #existing entry
311	my $i=($f eq 'CDS') ? 7 : 6;
312	my ($lstart, $lend)=($$ld[4], $$ld[5]);
313	$$ld[4]=$fstart if $fstart<$lstart;
314	$$ld[5]=$fend if $fend>$lend;
315	push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame]);
316	if ($f ne 'CDS') {
317	if ($$ld[11] && $$ld[11] ne $f) {
318	die("Error: multiple non-CDS subfeatures found for $recID ($$ld[11], $f)\n");
319	}
320	$$ld[11]=$f;
321	}
322	}
323	else { # first time seeing this locus/gene
324	# get the attributes from this first exon line
325	if ($gff3_Parent) { #has GFF3 Parent
326	foreach my $a (@av) {
327	my ($attr, $value)=split(/\s=\s/,$a,2); #"
328	$attr='ID' if $attr eq 'Parent';
329	$value=~s/[" ]+$//;$value=~s/^[" ]+//;
330	}
331	}
332	else { # GTF
333	foreach my $a (@av) {
334	my ($attr, $value)=split(/\s+"/,$a,2); #"
335	$value=~s/[" ]+$//;
336	if ($attr eq 'transcript_id') {
337	$attr='ID';
338	}
339	elsif ($attr eq 'gene_id') {
340	$attr='Name';
341	}
342	$attrs{$attr}=$value unless $attr=~m/^exon/;;
343	}
344	}
345	$recs->{$recID} = ($f eq 'CDS') ?
346	[$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [[$fstart, $fend, $fscore, $frame]], $isGFF3, $rejected, $track, '','.' ] :
347	[$chr, $strand, $f, {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame]], [], $isGFF3, $rejected, $track, $f,'.' ] ;
348	# 0 1 2 3 4 5 6(exons) 7 (CDS) 8 9 10
349	}
350	}
351	close($gffh);
352	}
353
# processGffRecs($recs, $rlist)
#
# Applies the record-level filters to the stored transcript records and
# prints every record that passes, in the selected output format.
#
# Parameters:
#   $recs  - hash ref mapping a record key to an array ref laid out as
#              [0] chr      [1] strand    [2] feature   [3] attributes (hash ref)
#              [4] start    [5] end       [6] exon segments
#              [7] CDS segments           [8] isGFF3    [9] rejected flag
#              [10] track   [11] subfeature name        [12] score
#            Every segment is an array ref [start, end, score, frame].
#            The record key is the chromosome name, one separator character
#            and the record ID (the ID is recovered by stripping
#            length(chr)+1 leading characters from the key).
#   $rlist - optional array ref with the record keys to process, in order;
#            when omitted, all keys of %$recs are processed (hash order).
#
# The filters and the output mode are read from file-level settings
# ($fltinvert, $featlist, $only_ifCDS, $idfile, $idfileattr, %idflt, $avflt,
# %attrflt, $flt_chr, $flt_strand, @flt_intv, $r_range, $c_range, $e_range,
# $tab_attrs, @atab_cols, $keepAll, %oattrs, $printGTF).
#
# Output goes to the currently selected filehandle. Dies if the ID derived
# from a record key does not match the record's ID attribute.
#
# Side effect: for every record printed as GFF3/GTF the ID and Name entries
# (and, unless $keepAll is set, gene_name/gene/locus) are deleted from the
# record's attribute hash.
sub processGffRecs {
    my ($recs, $rlist) = @_;
    $rlist = [ keys(%$recs) ] unless $rlist;
    foreach my $recid (@$rlist) {
        my $td = $$recs{$recid};
        #    0      1        2         3        4        5      6    7     8        9     10      11     12
        my ($chr, $strand, $feature, $attrs, $fstart, $fend, $er, $cr, $isGFF3, $rej, $track, $subf, $fscore) = @$td;
        # records rejected while loading are only of interest when inverting
        next if ($rej && !$fltinvert);
        next if ($fltinvert && $featlist && !$rej);
        my $CDSonly = 0; # set to 1 if only CDS segments were given
        my $hasCDS  = 0;
        my @ex;
        my @cds;
        # some records might lack exons, but have only CDS segments
        # (e.g. mitochondrial genes): use the CDS segments as exons then
        if (@$er < 1 && @$cr > 0) {
            @ex  = sort { $a->[0] <=> $b->[0] } @$cr;
            @cds = @ex;
            $CDSonly = 1;
            $hasCDS  = 1;
        }
        else {
            @ex = sort { $a->[0] <=> $b->[0] } @$er;
            if (@$cr > 0) { # sort cds segments too
                @cds = sort { $a->[0] <=> $b->[0] } @$cr;
                $hasCDS = 1;
            }
        }
        # recover the record ID by dropping "<chr><separator>" from the key
        my $gffid = $recid;
        substr($gffid, 0, length($chr)+1) = '';
        die("Error: gffid ($gffid) not matching attrs\{ID\}(".$attrs->{'ID'}.")!\n")
            unless ($gffid eq $attrs->{'ID'});

        # ---- check all the filters
        my $doprint = 1;
        $doprint = 0 if ($only_ifCDS && $hasCDS == 0);
        if ($doprint && $idfile) {
            # match on the requested attribute if one was given, else on the ID
            my $idkey = $idfileattr ? $attrs->{$idfileattr} : $gffid;
            $doprint = 0 unless exists($idflt{$idkey});
        }
        if ($doprint && $avflt) {
            # one matching attribute/value pair is enough
            my $avfound = 0;
            foreach my $fattr (keys(%attrflt)) {
                if ($attrflt{$fattr} eq $attrs->{$fattr}) {
                    $avfound = 1;
                    last;
                }
            }
            $doprint = 0 unless $avfound;
        }
        $doprint = 0 if ($doprint && $flt_chr && $flt_chr ne $chr);
        $doprint = 0 if ($doprint && $flt_strand && $flt_strand ne $strand);
        if ($doprint && @flt_intv > 0) {
            # A record must satisfy the interval test to be kept. (The test
            # result used to be applied as "$doprint=1 if ...", which could
            # never exclude anything.)
            if ($r_range) { # record span overlaps any of the intervals
                $doprint = 0 unless checkOvlExons($fstart, $fend, \@flt_intv);
            }
            elsif ($c_range) { # record span contained in any of the intervals
                $doprint = 0 unless checkWithinExons($fstart, $fend, \@flt_intv);
            }
            elsif ($e_range) { # at least one exon must overlap an interval
                my $exovl = 0;
                foreach my $ed (@ex) {
                    if (checkOvlExons($$ed[0], $$ed[1], \@flt_intv)) {
                        $exovl = 1;
                        last;
                    }
                }
                $doprint = $exovl;
            }
        }
        $doprint = !$doprint if $fltinvert;
        next unless $doprint;

        # ---- filter passed, print output
        if ($tab_attrs) {
            # tab-delimited attribute values only; "ID" (any letter case)
            # stands for the record ID
            my @od = map { uc($_) eq 'ID' ? $gffid : $attrs->{$_} } @atab_cols;
            print join("\t", @od)."\n";
            next;
        }
        # the core attribute Name must be there; borrow it from the first
        # available fallback attribute when it is missing
        unless (exists($attrs->{Name})) {
            foreach my $name (qw(gene_name gene geneID geneId geneid locus loc ID)) {
                if (exists($attrs->{$name})) {
                    $attrs->{Name} = $attrs->{$name};
                    last;
                }
            }
        }
        my ($tid, $tname) = (delete($attrs->{ID}), delete($attrs->{Name}));
        my ($gene_name, $gene, $locus);
        unless ($keepAll) {
            # pull out the gene-level attributes, dropping the redundant ones
            # unless they were explicitly requested for output (%oattrs)
            if (exists($attrs->{gene_name})) {
                $gene_name = delete($attrs->{gene_name});
                $gene_name = undef if ($gene_name eq $tname && !exists($oattrs{gene_name}));
            }
            if (exists($attrs->{gene})) {
                $gene = delete($attrs->{gene});
                $gene = undef if ($gene eq $tname || $gene eq $gene_name) && !exists($oattrs{gene});
            }
            if (exists($attrs->{locus})) {
                $locus = delete($attrs->{locus});
                # NOTE(review): the original tested "$locus eq $gene_name"
                # twice; one operand was probably meant to be $tname or $gene.
                # Behaviour is kept as it was - confirm the intent.
                $locus = undef if ($locus eq $gene_name) && !exists($oattrs{locus});
            }
        }
        my $tattrs; # attribute column shared by all exon/CDS lines
        if ($printGTF) { # print GTF
            $tattrs  = 'transcript_id "'.$tid.'"; gene_id "'.$tname.'";';
            $tattrs .= ' gene_name "'.$gene_name.'";' if $gene_name;
            $tattrs .= ' gene "'.$gene.'";' if $gene;
            $tattrs .= ' locus "'.$locus.'";' if $locus;
            foreach my $attr (keys(%$attrs)) {
                # without -K only the requested subset of attributes is shown
                next unless ($keepAll || exists($oattrs{$attr}));
                $tattrs .= ' '.$attr.' "'.$attrs->{$attr}.'";';
            }
        }
        else { # print GFF3: a parent line followed by its exon/CDS children
            $tattrs = 'Parent='.$tid;
            # collect tag=value pairs and join them with ';' so that every
            # pair is properly separated from the previous one
            my @pattrs = ("ID=$tid", "Name=$tname");
            push(@pattrs, 'gene_name='.$gene_name) if $gene_name;
            push(@pattrs, 'gene='.$gene) if $gene;
            push(@pattrs, 'locus='.$locus) if $locus;
            foreach my $attr (keys(%$attrs)) {
                # without -K only the requested subset of attributes is shown
                next unless ($keepAll || exists($oattrs{$attr}));
                push(@pattrs, $attr.'='.$attrs->{$attr});
            }
            print join("\t", $chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', join(';', @pattrs))."\n";
        }
        if ($CDSonly == 0) { # write exons only when found in the input
            foreach my $ed (@ex) {
                print join("\t", $chr, $track, $subf, $$ed[0], $$ed[1], $$ed[2], $strand, $$ed[3], $tattrs)."\n";
            }
        }
        foreach my $cd (@cds) {
            print join("\t", $chr, $track, 'CDS', $$cd[0], $$cd[1], $$cd[2], $strand, $$cd[3], $tattrs)."\n";
        }
    } # for each stored transcript
}