[ViewVC] Annotation of: gclib/scripts/gffmanip

#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

my $usage = q/Usage:
 gffmanip [-v] [-o outrecords.txt] [{-r|-c|-e} <rangedata>] [-C] [-t <track>]\
  [-i [<attr>:]<ids.txt>] [-a <attr>=<value>[,...]] [-f <feature1>[,..]] \
  [-k <attrlist>] [-l [{-T|-L|-l <attrlist>}] [<gff\/gtf_file>]

Filtering options:
 -C output only those transcripts having CDS features
 -f output only the features matching any of the strings in the given
    comma-delimited list (e.g. -f 'mRNA,exon,CDS')
 -v show records that do NOT match the given filtering options (like grep's -v)
 -r show only records overlapping any of the intervals given in <rangedata>, 
    which has the format:
       [<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]]
 -c same as -r, but only shows records fully contained in the interval(s) given
 -e outputs records whose exon\/CDS segments overlap the interval(s) given
 -i only shows records whose ID match any of the entries in the file <ids.txt>;
    unless another attribute is given as a prefix (<attr>:), the ID attribute
    is used for GFF3 input, and transcript_id for GTF
 -a only shows records having the <attr> attribute with the value <value>;
    multiple attribute\/value pairs can be given (comma delimited) and a record
    will be shown if there is at least one attribute match
 
Output options:
 -K keep all attributes ; by default only the core GFF\/GTF attributes are
    shown (transcript_id, gene_id, gene_name, ID, Parent, Name, gene_name)
 -k keep only the specified non-core attributes in the output records
 -T output GTF format (default is GFF3)
 -L output only the list of transcript IDs found in the file 
    (and matching the filtering options)
 -l output the value of all attributes in <attrlist> for each record
    (tab delimited if more than one attribute is given)
 -t replace the 2nd column with the given <track> text
/;
umask 0002;
getopts('vTKACLf:r:l:c:e:i:a:t:k:o:') || die($usage."\n");
die("$usage\n") unless @ARGV>0;
die("Only one input file is expected (or '-' for stdin)!\n") unless @ARGV==1;
my $input_gff=$ARGV[0];
my $outfile=$Getopt::Std::opt_o;
my %ignoredFeatures;
@ignoredFeatures{qw(intron cds_start cds_stop start_codon stop_codon start stop cdsstart cdsstop)}=();
my %oattrs; #hash with attrs to print
@oattrs{qw(ID transcript_id Parent gene_id gene gene_name)}=();
my %attrflt; # attribute=>value filter
my %idflt;   # hash with IDs to keep
my %atab;    # hash with attribute names whose values will be listed as tab delimited
my @atab_cols; 
if ($outfile) {
  open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
  select(OUTF);
  }
# --
my ($fltinvert, $printGTF, $out_track)=
  ($Getopt::Std::opt_v, $Getopt::Std::opt_T, $Getopt::Std::opt_t);
my $only_ifCDS=$Getopt::Std::opt_v;
my ($idfile, $avflt, $featlist, $oattrlist)=
  ($Getopt::Std::opt_i, $Getopt::Std::opt_a, $Getopt::Std::opt_f, $Getopt::Std::opt_k);
my $keepAll=$Getopt::Std::opt_K || $Getopt::Std::opt_A || (lc($oattrlist) eq 'a' || lc($oattrlist) eq 'all');
my ($r_range, $c_range, $e_range)=
   ($Getopt::Std::opt_r, $Getopt::Std::opt_c, $Getopt::Std::opt_e);
my $check_range;
if ($r_range) {
   $check_range=$r_range;
   die("Error: options -r, -c and -e are mutually exclusive!\n") if ($c_range || $e_range);
   }
 elsif ($c_range) {
   $check_range=$c_range;
   die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $e_range);
   }
 elsif ($e_range) {
   $check_range=$e_range;
   die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $c_range);
   }
my ($flt_chr, $flt_strand);
my @flt_intv;
if ($check_range) {
   ($flt_chr, my $ck_rlst)=split(/\:/,$check_range);
   #die("$usage Incorrect format for the interval list!\n") unless $flt_chr && $ck_rlst;
   my $flt_strand=substr($flt_chr,0,1);
   if ($flt_strand eq '-' || $flt_strand eq '+') {
      substr($flt_chr,0,1)='';
      }
     else { 
     $flt_strand=undef; 
     my $e=chop($flt_chr);
     if ($e eq '-' || $e eq '+') {
         $flt_strand=$e;
         }
       else { $flt_chr.=$e; }
     } #no strand
   my @flt_intv; 
   if ($ck_rlst) {
     my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$ck_rlst));
     foreach my $d (@rdata) {
       ($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1];
       }
     @flt_intv = sort { $a->[0] <=> $b->[0] } @rdata;
     }
   }
#my $range=$flt_chr.':'.$ck_rex[0]->[0].'-'.$ex[-1]->[1];

$oattrlist='' if $keepAll;
my $tab_attrs=$Getopt::Std::opt_l;
$tab_attrs='ID' if $Getopt::Std::opt_L;
if ($tab_attrs) {
  @atab_cols=split(/\,/, $tab_attrs);
  @atab{@atab_cols}=();
  }
my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id
if ($idfile) {
  unless (-f $idfile) {
     my ($a,$f)=(split(/\:/,$idfile));
     ($idfile, $idfileattr)=($f,$a) if $f;
     }
  my $idf;
  if ($idfile eq '-') {
     open($idf, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n");
     }
    else {
     open($idf, $idfile) || die("Error: cannot open $idfile $!\n");
     }
  while (<$idf>) {
    my ($id)=(m/(\S+)/);
    $idflt{$id}=1 if $id;
    }
  close($idf);
  }
if ($avflt) {
  my @avl=split(/\,/, $avflt); 
  foreach my $avpair (@avl) {
    my ($attr, $value)=split(/\s*=\s*/,$avpair,2);
    $value=~s/[" ]+$//;$value=~s/^[" ]+//;
    $attrflt{$attr}=$value;
    }
  }
my %featflt; # feature list filter -- if $featlist was given
if ($featlist)  {
 #add to the list of attributes to be kept
 my @fl=split(/\,/, $featlist);
 @featflt{@fl}=(); # this is actually an input filter
 }
if ($oattrlist) {
  #add to the list of attributes to be kept
  my @al=split(/\,/, $oattrlist);
  @oattrs{@al}=();
  }
# ---- 
my %gffrecs; # recID =>  [ chr, strand, feat_type,  \%attrs, fstart, fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ]
          #                0      1        2           3      4       5       6        7        8        9     10   ,   11,     12
          # recID has the prefix '<chr>|' which should be removed before output
my $gffh;
if ($input_gff eq '-') {
  open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN $!\n");
  }
  else {
  open($gffh, $input_gff) ||  die("Error opening file $input_gff $!\n");
  }

loadGff($gffh, \%gffrecs);
my @sorted_recs=sort sortByLoc keys(%gffrecs);
processGffRecs(\%gffrecs, \@sorted_recs);

# --
if ($outfile) {
 select(STDOUT);
 close(OUTF);
 }

#************ Subroutines **************
sub sortByLoc {
 my $da=$gffrecs{$a};
 my $db=$gffrecs{$b};
 if ($$da[0] eq $$db[0]) {
    return ($$da[4]==$$db[4]) ? $$da[5] <=> $$db[5] : $$da[4] <=> $$db[4] ;
    }
  else { return $$da[0] cmp $$db[0] ; }
}

sub checkOvlExons {
 my ($a, $b, $rx)=@_;
 return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain
 foreach my $x (@$rx) {
   return 1 if ($a<=$$x[1] && $b>=$$x[0]);
   return 0 if $b<$$x[0];
   }
}

sub checkWithinExons {
 my ($a, $b, $rx)=@_; #checks if interval $a-$b is contained in any @$rx interval
 return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain
 foreach my $x (@$rx) {
   return 1 if ($a>=$$x[0] && $b<=$$x[1]);
   return 0 if $b<$$x[0];
   }
}


sub loadGff {
 my ($gffhandle, $recs)=@_;
 while (<$gffhandle>) {
   next if m/^\s*#/;
   chomp;
   my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
   next unless $fstart>1 && $lnum;
   $track=$out_track if $out_track;
   next if exists($ignoredFeatures{lc($f)});
   $f='exon' if $f=~m/exon/i;
   $f='CDS' if $f=~m/^cds$/i;
   my $rejected;
   $rejected=1 if ($featlist && not exists($featflt{$f}));
   #next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
   my $gff3_ID;
   my $gff3_Parent;
   my ($gname,$tdescr);
   my %attrs;
   ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
   #$lnum=~s/"([^"]+)\;([^"]+)"/"$1.$2"/g; #protect ; within text between quotes
   my @av=split(/\s*\;\s*/,$lnum);
   ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
   ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
   my $isGFF3 = ($gff3_ID || $gff3_Parent);
   if ($isGFF3) { # GFF format
      $gff3_ID=~tr/"//d; #"
      $gff3_Parent=~tr/"//d; #"
      $gff3_Parent='' if ($f =~m/RNA/); # we really don't care about parent for RNA features
      if ($gff3_ID && !$gff3_Parent) { #top level feature
         foreach my $a (@av) {
            my ($attr, $value)=split(/\s*=\s*/,$a,2);
            $value=~s/[" ]+$//;$value=~s/^[" ]+//;
            $attrs{$attr}=$value;
            }
         if ($f=~m/RNA/i || $f=~/gene/) {
           # try to parse the description, if any
           
           if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
             $tdescr=$1;
             }
            elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
             $tdescr=$1;
             }
           if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
             $gname=$1;
             }
            elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
             $gname=$1;
             }
           $tdescr='' if ($tdescr eq $gname);
           $gname='' if $gname eq $gff3_ID;
           }
         die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs->{"$chr|$gff3_ID"}));
         my $recID="$chr|$gff3_ID";
         $recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore];
         next;
         } # parent/top-level feature
      } #GFF
     else { #GTF format
      if ($f eq 'transcript') { # GTF with parent 'transcript' feature
        foreach my $a (@av) {
           my ($attr, $value)=split(/\s+"/,$a,2); #"
           $value=~s/[" ]+$//;
           if ($attr eq 'transcript_id') {
                     $attr='ID';
                     }
                  elsif ($attr eq 'gene_id') {
                     $attr='Name';
                     }
           $attrs{$attr}=$value;
           }
        my $recID=$attrs{'ID'};
        die("Error: cannot find transcript_id for GTF 'transcript' line:\n$_\n") unless $recID;
        die("Error: duplicate feature $recID on $chr\n") if (exists($recs->{"$chr|$recID"}));
        $recID=$chr.'|'.$recID;
        $recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore ];
        next;
        } # parent 'transcript' feature in GTF
      }
   # -------------- exon/CDS line here:
   next if ($featlist && !exists($featflt{$f}));
   my $recID;
   ($gname, $tdescr)=();
   if ($isGFF3) {
      $recID=$gff3_Parent;
      }
    elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
     $recID=$1;
     $recID=~tr/"//d; #"
     }
    elsif ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
     $recID=$chr.'.jsm.'.$lnum;
     $gff3_Parent=$recID;
     $isGFF3=1;
     $f='CDS';
     }
    else {
     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
     }
   if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
      $gname=$1;
      $gname=~tr/"//d; #"
      }
   $tdescr='' if index($recID, $tdescr)>=0;
   $gname='' if index($recID, $gname)>=0;
   $recID=$chr.'|'.$recID;
   my $ld = $recs->{$recID};
   if ($ld) { #existing entry
     my $i=($f eq 'CDS') ? 7 : 6;
     my ($lstart, $lend)=($$ld[4], $$ld[5]);
     $$ld[4]=$fstart if $fstart<$lstart;
     $$ld[5]=$fend if $fend>$lend;
     push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame]);
     if ($f ne 'CDS') {
        if ($$ld[11] && $$ld[11] ne $f) {
           die("Error: multiple non-CDS subfeatures found for $recID ($$ld[11], $f)\n");
           }
        $$ld[11]=$f;
        }
     }
    else { # first time seeing this locus/gene
     # get the attributes from this first exon line
     if ($gff3_Parent) { #has GFF3 Parent
       foreach my $a (@av) {
           my ($attr, $value)=split(/\s*=\s*/,$a,2); #"
           $attr='ID' if $attr eq 'Parent';
           $value=~s/[" ]+$//;$value=~s/^[" ]+//;
           }
        }
      else { # GTF
       foreach my $a (@av) {
           my ($attr, $value)=split(/\s+"/,$a,2); #"
           $value=~s/[" ]+$//;
           if ($attr eq 'transcript_id') {
                     $attr='ID';
                     }
                  elsif ($attr eq 'gene_id') {
                     $attr='Name';
                     }
           $attrs{$attr}=$value unless $attr=~m/^exon/;;
           }
       }
     $recs->{$recID} = ($f eq 'CDS') ? 
           [$chr, $strand, $f, {%attrs}, $fstart, $fend,          [],      [[$fstart, $fend, $fscore, $frame]], $isGFF3, $rejected, $track, '','.' ] :
           [$chr, $strand, $f, {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame]], [],               $isGFF3, $rejected, $track, $f,'.' ] ;
         #   0       1      2     3        4       5       6(exons)                          7 (CDS)              8          9        10
     }
 }
close($gffh);
}

sub processGffRecs {
 #return if keys(%recs)==0;
 my ($recs, $rlist)=@_;
 my @recs_keys;
 unless ($rlist) {
    @recs_keys=keys(%$recs);
    $rlist=\@recs_keys;
    }
 foreach my $recid (@$rlist) {
   my $td=$$recs{$recid};
   #     0       1       2        3        4      5       6    7     8        9     10      11     12
   my ($chr, $strand, $feature, $attrs, $fstart, $fend,  $er, $cr, $isGFF3, $rej, $track, $subf, $fscore) = @$td;
   next if ($rej && !$fltinvert);
   next if ($fltinvert && $featlist && !$rej);
   # my ($mstart,$mend)=($fstart, $fend);
   my $CDSonly=0; # set to 1 if only CDS segments were given
   my $hasCDS=0;
   my @ex;
   my @cds;
   #some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
   if (@$er<1 && @$cr>0) {
     @ex = sort { $a->[0] <=> $b->[0] } @$cr;
     @cds=@ex;
     $CDSonly=1;
     $hasCDS=1;
     }
    else {
     @ex = sort { $a->[0] <=> $b->[0] } @$er;
     if (@$cr>0) { # sort cds segments too
       @cds= sort { $a->[0] <=> $b->[0] } @$cr;
       $hasCDS=1;
       }
     }
   # --------------
   # get the more accurate version of the start-end coords for the feature
   my $covlen=0;
   # map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
   my ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
   my $gffid=$recid;
   substr($gffid, 0, length($chr)+1)='';
   die("Error: gffid ($gffid) not matching attrs\{ID\}(".$attrs-{'ID'}.")!\n")
       unless ($gffid eq $attrs->{'ID'});
   my $doprint=1;
   #check all the filters
   $doprint=0 if ($only_ifCDS && $hasCDS==0);
   if ($doprint && $idfile) {
     if ($idfileattr) {
        $doprint=0 unless exists($idflt{$attrs->{$idfileattr}});
        }
      else {
        $doprint=0 unless exists($idflt{$gffid});
        }
     }
    if ($doprint && $avflt) {
     my $avfound=0;
     foreach my $a (keys(%attrflt)) {
       if ($attrflt{$a} eq $attrs->{$a}) {
          $avfound=1;
          last;
          }
       }
     $doprint=0 unless $avfound;
     }
     

  if ($doprint && $flt_chr && $flt_chr ne $chr) {
    $doprint=0;
    }
  if ($doprint) {
    if ($flt_strand && $flt_strand ne $strand) {
      $doprint=0;
      }
    }
  if ($doprint && @flt_intv>0) {
    if ($r_range) { #t span overlap any range intervals
       $doprint=1 if checkOvlExons($fstart, $fend, \@flt_intv);
       }
     elsif ($c_range) { #t span contained in any of the intervals
       $doprint=1 if checkWithinExons($fstart, $fend, \@flt_intv);
       }
     elsif ($e_range) { # each exon checked for overlap of any interval
       for my $ed (@ex) {
         if (checkOvlExons($$ed[0], $$ed[1], \@flt_intv)) {
            $doprint=1;
            last;
            }
         }
       }
    }
  
  $doprint = ! $doprint if $fltinvert;
  next unless $doprint;
  #filter passed, print output
  if ($tab_attrs) {
     my @od;
     foreach my $a (@atab_cols) {
       if (uc($a) eq 'ID') {
          push(@od, $gffid);
          next;
          }
       push(@od, $attrs->{$a});
       }
     print join("\t",@od)."\n";
     next;
     }
   #the core attributes ID, Name must be there already
   unless (exists($attrs->{Name})) {
     foreach my $name (qw(gene_name gene geneID geneId geneid locus loc ID)) {
       if (exists($attrs->{$name})) {
           $attrs->{Name}=$attrs->{$name};
           last;
           }
          }
     }
   my ($tid, $tname)= (delete($attrs->{ID}), delete($attrs->{Name}));
   my ($gene_name, $gene, $locus);
   unless ($keepAll) {
     if (exists($attrs->{gene_name})) {
          $gene_name=delete($attrs->{gene_name});
          $gene_name=undef if ($gene_name eq $tname && !exists($oattrs{gene_name}));
          }
     if (exists($attrs->{gene})) {
          $gene=delete($attrs->{gene});
          $gene=undef if ($gene eq $tname || $gene eq $gene_name) && !exists($oattrs{gene});
          }
     if (exists($attrs->{locus})) {
          $locus=delete($attrs->{locus});
          $locus=undef if ($locus eq $gene_name || $locus eq $gene_name) && !exists($oattrs{locus});
          }
     }
   my $tattrs;  
   if ($printGTF) { #print GTF
     $tattrs='transcript_id "'.$tid.'"; gene_id "'.$tname.'";';
     $tattrs.=' gene_name "'.$gene_name.'";' if $gene_name;
     $tattrs.=' gene "'.$gene.'";' if $gene;
     $tattrs.=' locus "'.$locus.'";' if $locus;
     if ($keepAll) {
        foreach my $attr (keys(%$attrs)) {
           $tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
           }
        }
       else { # only a subset of attributes will be shown
        foreach my $attr (keys(%$attrs)) {
           next unless exists($oattrs{$attr});
           $tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
           }
        }
     }
    else { #print GFF3
     $tattrs='Parent='.$tid;
     my $pattrs="ID=$tid;Name=$tname";
     $pattrs.='gene_name='.$gene_name.';' if $gene_name;
     $pattrs.='gene='.$gene.';' if $gene;
     $pattrs.='locus='.$locus.';' if $locus;
     if ($keepAll) {
        foreach my $attr (keys(%$attrs)) {
           my $val=$attrs->{$attr};
           $pattrs.="$attr=$val;";
           }
        }
       else { # only a subset of attributes will be shown
        foreach my $attr (keys(%$attrs)) {
           next unless exists($oattrs{$attr});
           my $val=$attrs->{$attr};
           $pattrs.="$attr=$val;";
           }
        }
     print join("\t",$chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', $pattrs)."\n";
     }
   if ($CDSonly==0) { #write exons only when found in the input
        foreach my $ed (@ex) {
            print join("\t",$chr, $track, $subf, $$ed[0], $$ed[1], $$ed[2], $strand, $$ed[3], $tattrs)."\n";
            }
        }
   foreach my $cd (@cds) {
            print join("\t",$chr, $track, 'CDS', $$cd[0], $$cd[1], $$cd[2], $strand, $$cd[3], $tattrs)."\n";
            }
  } #for each stored transcript
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 6 months ago) by gpertea
File size:	18716 byte(s)
Log Message:
Line	User	Rev	File contents
1	gpertea	23	#!/usr/bin/perl
2			use strict;
3			use Getopt::Std;
4			use FindBin;use lib $FindBin::Bin;
5
6			my $usage = q/Usage:
7			gffmanip [-v] [-o outrecords.txt] [{-r\|-c\|-e} <rangedata>] [-C] [-t <track>]\
8			[-i [<attr>:]<ids.txt>] [-a <attr>=<value>[,...]] [-f <feature1>[,..]] \
9			[-k <attrlist>] [-l [{-T\|-L\|-l <attrlist>}] [<gff\/gtf_file>]
10
11			Filtering options:
12			-C output only those transcripts having CDS features
13			-f output only the features matching any of the strings in the given
14			comma-delimited list (e.g. -f 'mRNA,exon,CDS')
15			-v show records that do NOT match the given filtering options (like grep's -v)
16			-r show only records overlapping any of the intervals given in <rangedata>,
17			which has the format:
18			[<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]]
19			-c same as -r, but only shows records fully contained in the interval(s) given
20			-e outputs records whose exon\/CDS segments overlap the interval(s) given
21			-i only shows records whose ID match any of the entries in the file <ids.txt>;
22			unless another attribute is given as a prefix (<attr>:), the ID attribute
23			is used for GFF3 input, and transcript_id for GTF
24			-a only shows records having the <attr> attribute with the value <value>;
25			multiple attribute\/value pairs can be given (comma delimited) and a record
26			will be shown if there is at least one attribute match
27
28			Output options:
29			-K keep all attributes ; by default only the core GFF\/GTF attributes are
30			shown (transcript_id, gene_id, gene_name, ID, Parent, Name, gene_name)
31			-k keep only the specified non-core attributes in the output records
32			-T output GTF format (default is GFF3)
33			-L output only the list of transcript IDs found in the file
34			(and matching the filtering options)
35			-l output the value of all attributes in <attrlist> for each record
36			(tab delimited if more than one attribute is given)
37			-t replace the 2nd column with the given <track> text
38			/;
39			umask 0002;
40			getopts('vTKACLf:r:l:c:e:i:a:t:k:o:') \|\| die($usage."\n");
41			die("$usage\n") unless @ARGV>0;
42			die("Only one input file is expected (or '-' for stdin)!\n") unless @ARGV==1;
43			my $input_gff=$ARGV[0];
44			my $outfile=$Getopt::Std::opt_o;
45			my %ignoredFeatures;
46			@ignoredFeatures{qw(intron cds_start cds_stop start_codon stop_codon start stop cdsstart cdsstop)}=();
47			my %oattrs; #hash with attrs to print
48			@oattrs{qw(ID transcript_id Parent gene_id gene gene_name)}=();
49			my %attrflt; # attribute=>value filter
50			my %idflt; # hash with IDs to keep
51			my %atab; # hash with attribute names whose values will be listed as tab delimited
52			my @atab_cols;
53			if ($outfile) {
54			open(OUTF, '>'.$outfile) \|\| die("Error creating output file $outfile\n");
55			select(OUTF);
56			}
57			# --
58			my ($fltinvert, $printGTF, $out_track)=
59			($Getopt::Std::opt_v, $Getopt::Std::opt_T, $Getopt::Std::opt_t);
60			my $only_ifCDS=$Getopt::Std::opt_v;
61			my ($idfile, $avflt, $featlist, $oattrlist)=
62			($Getopt::Std::opt_i, $Getopt::Std::opt_a, $Getopt::Std::opt_f, $Getopt::Std::opt_k);
63			my $keepAll=$Getopt::Std::opt_K \|\| $Getopt::Std::opt_A \|\| (lc($oattrlist) eq 'a' \|\| lc($oattrlist) eq 'all');
64			my ($r_range, $c_range, $e_range)=
65			($Getopt::Std::opt_r, $Getopt::Std::opt_c, $Getopt::Std::opt_e);
66			my $check_range;
67			if ($r_range) {
68			$check_range=$r_range;
69			die("Error: options -r, -c and -e are mutually exclusive!\n") if ($c_range \|\| $e_range);
70			}
71			elsif ($c_range) {
72			$check_range=$c_range;
73			die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range \|\| $e_range);
74			}
75			elsif ($e_range) {
76			$check_range=$e_range;
77			die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range \|\| $c_range);
78			}
79			my ($flt_chr, $flt_strand);
80			my @flt_intv;
81			if ($check_range) {
82			($flt_chr, my $ck_rlst)=split(/\:/,$check_range);
83			#die("$usage Incorrect format for the interval list!\n") unless $flt_chr && $ck_rlst;
84			my $flt_strand=substr($flt_chr,0,1);
85			if ($flt_strand eq '-' \|\| $flt_strand eq '+') {
86			substr($flt_chr,0,1)='';
87			}
88			else {
89			$flt_strand=undef;
90			my $e=chop($flt_chr);
91			if ($e eq '-' \|\| $e eq '+') {
92			$flt_strand=$e;
93			}
94			else { $flt_chr.=$e; }
95			} #no strand
96			my @flt_intv;
97			if ($ck_rlst) {
98			my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$ck_rlst));
99			foreach my $d (@rdata) {
100			($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1];
101			}
102			@flt_intv = sort { $a->[0] <=> $b->[0] } @rdata;
103			}
104			}
105			#my $range=$flt_chr.':'.$ck_rex[0]->[0].'-'.$ex[-1]->[1];
106
107			$oattrlist='' if $keepAll;
108			my $tab_attrs=$Getopt::Std::opt_l;
109			$tab_attrs='ID' if $Getopt::Std::opt_L;
110			if ($tab_attrs) {
111			@atab_cols=split(/\,/, $tab_attrs);
112			@atab{@atab_cols}=();
113			}
114			my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id
115			if ($idfile) {
116			unless (-f $idfile) {
117			my ($a,$f)=(split(/\:/,$idfile));
118			($idfile, $idfileattr)=($f,$a) if $f;
119			}
120			my $idf;
121			if ($idfile eq '-') {
122			open($idf, "<&=STDIN") \|\| die("Error: couldn't alias STDIN. $!\n");
123			}
124			else {
125			open($idf, $idfile) \|\| die("Error: cannot open $idfile $!\n");
126			}
127			while (<$idf>) {
128			my ($id)=(m/(\S+)/);
129			$idflt{$id}=1 if $id;
130			}
131			close($idf);
132			}
133			if ($avflt) {
134			my @avl=split(/\,/, $avflt);
135			foreach my $avpair (@avl) {
136			my ($attr, $value)=split(/\s=\s/,$avpair,2);
137			$value=~s/[" ]+$//;$value=~s/^[" ]+//;
138			$attrflt{$attr}=$value;
139			}
140			}
141			my %featflt; # feature list filter -- if $featlist was given
142			if ($featlist) {
143			#add to the list of attributes to be kept
144			my @fl=split(/\,/, $featlist);
145			@featflt{@fl}=(); # this is actually an input filter
146			}
147			if ($oattrlist) {
148			#add to the list of attributes to be kept
149			my @al=split(/\,/, $oattrlist);
150			@oattrs{@al}=();
151			}
152			# ----
153			my %gffrecs; # recID => [ chr, strand, feat_type, \%attrs, fstart, fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ]
154			# 0 1 2 3 4 5 6 7 8 9 10 , 11, 12
155			# recID has the prefix '<chr>\|' which should be removed before output
156			my $gffh;
157			if ($input_gff eq '-') {
158			open($gffh, "<&=STDIN") \|\| die("Error: couldn't alias STDIN $!\n");
159			}
160			else {
161			open($gffh, $input_gff) \|\| die("Error opening file $input_gff $!\n");
162			}
163
164			loadGff($gffh, \%gffrecs);
165			my @sorted_recs=sort sortByLoc keys(%gffrecs);
166			processGffRecs(\%gffrecs, \@sorted_recs);
167
168			# --
169			if ($outfile) {
170			select(STDOUT);
171			close(OUTF);
172			}
173
174			#********** Subroutines ************
175			sub sortByLoc {
176			my $da=$gffrecs{$a};
177			my $db=$gffrecs{$b};
178			if ($$da[0] eq $$db[0]) {
179			return ($$da[4]==$$db[4]) ? $$da[5] <=> $$db[5] : $$da[4] <=> $$db[4] ;
180			}
181			else { return $$da[0] cmp $$db[0] ; }
182			}
183
184			sub checkOvlExons {
185			my ($a, $b, $rx)=@_;
186			return 0 if ($a>$$rx[-1]->[1] \|\| $b<$$rx[0]->[0]); # not overlapping the whole exon chain
187			foreach my $x (@$rx) {
188			return 1 if ($a<=$$x[1] && $b>=$$x[0]);
189			return 0 if $b<$$x[0];
190			}
191			}
192
193			sub checkWithinExons {
194			my ($a, $b, $rx)=@_; #checks if interval $a-$b is contained in any @$rx interval
195			return 0 if ($a>$$rx[-1]->[1] \|\| $b<$$rx[0]->[0]); # not overlapping the whole exon chain
196			foreach my $x (@$rx) {
197			return 1 if ($a>=$$x[0] && $b<=$$x[1]);
198			return 0 if $b<$$x[0];
199			}
200			}
201
202
203			sub loadGff {
204			my ($gffhandle, $recs)=@_;
205			while (<$gffhandle>) {
206			next if m/^\s*#/;
207			chomp;
208			my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
209			next unless $fstart>1 && $lnum;
210			$track=$out_track if $out_track;
211			next if exists($ignoredFeatures{lc($f)});
212			$f='exon' if $f=~m/exon/i;
213			$f='CDS' if $f=~m/^cds$/i;
214			my $rejected;
215			$rejected=1 if ($featlist && not exists($featflt{$f}));
216			#next if $f eq 'gene' \|\| $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
217			my $gff3_ID;
218			my $gff3_Parent;
219			my ($gname,$tdescr);
220			my %attrs;
221			($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
222			#$lnum=~s/"([^"]+)\;([^"]+)"/"$1.$2"/g; #protect ; within text between quotes
223			my @av=split(/\s\;\s/,$lnum);
224			($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
225			($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
226			my $isGFF3 = ($gff3_ID \|\| $gff3_Parent);
227			if ($isGFF3) { # GFF format
228			$gff3_ID=~tr/"//d; #"
229			$gff3_Parent=~tr/"//d; #"
230			$gff3_Parent='' if ($f =~m/RNA/); # we really don't care about parent for RNA features
231			if ($gff3_ID && !$gff3_Parent) { #top level feature
232			foreach my $a (@av) {
233			my ($attr, $value)=split(/\s=\s/,$a,2);
234			$value=~s/[" ]+$//;$value=~s/^[" ]+//;
235			$attrs{$attr}=$value;
236			}
237			if ($f=~m/RNA/i \|\| $f=~/gene/) {
238			# try to parse the description, if any
239
240			if ($lnum=~m/\b(?:descr\|tophit\|info\|product)\s=\s"?([^;"]+)/i) {
241			$tdescr=$1;
242			}
243			elsif ($lnum=~m/Name\s=\s"?([^;"]+)/) {
244			$tdescr=$1;
245			}
246			if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
247			$gname=$1;
248			}
249			elsif ($lnum=~m/Name\s=\s"?([^;"]+)/) {
250			$gname=$1;
251			}
252			$tdescr='' if ($tdescr eq $gname);
253			$gname='' if $gname eq $gff3_ID;
254			}
255			die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs->{"$chr\|$gff3_ID"}));
256			my $recID="$chr\|$gff3_ID";
257			$recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore];
258			next;
259			} # parent/top-level feature
260			} #GFF
261			else { #GTF format
262			if ($f eq 'transcript') { # GTF with parent 'transcript' feature
263			foreach my $a (@av) {
264			my ($attr, $value)=split(/\s+"/,$a,2); #"
265			$value=~s/[" ]+$//;
266			if ($attr eq 'transcript_id') {
267			$attr='ID';
268			}
269			elsif ($attr eq 'gene_id') {
270			$attr='Name';
271			}
272			$attrs{$attr}=$value;
273			}
274			my $recID=$attrs{'ID'};
275			die("Error: cannot find transcript_id for GTF 'transcript' line:\n$_\n") unless $recID;
276			die("Error: duplicate feature $recID on $chr\n") if (exists($recs->{"$chr\|$recID"}));
277			$recID=$chr.'\|'.$recID;
278			$recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore ];
279			next;
280			} # parent 'transcript' feature in GTF
281			}
282			# -------------- exon/CDS line here:
283			next if ($featlist && !exists($featflt{$f}));
284			my $recID;
285			($gname, $tdescr)=();
286			if ($isGFF3) {
287			$recID=$gff3_Parent;
288			}
289			elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\\|\-]+)/) {
290			$recID=$1;
291			$recID=~tr/"//d; #"
292			}
293			elsif ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
294			$recID=$chr.'.jsm.'.$lnum;
295			$gff3_Parent=$recID;
296			$isGFF3=1;
297			$f='CDS';
298			}
299			else {
300			die("Error: cannot parse locus/transcript name from input line:\n$_\n");
301			}
302			if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\\|\-]+)/) {
303			$gname=$1;
304			$gname=~tr/"//d; #"
305			}
306			$tdescr='' if index($recID, $tdescr)>=0;
307			$gname='' if index($recID, $gname)>=0;
308			$recID=$chr.'\|'.$recID;
309			my $ld = $recs->{$recID};
310			if ($ld) { #existing entry
311			my $i=($f eq 'CDS') ? 7 : 6;
312			my ($lstart, $lend)=($$ld[4], $$ld[5]);
313			$$ld[4]=$fstart if $fstart<$lstart;
314			$$ld[5]=$fend if $fend>$lend;
315			push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame]);
316			if ($f ne 'CDS') {
317			if ($$ld[11] && $$ld[11] ne $f) {
318			die("Error: multiple non-CDS subfeatures found for $recID ($$ld[11], $f)\n");
319			}
320			$$ld[11]=$f;
321			}
322			}
323			else { # first time seeing this locus/gene
324			# get the attributes from this first exon line
325			if ($gff3_Parent) { #has GFF3 Parent
326			foreach my $a (@av) {
327			my ($attr, $value)=split(/\s=\s/,$a,2); #"
328			$attr='ID' if $attr eq 'Parent';
329			$value=~s/[" ]+$//;$value=~s/^[" ]+//;
330			}
331			}
332			else { # GTF
333			foreach my $a (@av) {
334			my ($attr, $value)=split(/\s+"/,$a,2); #"
335			$value=~s/[" ]+$//;
336			if ($attr eq 'transcript_id') {
337			$attr='ID';
338			}
339			elsif ($attr eq 'gene_id') {
340			$attr='Name';
341			}
342			$attrs{$attr}=$value unless $attr=~m/^exon/;;
343			}
344			}
345			$recs->{$recID} = ($f eq 'CDS') ?
346			[$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [[$fstart, $fend, $fscore, $frame]], $isGFF3, $rejected, $track, '','.' ] :
347			[$chr, $strand, $f, {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame]], [], $isGFF3, $rejected, $track, $f,'.' ] ;
348			# 0 1 2 3 4 5 6(exons) 7 (CDS) 8 9 10
349			}
350			}
351			close($gffh);
352			}
353
354			sub processGffRecs {
355			#return if keys(%recs)==0;
356			my ($recs, $rlist)=@_;
357			my @recs_keys;
358			unless ($rlist) {
359			@recs_keys=keys(%$recs);
360			$rlist=\@recs_keys;
361			}
362			foreach my $recid (@$rlist) {
363			my $td=$$recs{$recid};
364			# 0 1 2 3 4 5 6 7 8 9 10 11 12
365			my ($chr, $strand, $feature, $attrs, $fstart, $fend, $er, $cr, $isGFF3, $rej, $track, $subf, $fscore) = @$td;
366			next if ($rej && !$fltinvert);
367			next if ($fltinvert && $featlist && !$rej);
368			# my ($mstart,$mend)=($fstart, $fend);
369			my $CDSonly=0; # set to 1 if only CDS segments were given
370			my $hasCDS=0;
371			my @ex;
372			my @cds;
373			#some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
374			if (@$er<1 && @$cr>0) {
375			@ex = sort { $a->[0] <=> $b->[0] } @$cr;
376			@cds=@ex;
377			$CDSonly=1;
378			$hasCDS=1;
379			}
380			else {
381			@ex = sort { $a->[0] <=> $b->[0] } @$er;
382			if (@$cr>0) { # sort cds segments too
383			@cds= sort { $a->[0] <=> $b->[0] } @$cr;
384			$hasCDS=1;
385			}
386			}
387			# --------------
388			# get the more accurate version of the start-end coords for the feature
389			my $covlen=0;
390			# map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
391			my ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
392			my $gffid=$recid;
393			substr($gffid, 0, length($chr)+1)='';
394			die("Error: gffid ($gffid) not matching attrs\{ID\}(".$attrs-{'ID'}.")!\n")
395			unless ($gffid eq $attrs->{'ID'});
396			my $doprint=1;
397			#check all the filters
398			$doprint=0 if ($only_ifCDS && $hasCDS==0);
399			if ($doprint && $idfile) {
400			if ($idfileattr) {
401			$doprint=0 unless exists($idflt{$attrs->{$idfileattr}});
402			}
403			else {
404			$doprint=0 unless exists($idflt{$gffid});
405			}
406			}
407			if ($doprint && $avflt) {
408			my $avfound=0;
409			foreach my $a (keys(%attrflt)) {
410			if ($attrflt{$a} eq $attrs->{$a}) {
411			$avfound=1;
412			last;
413			}
414			}
415			$doprint=0 unless $avfound;
416			}
417
418
419			if ($doprint && $flt_chr && $flt_chr ne $chr) {
420			$doprint=0;
421			}
422			if ($doprint) {
423			if ($flt_strand && $flt_strand ne $strand) {
424			$doprint=0;
425			}
426			}
427			if ($doprint && @flt_intv>0) {
428			if ($r_range) { #t span overlap any range intervals
429			$doprint=1 if checkOvlExons($fstart, $fend, \@flt_intv);
430			}
431			elsif ($c_range) { #t span contained in any of the intervals
432			$doprint=1 if checkWithinExons($fstart, $fend, \@flt_intv);
433			}
434			elsif ($e_range) { # each exon checked for overlap of any interval
435			for my $ed (@ex) {
436			if (checkOvlExons($$ed[0], $$ed[1], \@flt_intv)) {
437			$doprint=1;
438			last;
439			}
440			}
441			}
442			}
443
444			$doprint = ! $doprint if $fltinvert;
445			next unless $doprint;
446			#filter passed, print output
447			if ($tab_attrs) {
448			my @od;
449			foreach my $a (@atab_cols) {
450			if (uc($a) eq 'ID') {
451			push(@od, $gffid);
452			next;
453			}
454			push(@od, $attrs->{$a});
455			}
456			print join("\t",@od)."\n";
457			next;
458			}
459			#the core attributes ID, Name must be there already
460			unless (exists($attrs->{Name})) {
461			foreach my $name (qw(gene_name gene geneID geneId geneid locus loc ID)) {
462			if (exists($attrs->{$name})) {
463			$attrs->{Name}=$attrs->{$name};
464			last;
465			}
466			}
467			}
468			my ($tid, $tname)= (delete($attrs->{ID}), delete($attrs->{Name}));
469			my ($gene_name, $gene, $locus);
470			unless ($keepAll) {
471			if (exists($attrs->{gene_name})) {
472			$gene_name=delete($attrs->{gene_name});
473			$gene_name=undef if ($gene_name eq $tname && !exists($oattrs{gene_name}));
474			}
475			if (exists($attrs->{gene})) {
476			$gene=delete($attrs->{gene});
477			$gene=undef if ($gene eq $tname \|\| $gene eq $gene_name) && !exists($oattrs{gene});
478			}
479			if (exists($attrs->{locus})) {
480			$locus=delete($attrs->{locus});
481			$locus=undef if ($locus eq $gene_name \|\| $locus eq $gene_name) && !exists($oattrs{locus});
482			}
483			}
484			my $tattrs;
485			if ($printGTF) { #print GTF
486			$tattrs='transcript_id "'.$tid.'"; gene_id "'.$tname.'";';
487			$tattrs.=' gene_name "'.$gene_name.'";' if $gene_name;
488			$tattrs.=' gene "'.$gene.'";' if $gene;
489			$tattrs.=' locus "'.$locus.'";' if $locus;
490			if ($keepAll) {
491			foreach my $attr (keys(%$attrs)) {
492			$tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
493			}
494			}
495			else { # only a subset of attributes will be shown
496			foreach my $attr (keys(%$attrs)) {
497			next unless exists($oattrs{$attr});
498			$tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
499			}
500			}
501			}
502			else { #print GFF3
503			$tattrs='Parent='.$tid;
504			my $pattrs="ID=$tid;Name=$tname";
505			$pattrs.='gene_name='.$gene_name.';' if $gene_name;
506			$pattrs.='gene='.$gene.';' if $gene;
507			$pattrs.='locus='.$locus.';' if $locus;
508			if ($keepAll) {
509			foreach my $attr (keys(%$attrs)) {
510			my $val=$attrs->{$attr};
511			$pattrs.="$attr=$val;";
512			}
513			}
514			else { # only a subset of attributes will be shown
515			foreach my $attr (keys(%$attrs)) {
516			next unless exists($oattrs{$attr});
517			my $val=$attrs->{$attr};
518			$pattrs.="$attr=$val;";
519			}
520			}
521			print join("\t",$chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', $pattrs)."\n";
522			}
523			if ($CDSonly==0) { #write exons only when found in the input
524			foreach my $ed (@ex) {
525			print join("\t",$chr, $track, $subf, $$ed[0], $$ed[1], $$ed[2], $strand, $$ed[3], $tattrs)."\n";
526			}
527			}
528			foreach my $cd (@cds) {
529			print join("\t",$chr, $track, 'CDS', $$cd[0], $$cd[1], $$cd[2], $strand, $$cd[3], $tattrs)."\n";
530			}
531			} #for each stored transcript
532			}