[ViewVC] Annotation of: gclib/scripts/gfflt

#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

my $usage = q/Usage:
gfflt [-c <mincov>] [-p <minpid>] [-P] [-l <fsize_file>] [-s<strand>][{-C|-E}] \
  [-D|-T] [-f <ftstr>] [-t <track>] [-r <start>-<end>] [-n<ID_suffix>] \
  [-i <maxintron>] [-u dupIDs.tab] [-w <map_out_file>] [-o <outfile>]

 Note: requires segment subfeatures (CDS\/exons) to be strictly 
       grouped with (i.e. follow) their parent mRNA feature

 -l    : file with extra mapping information, 3 tab-delimited columns
         are expected in each line: 
         <qry_ID> <qry_len> <qry_description>
 -P    : only for PMAP output files, take "Target" coordinates
         as nucleotide-based (divide by 3 to get the actual query protein
         coordinates)
         
Filtering options:
 -r    : only show gff records fully included in between the given coordinates
 -R    : like -r but also show mappings that intersect the given range
 -s    : only show gff records on the given strand
 -u    : duplicate IDs should be discarded according to the given ID table
 -c    : discard mappings with coverage lower than <mincov>;
         unless qreg or qlen attributes are found in the mRNA line,
         this option requires the -l option to provide query lengths
 -p    : discard mappings with percent identity lower than <minpid>;
         requires the score column or the PID attribute for the 'mRNA' 
         feature
 -x    : discard exon mappings with percent identity lower than <xminpid>;
        (coverage and percent identity will be adjusted accordingly)
 -C|-E : read only the CDS (-C) or only the exon segments (-E)
         from the input file (default: read both);
Output modifiers:

  -f   : replace the output segment feature name ("CDS" or "exon")
         with this string (useful for converting "CDS" into "exon" or
         the other way around); requires -C or -E option
  -n   : add the given alphanumeric suffix followed by a counter
         for each mapping of a query sequence (starting at 1)
         (only the first 4 letters of <ID_suffix> will be used)
  -N   : for -n option, the whole ID is replaced with the given string
  -M   : make the "Name" attribute to have the same value with "ID" attribute
  -t   : change the track name (2nd GFF column) into given label
  -D   : add mapping descriptions ("descr=" attribute) as found in the
         3rd column of the <fsize_file> (so it requires -l option)
  -V   : add "cov" and "pid" attributes to the mRNA lines if possible 
         and if not there already
  -S   : strip the output of any other attributes besides ID, Parent and Name
  -T   : write GTF format instead (this is incompatible with -D and -V)
  
/;
umask 0002;
my @inputfiles;
while ($ARGV[0] && $ARGV[0]!~m/^\-/) {
  push(@inputfiles, shift(@ARGV));
  }
getopts('i:t:r:R:l:s:c:x:p:w:f:u:n:o:PMSNCEDTV') || die($usage."\n");
unshift(@ARGV, @inputfiles) if @inputfiles>0;
#-- options
my $outfile=$Getopt::Std::opt_o;
my $dupfile=$Getopt::Std::opt_u;
my $maxintron=$Getopt::Std::opt_i || 450000;
my $stripAttrs=$Getopt::Std::opt_S;
my $fullRename=$Getopt::Std::opt_N;
my $resetName=$Getopt::Std::opt_M;
my $globalCounter=0;
my $inFeat;
$inFeat='exon' if $Getopt::Std::opt_E;
$inFeat='CDS' if $Getopt::Std::opt_C;
my $pmap=$Getopt::Std::opt_P;
my %dups; #duplicate IDs are kept here -- $dups{$id} => $mainid
my %uniq; #key is ctgid_qryid_startcoord_endcoord_covscore
#my $startOnly=$Getopt::Std::opt_S;
my ($minpid, $mincov, $addCounter, $addDescr, $outFeat, $GTF)=
 ($Getopt::Std::opt_p, $Getopt::Std::opt_c, $Getopt::Std::opt_n, 
  $Getopt::Std::opt_D, $Getopt::Std::opt_f, $Getopt::Std::opt_T);
my $g_range = $Getopt::Std::opt_r || $Getopt::Std::opt_R;
my $xrange = 1 if ($g_range && $Getopt::Std::opt_R);
my ($mincoord, $maxcoord);
if ($g_range) {
  ($mincoord, $maxcoord)=($g_range=~m/^(\d*)[\.\-_]+(\d*)$/);
  $mincoord=1 unless $mincoord>0;
  $maxcoord=4000000000 unless $maxcoord>0;
  #print STDERR "range filter: $mincoord .. $maxcoord\n";
  }
my $fltstrand=$Getopt::Std::opt_s;
my $xminpid = $Getopt::Std::opt_x;
my $ntrack=$Getopt::Std::opt_t;
$mincov=~tr/%//d;
$mincov=1 unless $mincov;
$minpid=20 unless $minpid;
$minpid=20 unless $minpid;
$xminpid=12 unless $xminpid;
my $fmapout=$Getopt::Std::opt_w;
if ($fmapout) {
 open(FMAP, '>'.$fmapout) || die("Error creating file $fmapout\n");
 }
my $fsizefile=$Getopt::Std::opt_l;
my %f;
my %n;
if ($fsizefile) {
  open(FSIZE, $fsizefile) || die("Error opening query info file $fsizefile!\n");
  while(<FSIZE>) {
    chomp;
    my @t=split(/\s+/,$_,3);
    next unless $t[1];
    $f{$t[0]}=[$t[1], $t[2]]; # seqlen, description
    }
  close(FSIZE);
  }
if ($dupfile) {
 open(DUPFILE, $dupfile) || die ("Error opening duplicate table file $dupfile!\n");
 while (<DUPFILE>) {
   chomp;
   my @t=split(/\t/);
   foreach my $id (@t) {
      $dups{$id}=$t[0];
      }
   }
 close(DUPFILE);
 }  
my $curqry; #current target (query) name
my ($curqstart, $curqend, $curqlen, $curcov, $curpid, $use_tcoords); #length of the current query
my $curdescr; #defline of the current query
my $covlen; #coverage of current query
my $eScore; #cummulated exon score
my ($gffid, $gchr, $gtrack, $gname, $gstart, $gend, $gscore, $gstrand, $ginfo);
#                       0          1        2       3      4      5        6
my @exd; # list of [exon_start, exon_end, phase, score, qstart, qend, exon_other_attrs]
my @cds; # list of [cds_start, cds_end,   phase, score, qstart, qend, cds_other_attrs]
if ($outfile) {
 open(OUTFILE, '>'.$outfile) || die ("Error creating file $outfile\n");
 select(OUTFILE);
 }
while (<>) {
 next if m/^\s*#/;
 chomp;
 my ($chr, $binfo, $ftype, $fstart, $fend, $fscore,
      $strand, $phase, $finfo)=split(/\t/);
 next if $ftype eq 'gene'; # just discard the superfluous 'gene' entries;
 if ($ftype eq 'mRNA') { #start of a new mapping
    &flush_gbuf();
    ($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $ginfo)=
     ($chr,  $binfo,  $fstart, $fend, $fscore, $strand, $finfo);
    ($gffid)=($ginfo=~m/ID=([^;]+)/);
    die("Error finding the current ID (in attribute field: $finfo)\n") unless $gffid;
    $ginfo=~s/ID=[^;]+\s*;?//;
    $ginfo=~s/Parent=[^;]+\s*;?//;
    if ($ginfo=~s/Name=([^;]+)\s*;?//) {
      $gname=$1;
      }
    if ($ginfo=~m/qlen=(\d+)/i) {
      $curqlen=$1;
      $ginfo=~s/qlen=\d+\s*;?//i;
      }
    if ($ginfo=~m/qreg=(\d+)\-(\d+)/i) {
      ($curqstart, $curqend)=($1,$2);
      $curqstart=1 if $curqstart==0;
      if ($ginfo=~m/qreg=\d+\-\d+\|(\d+)/i) {
         $curqlen=$1;
         $ginfo=~s/qreg=\d+\-\d+\|\d+\s*;?//i;
         }
        else {
         $ginfo=~s/qreg=\d+\-\d+\s*;?//i;
         }
      }
    if ($ginfo=~m/cov=([\d\.]+)/i) {
      $curcov=$1;
      $ginfo=~s/cov=[\d\.]+\s*;?//i;
      }
    if ($ginfo=~m/\bpid=([\d\.]+)/i) {
      $curpid=$1;
      $ginfo=~s/\bpid=[\d\.]+\s*;?//i;
      }
    if ($fscore>50 && $fscore<=100 && !$curpid) {
      $curpid=$fscore;
      }
    next;
    } # mRNA line 
 # vvvvvvvvv  exons or CDS segments vvvvvvvvvvv
 $ftype=~s/\w+\-exon$/exon/i; #jigsaw
 next unless $ftype eq 'exon' || $ftype eq 'CDS';
 my $gtfgeneid;
 my $id;
 if ($finfo=~m/(?:Parent|transcript_id)[=\s]+"?([^;^\s^"]+)/) {
   $id=$1;
   unless ($finfo=~s/Parent=[^;]+\s*;?//) {
      #must be GTF format -- discard anything else but the gene_id, if found;
      ($gtfgeneid)=($finfo=~m/gene_id[=\s]+"?([^;^\s^"]+)/);
      # we could parse all extra GTF attributes here and rebuild them in GFF3 format..
      # .. nah
      $finfo="";
      }
   }
  else { # jigsaw and other over-simplified GFF formats
   my ($geneid)=($finfo=~m/gene_id[=\s]+"?([^;^\s^"]+)/);
   if ($geneid) {
     $id=$geneid;
     $finfo="";
     }
    else {
     ($id)=($finfo=~m/^([^;^\s]+)/);
     if ($binfo=~m/^jigsaw/) {
          ($finfo)=($finfo=~m/(gene_score=[\-\d\.]+)/);
          }
      else { $finfo=""; }
    }
   }
 die("Error getting the current ID from attribute field: $finfo\n") unless $id;
 $finfo=~s/ID=[^;]+\s*;?//;
 $finfo=~s/Name=([^;]+)\s*;?//;
 if (!$gffid || ($gffid ne $id) || ($gchr && $gchr ne $chr)) {
   # start of new mapping here (change in gff ID or base sequence)
   #print STDERR "gffid=$gffid, id=$id (gchr=$gchr)\n";
   &flush_gbuf();
   ($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $ginfo)=
     ($chr,  $binfo,  $fstart, $fend, $fscore, $strand, $finfo);
   $ginfo=~s/Target=[^;]+\s*;?//i;
   $ginfo=~s/qreg=\d+\-\d+\s*;?//i;
   $gname=$gtfgeneid;
   }
 $finfo=~s/gene_score=[\-\d\.]+\s*\;?//i;
 $gffid=$id;
 my ($target, $t1, $t2, $tstrand);
 if ($finfo=~m/Target=([^;]+)/i) {
   my $alninfo=$1;
   ($target, $t1, $t2, $tstrand)=($alninfo=~m/^(\S+)\s+(\d+)\s+(\d+)\s+([\-+])/);
   $finfo=~s/Target=[^;]+\s*;?//i;
   $use_tcoords=1;
   }
  else {
    if ($finfo=~m/qreg=(\d+)\-(\d+)/i) {
      ($t1,$t2)=($1,$2);
      $t1=1 if $t1==0;
      $finfo=~s/qreg=\d+\-\d+\s*;?//i;
      }
   $target=$gffid;
   $target=~s/\.[a-z]{1,5}\d+$//;
   }
 if ($curqry) {
   die("Error: CDS/exon Target entry found out of place ($_)\n")
              if ($target ne $curqry);
   }
  else {
    $curqry=$target; #first time assignment of curqry
    if ($fsizefile) {
         # if ($mincov) {
         my $qd=$f{$curqry};
         die("Error getting data for $curqry from $fsizefile!\n") unless $qd;
         ($curqlen, $curdescr)=@$qd;
         #print STDERR "=====> descr for $curqry: $curdescr\n";
         }
    $covlen=0;
    } # first time assignment of curqry
   $phase='.' if $phase<0;
   ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
   ($t1, $t2)=($t2, $t1) if ($t2<$t1);
   if ($t2 && $pmap) { 
      ($t1, $t2)=(int($t1/3), int($t2/3));
      $t1=1 if $t1<1;
      };
   if ($ftype eq 'exon') {
     #              0        1     2         3       4   5      6
     push(@exd, [$fstart, $fend, $phase,  $fscore, $t1, $t2, $finfo]);
     next;
     }
    else { # ftype must be CDS here
     # --exon entries should come first ?
     #  die("Error: missing query heading for: $_\n") unless $curqry;
     #next if ($minpid && $fscore<$minpid);
     push(@cds, [$fstart, $fend, $phase, $fscore, $t1, $t2, $finfo]);
     }
} # line parsing loop

flush_gbuf() if $curqry;
select(STDOUT);
close(OUTFILE) if ($outfile);
close(FMAP) if ($fmapout);
# END here

#================ SUBROUTINES ============

sub flush_gbuf {
  return unless $gffid;
  check_gff();
  ($gffid, $curqry, $curqlen, $covlen, $curqstart, $curqend, $curcov, $curpid, $use_tcoords)=
   (undef, undef,    0,        0,         0,            0,      0,      0, 0);
  ($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $gname, $ginfo)=(undef) x 8;
  @exd=();
  @cds=();
  $eScore=0;
}

sub check_gff {
 #die ("Error at calling flush_gbuf(): current length must NOT be zero!\n") unless $curqlen>0;
 #$covlen=$covlen/3 if $pmap;
 #my $cov=$curqlen ? ($covlen*100.00)/$curqlen : 0; 
 #return if ($mincov && $cov<$mincov); #failed coverage condition
 
 my $hasCDSonly; # for pblat or some GTF file, we may have no exon segments
 if (@exd==0) {
   @exd=sort { $main::a->[0] <=> $main::b->[0] } @cds; 
   $hasCDSonly=1;
   @cds=();
   }
  else {
   @exd = sort { $main::a->[0] <=> $main::b->[0] } @exd;
   @cds = sort { $main::a->[0] <=> $main::b->[0] } @cds;
   }
 my $prev_end=0;
 #my $covscore=0;
 my $i=0;
 my $numexons=0;
 my @fexd; #filtered set of exons
 my $qcovlen=0; #total aligned length
 my $gcovlen=0; #covered length (in nucleotides)
 my $covscore=0;
 foreach my $x (@exd) {
   $i++;
   my $xpid = ($$x[3]>=10 && $$x[3]<=100.0)? $$x[3] : 0;
   next if ($xpid && $xpid<$xminpid && (($i>1 && $i<@exd) || @exd==1)); #skip internal exons with lower pid
   if ($prev_end>0) {
      my $intronlen=$$x[0]-$prev_end;
      return if $intronlen>$maxintron;
      }
   $prev_end=$$x[1];
   $covscore+=($$x[1]-$$x[0]+1)*$xpid;
   $gcovlen+=($$x[1]-$$x[0]+1);   
   $qcovlen+=($$x[5]-$$x[4]+1) if $$x[5]>0;
   push(@fexd, $x);
   $numexons++;
   }
 return if @fexd==0;
 if ($g_range) {
   if ($xrange) {
    #everything that intersects given range
    return if ($gstart>$maxcoord || $gend<$mincoord);
    }
   else {
    return unless ($gstart>=$mincoord && $gend<=$maxcoord);
    }
   }
 return if ($fltstrand && $gstrand ne $fltstrand);
 my $totpid=sprintf('%.2f',$covscore/$gcovlen);
 my $qlen=$curqlen; #query length, in nucleotides
 my $qcov = ($qcovlen>5 && $qlen>0) ? ($qcovlen*100.00)/$qlen : $curcov;
 #print STDERR "=====> gffID: $gffid  qcov=$qcov\n";
 return if ($numexons<1 || ($totpid>10 && $totpid<$minpid) || ($qcov && $qcov<$mincov));
 #my ($parent)=($ginfo=~m/\bID\s*=\s*([^;]+)/);
 my $parent=$gffid;
 if ($curqry) {
   my $q=$dups{$curqry} || $curqry;
   my $uid="$gchr|$q|$gstart|$gend|$qcov";
   return if exists($uniq{$uid});
   $uniq{$uid}=1;
   }
 if ($addCounter) {
   if ($fullRename) {
     $globalCounter++;
     $parent=$addCounter.$globalCounter;
     }
    else { 
     die("Error: no curqry available for gff ID: $gffid!\n") unless $curqry;
     my $mnum = ++$n{$curqry};
     $parent=$curqry.'.'.$addCounter.$mnum;
     }
   
   }
 my $gffattrs="ID=$parent";
 my $gffname=$gname || $parent;
 if ($stripAttrs || $resetName) {
    $gffname=$parent;
    } 
 $gffattrs.=";Name=$gffname"; #just use a name there so argo can show it
 
 if ($use_tcoords) {
   my @td=sort { $main::a->[4] <=> $main::b->[4] } @fexd;
   $curqstart=$td[0]->[4];
   $curqend=$td[-1]->[5];
   }
 if ($qcov>1) {
   $qcov=100 if $qcov>100;
   $gffattrs.=';cov='.sprintf('%.1f',$qcov) unless $stripAttrs;
   }
 unless ($totpid>0) {
    $totpid=$gscore if $gscore>10 && $gscore<=100.0;
    }
 unless ($covscore) {
    my $rqlen=$qlen;
    $rqlen*=3 if $pmap;
    $covscore=(($rqlen*$qcov)/100.00)*$totpid;
    }
 unless ($stripAttrs) {
  $gffattrs.=';pid='.sprintf('%.2f',$totpid) if $totpid>2; # unless $ginfo=~m/Identity=/;
  $gffattrs.=';covscore='.sprintf('%d',$covscore) if $covscore>10; 
 #$ginfo.=';score='.sprintf('%d',($eScore/100.00)) unless $ginfo=~m/score=/i;
 if ($curqstart && $curqend) {
   $gffattrs.=";qreg=$curqstart-$curqend";
   $gffattrs.="|$curqlen" if $curqlen;
   }
 else {
   $gffattrs.=';qlen='.$curqlen if $curqlen;
   }
  }
 my ($gffdescr)=($ginfo=~m/\b(?:descr|info)=\"([^"]+)/);
 unless ($stripAttrs) { 
 $ginfo=~s/;$//;
 $ginfo=~s/^;//;
 if ($curdescr) {
   $curdescr =~ tr/"\t;=/' .:/s;  #"  
   $curdescr=~s/[\.\;\, ]+$//;
   $ginfo=~s/\b(?:descr|info)=\"[^"]+\"\s*;?//i;
   if ($ginfo) {
      $ginfo.=';descr="'.$curdescr.'"';
      }
    else { $ginfo=';descr="'.$curdescr.'"'; }
   }
 $gffattrs.=";$ginfo" if $ginfo;
 }
 $gffattrs=~tr/; /; /s;
 
 my $gfftrack= $ntrack || $gtrack;  
print join("\t", $gchr, $gfftrack, 'mRNA', $gstart, $gend, $gscore, $gstrand, '.', $gffattrs)."\n" unless $GTF;
# exons:
my $fonly=$outFeat;
if ($outFeat) {
  $fonly=$outFeat;
  }
 else {
  $fonly= $hasCDSonly ? 'CDS':'exon';
  }
my $wroteExons=0;
if (!$inFeat || ($inFeat eq 'exon') || $hasCDSonly) {
  $wroteExons=1;
  foreach my $x (@fexd) {     
     my $xinfo="Parent=$parent";
     unless ($stripAttrs) {
      $xinfo.=";qreg=$$x[4]-$$x[5]" if ($$x[4]>0);
      $$x[6]=~s/^;+//;
      $xinfo.=";$$x[6]" if $$x[6];
      }
     print join("\t", $gchr, $gfftrack, $fonly, $$x[0], $$x[1], $$x[3], $gstrand, $$x[2],$xinfo)."\n";
     }
  }
if ($fmapout) {
  my $descr=$curdescr || $gffdescr;
  print FMAP join("\t", $gchr, $gstrand, $gstart, $gend, $curqry, sprintf('%d',$covscore),
              sprintf('%.1f',$totpid), sprintf('%.1f',$qcov), "$curqstart-$curqend|$curqlen", $descr)."\n";
  }
return if ($outFeat &&  $wroteExons);
$fonly=$outFeat || 'CDS';
if (@cds>0 && (!$inFeat || ($inFeat eq 'CDS'))) {
  foreach my $x (@cds) {
     my $xinfo="Parent=$parent";
     unless ($stripAttrs) {
      $xinfo.=";qreg=$$x[4]-$$x[5]" if ($$x[4]>0);
      $$x[6]=~s/^;+//;
      $xinfo.=";$$x[6]" if $$x[6];
      }
     print join("\t", $gchr, $gfftrack, $fonly, $$x[0], $$x[1], $$x[3], $gstrand, $$x[2], $xinfo)."\n";
    }
  }
}#sub flush_gbuf()
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 6 months ago) by gpertea
File size:	15938 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	use FindBin;use lib $FindBin::Bin;
5
6	my $usage = q/Usage:
7	gfflt [-c <mincov>] [-p <minpid>] [-P] [-l <fsize_file>] [-s<strand>][{-C\|-E}] \
8	[-D\|-T] [-f <ftstr>] [-t <track>] [-r <start>-<end>] [-n<ID_suffix>] \
9	[-i <maxintron>] [-u dupIDs.tab] [-w <map_out_file>] [-o <outfile>]
10
11	Note: requires segment subfeatures (CDS\/exons) to be strictly
12	grouped with (i.e. follow) their parent mRNA feature
13
14	-l : file with extra mapping information, 3 tab-delimited columns
15	are expected in each line:
16	<qry_ID> <qry_len> <qry_description>
17	-P : only for PMAP output files, take "Target" coordinates
18	as nucleotide-based (divide by 3 to get the actual query protein
19	coordinates)
20
21	Filtering options:
22	-r : only show gff records fully included in between the given coordinates
23	-R : like -r but also show mappings that intersect the given range
24	-s : only show gff records on the given strand
25	-u : duplicate IDs should be discarded according to the given ID table
26	-c : discard mappings with coverage lower than <mincov>;
27	unless qreg or qlen attributes are found in the mRNA line,
28	this option requires the -l option to provide query lengths
29	-p : discard mappings with percent identity lower than <minpid>;
30	requires the score column or the PID attribute for the 'mRNA'
31	feature
32	-x : discard exon mappings with percent identity lower than <xminpid>;
33	(coverage and percent identity will be adjusted accordingly)
34	-C\|-E : read only the CDS (-C) or only the exon segments (-E)
35	from the input file (default: read both);
36	Output modifiers:
37
38	-f : replace the output segment feature name ("CDS" or "exon")
39	with this string (useful for converting "CDS" into "exon" or
40	the other way around); requires -C or -E option
41	-n : add the given alphanumeric suffix followed by a counter
42	for each mapping of a query sequence (starting at 1)
43	(only the first 4 letters of <ID_suffix> will be used)
44	-N : for -n option, the whole ID is replaced with the given string
45	-M : make the "Name" attribute to have the same value with "ID" attribute
46	-t : change the track name (2nd GFF column) into given label
47	-D : add mapping descriptions ("descr=" attribute) as found in the
48	3rd column of the <fsize_file> (so it requires -l option)
49	-V : add "cov" and "pid" attributes to the mRNA lines if possible
50	and if not there already
51	-S : strip the output of any other attributes besides ID, Parent and Name
52	-T : write GTF format instead (this is incompatible with -D and -V)
53
54	/;
55	umask 0002;
56	my @inputfiles;
57	while ($ARGV[0] && $ARGV[0]!~m/^\-/) {
58	push(@inputfiles, shift(@ARGV));
59	}
60	getopts('i:t:r:R:l:s:c:x:p:w:f:u:n:o:PMSNCEDTV') \|\| die($usage."\n");
61	unshift(@ARGV, @inputfiles) if @inputfiles>0;
62	#-- options
63	my $outfile=$Getopt::Std::opt_o;
64	my $dupfile=$Getopt::Std::opt_u;
65	my $maxintron=$Getopt::Std::opt_i \|\| 450000;
66	my $stripAttrs=$Getopt::Std::opt_S;
67	my $fullRename=$Getopt::Std::opt_N;
68	my $resetName=$Getopt::Std::opt_M;
69	my $globalCounter=0;
70	my $inFeat;
71	$inFeat='exon' if $Getopt::Std::opt_E;
72	$inFeat='CDS' if $Getopt::Std::opt_C;
73	my $pmap=$Getopt::Std::opt_P;
74	my %dups; #duplicate IDs are kept here -- $dups{$id} => $mainid
75	my %uniq; #key is ctgid_qryid_startcoord_endcoord_covscore
76	#my $startOnly=$Getopt::Std::opt_S;
77	my ($minpid, $mincov, $addCounter, $addDescr, $outFeat, $GTF)=
78	($Getopt::Std::opt_p, $Getopt::Std::opt_c, $Getopt::Std::opt_n,
79	$Getopt::Std::opt_D, $Getopt::Std::opt_f, $Getopt::Std::opt_T);
80	my $g_range = $Getopt::Std::opt_r \|\| $Getopt::Std::opt_R;
81	my $xrange = 1 if ($g_range && $Getopt::Std::opt_R);
82	my ($mincoord, $maxcoord);
83	if ($g_range) {
84	($mincoord, $maxcoord)=($g_range=~m/^(\d)[\.\-_]+(\d)$/);
85	$mincoord=1 unless $mincoord>0;
86	$maxcoord=4000000000 unless $maxcoord>0;
87	#print STDERR "range filter: $mincoord .. $maxcoord\n";
88	}
89	my $fltstrand=$Getopt::Std::opt_s;
90	my $xminpid = $Getopt::Std::opt_x;
91	my $ntrack=$Getopt::Std::opt_t;
92	$mincov=~tr/%//d;
93	$mincov=1 unless $mincov;
94	$minpid=20 unless $minpid;
95	$minpid=20 unless $minpid;
96	$xminpid=12 unless $xminpid;
97	my $fmapout=$Getopt::Std::opt_w;
98	if ($fmapout) {
99	open(FMAP, '>'.$fmapout) \|\| die("Error creating file $fmapout\n");
100	}
101	my $fsizefile=$Getopt::Std::opt_l;
102	my %f;
103	my %n;
104	if ($fsizefile) {
105	open(FSIZE, $fsizefile) \|\| die("Error opening query info file $fsizefile!\n");
106	while(<FSIZE>) {
107	chomp;
108	my @t=split(/\s+/,$_,3);
109	next unless $t[1];
110	$f{$t[0]}=[$t[1], $t[2]]; # seqlen, description
111	}
112	close(FSIZE);
113	}
114	if ($dupfile) {
115	open(DUPFILE, $dupfile) \|\| die ("Error opening duplicate table file $dupfile!\n");
116	while (<DUPFILE>) {
117	chomp;
118	my @t=split(/\t/);
119	foreach my $id (@t) {
120	$dups{$id}=$t[0];
121	}
122	}
123	close(DUPFILE);
124	}
125	my $curqry; #current target (query) name
126	my ($curqstart, $curqend, $curqlen, $curcov, $curpid, $use_tcoords); #length of the current query
127	my $curdescr; #defline of the current query
128	my $covlen; #coverage of current query
129	my $eScore; #cummulated exon score
130	my ($gffid, $gchr, $gtrack, $gname, $gstart, $gend, $gscore, $gstrand, $ginfo);
131	# 0 1 2 3 4 5 6
132	my @exd; # list of [exon_start, exon_end, phase, score, qstart, qend, exon_other_attrs]
133	my @cds; # list of [cds_start, cds_end, phase, score, qstart, qend, cds_other_attrs]
134	if ($outfile) {
135	open(OUTFILE, '>'.$outfile) \|\| die ("Error creating file $outfile\n");
136	select(OUTFILE);
137	}
138	while (<>) {
139	next if m/^\s*#/;
140	chomp;
141	my ($chr, $binfo, $ftype, $fstart, $fend, $fscore,
142	$strand, $phase, $finfo)=split(/\t/);
143	next if $ftype eq 'gene'; # just discard the superfluous 'gene' entries;
144	if ($ftype eq 'mRNA') { #start of a new mapping
145	&flush_gbuf();
146	($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $ginfo)=
147	($chr, $binfo, $fstart, $fend, $fscore, $strand, $finfo);
148	($gffid)=($ginfo=~m/ID=([^;]+)/);
149	die("Error finding the current ID (in attribute field: $finfo)\n") unless $gffid;
150	$ginfo=~s/ID=[^;]+\s*;?//;
151	$ginfo=~s/Parent=[^;]+\s*;?//;
152	if ($ginfo=~s/Name=([^;]+)\s*;?//) {
153	$gname=$1;
154	}
155	if ($ginfo=~m/qlen=(\d+)/i) {
156	$curqlen=$1;
157	$ginfo=~s/qlen=\d+\s*;?//i;
158	}
159	if ($ginfo=~m/qreg=(\d+)\-(\d+)/i) {
160	($curqstart, $curqend)=($1,$2);
161	$curqstart=1 if $curqstart==0;
162	if ($ginfo=~m/qreg=\d+\-\d+\\|(\d+)/i) {
163	$curqlen=$1;
164	$ginfo=~s/qreg=\d+\-\d+\\|\d+\s*;?//i;
165	}
166	else {
167	$ginfo=~s/qreg=\d+\-\d+\s*;?//i;
168	}
169	}
170	if ($ginfo=~m/cov=([\d\.]+)/i) {
171	$curcov=$1;
172	$ginfo=~s/cov=[\d\.]+\s*;?//i;
173	}
174	if ($ginfo=~m/\bpid=([\d\.]+)/i) {
175	$curpid=$1;
176	$ginfo=~s/\bpid=[\d\.]+\s*;?//i;
177	}
178	if ($fscore>50 && $fscore<=100 && !$curpid) {
179	$curpid=$fscore;
180	}
181	next;
182	} # mRNA line
183	# vvvvvvvvv exons or CDS segments vvvvvvvvvvv
184	$ftype=~s/\w+\-exon$/exon/i; #jigsaw
185	next unless $ftype eq 'exon' \|\| $ftype eq 'CDS';
186	my $gtfgeneid;
187	my $id;
188	if ($finfo=~m/(?:Parent\|transcript_id)[=\s]+"?([^;^\s^"]+)/) {
189	$id=$1;
190	unless ($finfo=~s/Parent=[^;]+\s*;?//) {
191	#must be GTF format -- discard anything else but the gene_id, if found;
192	($gtfgeneid)=($finfo=~m/gene_id[=\s]+"?([^;^\s^"]+)/);
193	# we could parse all extra GTF attributes here and rebuild them in GFF3 format..
194	# .. nah
195	$finfo="";
196	}
197	}
198	else { # jigsaw and other over-simplified GFF formats
199	my ($geneid)=($finfo=~m/gene_id[=\s]+"?([^;^\s^"]+)/);
200	if ($geneid) {
201	$id=$geneid;
202	$finfo="";
203	}
204	else {
205	($id)=($finfo=~m/^([^;^\s]+)/);
206	if ($binfo=~m/^jigsaw/) {
207	($finfo)=($finfo=~m/(gene_score=[\-\d\.]+)/);
208	}
209	else { $finfo=""; }
210	}
211	}
212	die("Error getting the current ID from attribute field: $finfo\n") unless $id;
213	$finfo=~s/ID=[^;]+\s*;?//;
214	$finfo=~s/Name=([^;]+)\s*;?//;
215	if (!$gffid \|\| ($gffid ne $id) \|\| ($gchr && $gchr ne $chr)) {
216	# start of new mapping here (change in gff ID or base sequence)
217	#print STDERR "gffid=$gffid, id=$id (gchr=$gchr)\n";
218	&flush_gbuf();
219	($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $ginfo)=
220	($chr, $binfo, $fstart, $fend, $fscore, $strand, $finfo);
221	$ginfo=~s/Target=[^;]+\s*;?//i;
222	$ginfo=~s/qreg=\d+\-\d+\s*;?//i;
223	$gname=$gtfgeneid;
224	}
225	$finfo=~s/gene_score=[\-\d\.]+\s*\;?//i;
226	$gffid=$id;
227	my ($target, $t1, $t2, $tstrand);
228	if ($finfo=~m/Target=([^;]+)/i) {
229	my $alninfo=$1;
230	($target, $t1, $t2, $tstrand)=($alninfo=~m/^(\S+)\s+(\d+)\s+(\d+)\s+([\-+])/);
231	$finfo=~s/Target=[^;]+\s*;?//i;
232	$use_tcoords=1;
233	}
234	else {
235	if ($finfo=~m/qreg=(\d+)\-(\d+)/i) {
236	($t1,$t2)=($1,$2);
237	$t1=1 if $t1==0;
238	$finfo=~s/qreg=\d+\-\d+\s*;?//i;
239	}
240	$target=$gffid;
241	$target=~s/\.[a-z]{1,5}\d+$//;
242	}
243	if ($curqry) {
244	die("Error: CDS/exon Target entry found out of place ($_)\n")
245	if ($target ne $curqry);
246	}
247	else {
248	$curqry=$target; #first time assignment of curqry
249	if ($fsizefile) {
250	# if ($mincov) {
251	my $qd=$f{$curqry};
252	die("Error getting data for $curqry from $fsizefile!\n") unless $qd;
253	($curqlen, $curdescr)=@$qd;
254	#print STDERR "=====> descr for $curqry: $curdescr\n";
255	}
256	$covlen=0;
257	} # first time assignment of curqry
258	$phase='.' if $phase<0;
259	($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
260	($t1, $t2)=($t2, $t1) if ($t2<$t1);
261	if ($t2 && $pmap) {
262	($t1, $t2)=(int($t1/3), int($t2/3));
263	$t1=1 if $t1<1;
264	};
265	if ($ftype eq 'exon') {
266	# 0 1 2 3 4 5 6
267	push(@exd, [$fstart, $fend, $phase, $fscore, $t1, $t2, $finfo]);
268	next;
269	}
270	else { # ftype must be CDS here
271	# --exon entries should come first ?
272	# die("Error: missing query heading for: $_\n") unless $curqry;
273	#next if ($minpid && $fscore<$minpid);
274	push(@cds, [$fstart, $fend, $phase, $fscore, $t1, $t2, $finfo]);
275	}
276	} # line parsing loop
277
278	flush_gbuf() if $curqry;
279	select(STDOUT);
280	close(OUTFILE) if ($outfile);
281	close(FMAP) if ($fmapout);
282	# END here
283
284	#================ SUBROUTINES ============
285
286	sub flush_gbuf {
287	return unless $gffid;
288	check_gff();
289	($gffid, $curqry, $curqlen, $covlen, $curqstart, $curqend, $curcov, $curpid, $use_tcoords)=
290	(undef, undef, 0, 0, 0, 0, 0, 0, 0);
291	($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $gname, $ginfo)=(undef) x 8;
292	@exd=();
293	@cds=();
294	$eScore=0;
295	}
296
297	sub check_gff {
298	#die ("Error at calling flush_gbuf(): current length must NOT be zero!\n") unless $curqlen>0;
299	#$covlen=$covlen/3 if $pmap;
300	#my $cov=$curqlen ? ($covlen*100.00)/$curqlen : 0;
301	#return if ($mincov && $cov<$mincov); #failed coverage condition
302
303	my $hasCDSonly; # for pblat or some GTF file, we may have no exon segments
304	if (@exd==0) {
305	@exd=sort { $main::a->[0] <=> $main::b->[0] } @cds;
306	$hasCDSonly=1;
307	@cds=();
308	}
309	else {
310	@exd = sort { $main::a->[0] <=> $main::b->[0] } @exd;
311	@cds = sort { $main::a->[0] <=> $main::b->[0] } @cds;
312	}
313	my $prev_end=0;
314	#my $covscore=0;
315	my $i=0;
316	my $numexons=0;
317	my @fexd; #filtered set of exons
318	my $qcovlen=0; #total aligned length
319	my $gcovlen=0; #covered length (in nucleotides)
320	my $covscore=0;
321	foreach my $x (@exd) {
322	$i++;
323	my $xpid = ($$x[3]>=10 && $$x[3]<=100.0)? $$x[3] : 0;
324	next if ($xpid && $xpid<$xminpid && (($i>1 && $i<@exd) \|\| @exd==1)); #skip internal exons with lower pid
325	if ($prev_end>0) {
326	my $intronlen=$$x[0]-$prev_end;
327	return if $intronlen>$maxintron;
328	}
329	$prev_end=$$x[1];
330	$covscore+=($$x[1]-$$x[0]+1)*$xpid;
331	$gcovlen+=($$x[1]-$$x[0]+1);
332	$qcovlen+=($$x[5]-$$x[4]+1) if $$x[5]>0;
333	push(@fexd, $x);
334	$numexons++;
335	}
336	return if @fexd==0;
337	if ($g_range) {
338	if ($xrange) {
339	#everything that intersects given range
340	return if ($gstart>$maxcoord \|\| $gend<$mincoord);
341	}
342	else {
343	return unless ($gstart>=$mincoord && $gend<=$maxcoord);
344	}
345	}
346	return if ($fltstrand && $gstrand ne $fltstrand);
347	my $totpid=sprintf('%.2f',$covscore/$gcovlen);
348	my $qlen=$curqlen; #query length, in nucleotides
349	my $qcov = ($qcovlen>5 && $qlen>0) ? ($qcovlen*100.00)/$qlen : $curcov;
350	#print STDERR "=====> gffID: $gffid qcov=$qcov\n";
351	return if ($numexons<1 \|\| ($totpid>10 && $totpid<$minpid) \|\| ($qcov && $qcov<$mincov));
352	#my ($parent)=($ginfo=~m/\bID\s=\s([^;]+)/);
353	my $parent=$gffid;
354	if ($curqry) {
355	my $q=$dups{$curqry} \|\| $curqry;
356	my $uid="$gchr\|$q\|$gstart\|$gend\|$qcov";
357	return if exists($uniq{$uid});
358	$uniq{$uid}=1;
359	}
360	if ($addCounter) {
361	if ($fullRename) {
362	$globalCounter++;
363	$parent=$addCounter.$globalCounter;
364	}
365	else {
366	die("Error: no curqry available for gff ID: $gffid!\n") unless $curqry;
367	my $mnum = ++$n{$curqry};
368	$parent=$curqry.'.'.$addCounter.$mnum;
369	}
370
371	}
372	my $gffattrs="ID=$parent";
373	my $gffname=$gname \|\| $parent;
374	if ($stripAttrs \|\| $resetName) {
375	$gffname=$parent;
376	}
377	$gffattrs.=";Name=$gffname"; #just use a name there so argo can show it
378
379	if ($use_tcoords) {
380	my @td=sort { $main::a->[4] <=> $main::b->[4] } @fexd;
381	$curqstart=$td[0]->[4];
382	$curqend=$td[-1]->[5];
383	}
384	if ($qcov>1) {
385	$qcov=100 if $qcov>100;
386	$gffattrs.=';cov='.sprintf('%.1f',$qcov) unless $stripAttrs;
387	}
388	unless ($totpid>0) {
389	$totpid=$gscore if $gscore>10 && $gscore<=100.0;
390	}
391	unless ($covscore) {
392	my $rqlen=$qlen;
393	$rqlen*=3 if $pmap;
394	$covscore=(($rqlen$qcov)/100.00)$totpid;
395	}
396	unless ($stripAttrs) {
397	$gffattrs.=';pid='.sprintf('%.2f',$totpid) if $totpid>2; # unless $ginfo=~m/Identity=/;
398	$gffattrs.=';covscore='.sprintf('%d',$covscore) if $covscore>10;
399	#$ginfo.=';score='.sprintf('%d',($eScore/100.00)) unless $ginfo=~m/score=/i;
400	if ($curqstart && $curqend) {
401	$gffattrs.=";qreg=$curqstart-$curqend";
402	$gffattrs.="\|$curqlen" if $curqlen;
403	}
404	else {
405	$gffattrs.=';qlen='.$curqlen if $curqlen;
406	}
407	}
408	my ($gffdescr)=($ginfo=~m/\b(?:descr\|info)=\"([^"]+)/);
409	unless ($stripAttrs) {
410	$ginfo=~s/;$//;
411	$ginfo=~s/^;//;
412	if ($curdescr) {
413	$curdescr =~ tr/"\t;=/' .:/s; #"
414	$curdescr=~s/[\.\;\, ]+$//;
415	$ginfo=~s/\b(?:descr\|info)=\"[^"]+\"\s*;?//i;
416	if ($ginfo) {
417	$ginfo.=';descr="'.$curdescr.'"';
418	}
419	else { $ginfo=';descr="'.$curdescr.'"'; }
420	}
421	$gffattrs.=";$ginfo" if $ginfo;
422	}
423	$gffattrs=~tr/; /; /s;
424
425	my $gfftrack= $ntrack \|\| $gtrack;
426	print join("\t", $gchr, $gfftrack, 'mRNA', $gstart, $gend, $gscore, $gstrand, '.', $gffattrs)."\n" unless $GTF;
427	# exons:
428	my $fonly=$outFeat;
429	if ($outFeat) {
430	$fonly=$outFeat;
431	}
432	else {
433	$fonly= $hasCDSonly ? 'CDS':'exon';
434	}
435	my $wroteExons=0;
436	if (!$inFeat \|\| ($inFeat eq 'exon') \|\| $hasCDSonly) {
437	$wroteExons=1;
438	foreach my $x (@fexd) {
439	my $xinfo="Parent=$parent";
440	unless ($stripAttrs) {
441	$xinfo.=";qreg=$$x[4]-$$x[5]" if ($$x[4]>0);
442	$$x[6]=~s/^;+//;
443	$xinfo.=";$$x[6]" if $$x[6];
444	}
445	print join("\t", $gchr, $gfftrack, $fonly, $$x[0], $$x[1], $$x[3], $gstrand, $$x[2],$xinfo)."\n";
446	}
447	}
448	if ($fmapout) {
449	my $descr=$curdescr \|\| $gffdescr;
450	print FMAP join("\t", $gchr, $gstrand, $gstart, $gend, $curqry, sprintf('%d',$covscore),
451	sprintf('%.1f',$totpid), sprintf('%.1f',$qcov), "$curqstart-$curqend\|$curqlen", $descr)."\n";
452	}
453	return if ($outFeat && $wroteExons);
454	$fonly=$outFeat \|\| 'CDS';
455	if (@cds>0 && (!$inFeat \|\| ($inFeat eq 'CDS'))) {
456	foreach my $x (@cds) {
457	my $xinfo="Parent=$parent";
458	unless ($stripAttrs) {
459	$xinfo.=";qreg=$$x[4]-$$x[5]" if ($$x[4]>0);
460	$$x[6]=~s/^;+//;
461	$xinfo.=";$$x[6]" if $$x[6];
462	}
463	print join("\t", $gchr, $gfftrack, $fonly, $$x[0], $$x[1], $$x[3], $gstrand, $$x[2], $xinfo)."\n";
464	}
465	}
466	}#sub flush_gbuf()