gclib/scripts/gffann.pl

#!/usr/bin/perl
use strict;
use Getopt::Std;
my $usage=q/Usage:
gffann.pl [-o <outpuf.gff3>] [-t <fprefix>] [-R] [-m <mrna_gmap.iit>]
  -p <uniref_pmap.iit> [-P <uniref.fa.cidx> <models.gff>]
 
Create a gff file with the same entries like <models.gff> but with
an annotation attribute ("TopHit=...") added to the last field, based
on the best pmap alignment with UniRef proteins.

Prior to running this, the pmap output should be converted with 
gff2iit to the required input file <uniref_pmap.iit>.

If -t option was given, the top 5 protein mappings will be written in gff3 
format into the file <fprefix>.top5pmap.gff3; if -m option was also given
an equivalent gff file with the best mRNA\/EST hits will be created as
<fprefix>.top5gmap.gff3


/;
umask 0002;
getopts('Dp:m:P:o:t:') || die($usage."\n");

my $ftophits=$Getopt::Std::opt_t;
my $outfile=$Getopt::Std::opt_o;
my $debug=$Getopt::Std::opt_D;
my $iitfile=$Getopt::Std::opt_p;
my $miitfile=$Getopt::Std::opt_m;
my $mrefseq=$Getopt::Std::opt_M; #use the -m file as refseq (strong evidence) for annotation
my $protcdb=$Getopt::Std::opt_P;
my $gff=shift(@ARGV) || die("$usage\n");

#my %dup; #to avoid duplicate pmap/gmap entries

#-------- local array holdin uniformative annotation patterns:
my @uninformative=(
'\bunknown\b',
#'unknown protein\b',
'\bhypothetical\b',
'[A-Z]+\d+ cDNA sequence',
'cDNA sequence [A-Z]+\d+',
'uncharacterized protein',
'unnamed protein product',
'open reading frame',
'\borf\b',
'\bputative\b',
'\bhomologue\b',
'\bsimilar to',
'^expressed sequence \S+$',
'\bHA\d{4}\b',
'\bDKFZP\S+\b',
'PROTEIN FOR MGC:\d+',
'PROTEIN FOR IMAGE:\d+',
'\bR\d{5}\_\d\b',
'\bPRO\d{4}\b',
# 'KIAA\d+ GENE PRODUCT',
#'KIAA\d+ PROTEIN',
#'\bKIAA\d+\b',
'\bHSPC\d+\b',
# HSPC\d+ PROTEIN
#'\bC\d+ORF\d+\b',
'FLJ\d+ PROTEIN',
'\bDJ\d+[A-Z]\d+(\.\d+)*',
'NOVEL PROTEIN',
'CG\d+ PROTEIN',
'CG\d+ GENE PRODUCT',
'^\s*CG\d+\s*$',
'CGI\-\d+ PROTEIN',
'CGI\-\d+',
'CDNA:? FLJ\d+ FIS, CLONE \w+',
'BA\d+[A-Z]\d+[A-Z]?\.\d(\.\d)?',
#'\bRIKEN CDNA .{10} GENE\b',
'\bRIKEN.+?CDNA\b',
'MRNA, COMPLETE CDS, CLONE:\d+(\+\d[A-Z])?\-\d+',
'MRNA, COMPLETE CDS, CLONE:SMAP\d+\-\w+',
'BRAIN CDNA, CLONE MNCB-\d+',
'.{10}RIK PROTEIN',
'^MY\d{3}\s*$',
'MY\d{3} PROTEIN^',
'^probable\b',
'BRAIN MY\d{3}$',
'NPD\d{3} PROTEIN',
'[A-Z]\d{2}[A-Z0-9]+\.\d+ PROTEIN',
'WUGSC:H_\w+\.\w+ PROTEIN',
#'DNA SEGMENT, CHR [0-9XY]+, WAYNE STATE UNIVERSITY \d+, EXPRESSED',
#'DNA SEGMENT, CHR [0-9XY]+, KL MOHLKE \d+',
#'DNA SEGMENT, CHR [0-9XY]+, BAYLOR \d+',
'\bDNA SEGMENT\b',
'PROTEIN HSPC\d+',
#'HYPOTHETICAL [\.\d]+\s*KDA PROTEIN \S+ IN CHROMOSOME \S+',
'EG:[0-9A-Z\.]+ PROTEIN',
'GENOMIC DNA, CHROMOSOME \d+, P1 CLONE:\S+',
'[^,]+, RIKEN FULL-LENGTH ENRICHED LIBRARY, CLONE:.{10}, FULL INSERT SEQUENCE',
'ZK\d+\.\d+ PROTEIN',
'\bEST \w+',
'B2 ELEMENT'
);

open(GFF, $gff) || die ("Error: cannot open $gff\n");

#expects the models to be given in the proper order
# 'CDS' exons are the only ones used;
# the first 'mRNA' entry is going to be annotated
my @linebuf; #current gff gene/mRNA line buffer
my ($gchr, $gstrand, @m_cds, $m_cdslen) ;  #current predicted mRNA info
my $curmodel;

if ($outfile) {
  open(OUTF, '>'.$outfile) || die ("Error creating $outfile\n");
  select OUTF;
  }

if ($ftophits) {
  open(TOPHIT, ">$ftophits.top5pmap.gff3") || die ("Error creating $ftophits.top5pmap.gff3\n");
  if ($miitfile) {
    open(MTOPHIT, ">$ftophits.top5gmap.gff3") || die ("Error creating $ftophits.top5gmap.gff3\n");
    }
  }
#print STDERR "gffann.pl processing $gff ..\n";
my $gffline=1;
while ($gffline) {
 $gffline=<GFF>;
 if (defined($gffline) && ($gffline=~m/^\s*#/ || $gffline=~m/^\s+$/ || length($gffline)<4)) {
   #print $_;
   next;
   };
 chomp($gffline);
 my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $info)=split(/\t/, $gffline);
 next if ($f eq 'gene');# skip useless 'gene' entries and other nonsensical lines
 if ($f eq 'mRNA' || !defined($gffline)) { #new record starting, or EOF
   my $line=$gffline;
   if (@m_cds>0) { #previous model's exons are loaded
     @m_cds= sort { $main::a->[0] <=> $main::b->[0] } @m_cds;
     my @phits;
     my ($model_id)=($linebuf[0]=~m/ID=([^;]+)/);
     my ($pann, $pgeneid, $p_mcov, $p_xgaps, $alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps)
                    =annModel($iitfile, \@phits); #protein annotations
     my @mhits;
     my ($mann, $mgeneid, $m_mcov, $m_xgaps, $alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps)
                    =annModel($miitfile, \@mhits, 1) if $miitfile; #mrna/refseq annotations
     if ($p_mcov<2) { 
         if ($m_mcov<2) {
            print STDERR "Warning (gffann.pl $gff): model $model_id has no protein or mRNA coverage!\n";
            }
        else {
            print STDERR "Warning (gffann.pl $gff): model $model_id has no protein coverage!\n";
            }
        }
     
     #--
     my ($ann, $geneid,   $mcov,   $qxgaps,  $altann,   $altgid,      $altcov,  $altxgaps) = 
                ($mgeneid && ($m_mcov>=90 || $p_mcov-$m_mcov<10)) ? 
         # if we have a decent RefSeq annotation, use it
        ($mann, $mgeneid, $m_mcov, $m_xgaps, $alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) : 
        ($pann, $pgeneid, $p_mcov, $p_xgaps, $alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps);
     unless ($altann) {
       ($altann, $altgid, $altcov, $altxgaps) = $alt_mann ? ($alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) : 
                                                            ($alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps);
       }
#      if ($mcov<95) {
#        if ($mcov<$m_mcov) {
#         $ann=$mann; $mcov=$m_mcov; $qxgaps=$m_xgaps;
#         }
#        elsif ($mcov<$p_mcov) {
#         $ann=$pann;$mcov=$p_mcov; $qxgaps=$p_xgaps;
#         }
        print STDERR "Warning (gffann.pl $gff): possible merge? model $model_id coverage by best hit is only $mcov\%!\n" 
           if $mcov<80 && $mcov>1;
#       }
     if ($mcov>10) {
       $linebuf[0].=";mcov=$mcov";
       $linebuf[0].=";geneId=$geneid" if $geneid;
       $linebuf[0].=";qxgap=$qxgaps" if $qxgaps;
       if ($ann) {
         my @s=split(/\x01/, $ann);
         $ann=$s[0] if @s>1;
         $linebuf[0].=";tophit=\"$ann\"";
         }
       $linebuf[0].=";altGeneId=$altgid" if $altgid;
       if ($altann) {
         my @s=split(/\x01/, $altann);
         $altann=$s[0] if @s>1;
         $linebuf[0].=";altTopHit=\"$altann\"";
         $linebuf[0].=";altCov=\"$altcov\"";
         $linebuf[0].=";altQXGap=\"$altxgaps\"" if $altxgaps;
         }
       }
     # -- check the model exons values for exons with 0 evidence count
     foreach my $d (@m_cds) {
       #if ($$d[3]==0) {
      $linebuf[$$d[4]].=';mappingEvCount='.int($$d[3]);
      #if ($$d[3]==0) {
      # print STDERR "Warning: exon $$d[0]-$$d[1] of model $model_id ($geneid) has no mapping evidence registered!\n";
      # }
     }
     print join("\n",@linebuf)."\n";
     
     @m_cds=();$m_cdslen=0;
     @linebuf=();
     @phits=();
     @mhits=();
     $gstrand=$strand;
     $gchr=$chr;
     }
   last unless $gffline;
   chomp($line);
   push(@linebuf, $line);
   next;
   }
 chomp($gffline);  
 if ($f eq 'CDS') { # model exon
   ($curmodel)=($info=~m/Parent=(['"\:\w\|\-\.]+)/);
   $gstrand=$strand;
   $gchr=$chr;
   ($fstart, $fend)=($fend, $fstart) if ($fstart>$fend);
   push(@m_cds, [$fstart, $fend, $frame, 0, scalar(@linebuf)]);
   $m_cdslen+=$fend-$fstart+1;
   }
 push(@linebuf, $gffline);
} #while <GFF>
 
close(GFF);

# if (@m_cds>0) { #final record
#  my ($ann, $geneid, $mcov)=annModel($iitfile);
#  annModel($miitfile, 1) if $miitfile; #discard mrna annotation, just build the top5 file
#  $linebuf[0].=";GeneId=\"$geneid\"" if $geneid;
#  $linebuf[0].=";TopHit=\"$ann\"" if $ann; 
#  print join("\n",@linebuf)."\n";
#  }


sub annModel {
 my ($iitdb, $hits, $mrna)=@_;
 return '' unless (@m_cds>0);
 my ($mstart, $mend)=($m_cds[0]->[0], $m_cds[-1]->[1]);
 #get all overlapping features from iit database
 print STDERR "iit_get $iitdb $mstart $mend\n" if $debug;
 open(IITGET, "iit_get $iitdb $mstart $mend |") ||
  die("Error opening pipe from: iit_get $iitdb $mstart $mend \n");
 #my $maxscore=0;
 #map { $maxscore+= $_->[1]-$_->[0]+1 } @m_cds;
 my ($l_id, $l_exon, $l_cds, $l_info, $l_cov, $qgaps, $track);
 my $line=1;
 while ($line) {
   $line=<IITGET>;
   if (!$line || $line=~m/^>\S+\s/) {
     # header line -- new record
     
     if ($l_id) { ### process the previous record's lines
        $l_exon=$l_cds unless $l_exon;
        my @intv=split(/\,/, $l_exon);
        my @xdata;
        my ($ovlscore, $ovlen)=checkOverlap(\@intv, $l_id, \@xdata, $l_exon);
        if ($ovlscore>0) { #has exon overlap
           my $pid=sprintf('%4.1f',($ovlscore*100.0)/$ovlen);
           my $mcov=sprintf('%d', ($ovlen*100)/$m_cdslen); #model coverage by this hit
           my @cdata; #cds overlap data, if any
           #if ($l_cds) {
           # my @cintv=split(/\,/,$l_cds);
           # my ($cdsovlscore, $cdsovlen)=checkOverlap(\@cintv, $l_id, \@cdata, $l_cds);
           # @cdata=() unless $cdsovlen>10;
           # }
           my $geneid;
           if ($l_info=~m/(?:gid|Gene)\:(\S+)/) {
               $geneid=$1;
               $geneid=~s/\]$//;
               }
              else {
               my $n=$l_id;
               $n=~s/\.[pmrnaexobltsi]+\d+$//;
               #$n=~s/\.mrna\d+$//;
               $geneid=$1 if $n=~m/\|gid\|(.+)$/;
               }
           push(@$hits, [$l_id, $ovlscore, [@xdata], [@cdata], $l_cov, $l_info, $pid, $geneid, $mcov, $track, $qgaps]);
           print STDERR "..adding hit: $l_id, $ovlscore, $l_cov, $l_info, $pid, $geneid, $mcov, $track, $qgaps\n" if $debug;
           }
        #-----
        }
     last unless $line; #end of <IITGET> ?
     ($l_exon, $l_cds, $l_info, $l_cov, $track, $qgaps)=();
     my @d=split(/\s+/, $line);
     if (substr($d[3], -1) ne $gstrand) {
      $l_id=undef;
      next; #not a valid match (opposite strand)
      }
     # get the name of this
     $track=$d[-1];
     $track=~s/gblat/blat/;
     ($l_id)=($line=~m/^>(\S+)/);
     next;
     } #>header line
 next unless $l_id; #skip record
 print STDERR ">parsing data for hit $l_id :\n" if $debug;
 print STDERR "       $line\n" if $debug;
 if ($line=~m/^([iCvx])\:(.+)/) {
    my ($t, $d)=($1,$2);
    if ($t eq 'C') { $l_cds=$d; }
      elsif ($t eq 'v') { $l_cov=$d; }
      elsif ($t eq 'i') { $l_info=$d; 
         $l_info=~s/(Rec|Sub)Name: Full=//;
         $l_info=~s/\. Flags: [^\{]+/. /;
         $l_info=~s/ EC=[\d\-\._]+//g;
         $l_info=~s/\. AltName: [^\{]+/. /;
         $l_info=~s/\. Short=[^\{]+/. /;
         }
      elsif ($t eq 'x') { $qgaps=$d; # print STDERR "added qxgaps: $qgaps\n" if $debug;}
                            }
    }
   elsif ($line=~m/^\d+\-\d+/){
    $l_exon=$line;
    }
 } #while <IITGET>

 close(IITGET);
 return getBestHits($hits, $mrna);
}

if ($outfile) {
 select STDOUT;
 close(OUTF);
 }

if ($ftophits) {
 close(TOPHIT);
 close(MTOPHIT) if $miitfile;
 }

sub checkOverlap {
 my ($intv, $id, $xdata, $l)=@_;
 my ($ovlscore, $ovlen)=(0,0);
 foreach my $iv (@$intv) {
    my ($xstart, $xend, $xscore)=($iv=~m/^(\d+)\-(\d+)\:(\d+)/);
    #$xscore='89' if $xscore eq '.';
    die ("Error parsing $id exon line: $l\n") unless $xstart>0 && $xstart<=$xend;
     my ($ovlsc, $ovll)=&checkExonOvl($xstart, $xend, $xscore);
     push(@$xdata, [$xstart, $xend, $xscore, $ovll]);
     $ovlscore+=$ovlsc;
     $ovlen+=$ovll;
     }
 return ($ovlscore, $ovlen);
}

#---
sub checkExonOvl {
 my ($cstart, $cend, $cscore)=@_;
 # uses global variable @cds for the current model's exons to scan
 # returns the score sum of (overlap_length * $cscore/100 for each exon) 
 my $ovlscore=0;
 my $covlen=0; #cumulative bases coverage
 foreach my $d (@m_cds) { #for each model's exon
   last if $$d[0]>$cend;
   my $covl;
   #get the overlap size:
   if ($$d[0]<$cstart) {
      if ($cstart<$$d[1]) {
        $covl=($cend>$$d[1])? ($$d[1]-$cstart+1) : ($cend-$cstart+1);
        }
      }
     else {
      if ($$d[0]<$cend) {
        $covl=($cend<$$d[1])? $cend-$$d[0]+1 : $$d[1]-$$d[0]+1;
        }
      }
   if ($covl) { #there was overlap with this exon
     $$d[3]++; # increase evidence overlap counter for this model's exon
     $ovlscore += (($covl*$cscore)/100.00);
     $covlen+=$covl;
     }
   }# for each exon of the model
 return ($ovlscore, $covlen);
}


sub ovlTest {
 my ($a1, $a2, $b1, $b2)=@_;
 return ($a1<$b1) ? $b1<$a2 : $a1<$b2;
}

sub getBestHits {
 my ($href, $mrna)=@_;
 #sort by decreasing score
 my @hits = sort { $main::b->[1] <=> $main::a->[1] } @$href;
 $href=[@hits];
 #print STDERR ">$curmodel $gstrand ($mstart .. $mend) :\n" if $debug && !$mrna;
 my ($firsthid, $firstdescr, $firstgid, $first_mcov, $first_qxgaps);
 # uses cdbyank -F -a $pID $protcdb to get the defline of the protein $$href[0]->[0]
 # skips to next best hit until isInformative confirms it
 my ($besthid, $besthit, $best_gid, $best_mcov, $best_qxgaps, 
     $alt_hid, $alt_hit, $alt_gid, $alt_mcov, $alt_qxgaps);
 my $hcount=0;
 my @fregion; #first region covered by the best hit
 foreach my $h (@hits) {
    #my ($prot, $pscore, $pmapr,  $cov, $descr, $mcov) = @$h;
    my ($hid, $ovlscore, $xmapr, $cmapr, $cov, $descr, $pid, $geneid, $mcov, $track, $qgaps)=@$h;
    my $qid=$hid;
    $qid=~s/\.[mrnapbltexosi]+\d+$//; 
    #$qid=~s/\.p[mp]\d+$//;
    my @region=($xmapr->[0]->[0], $xmapr->[-1]->[1]); #start-end coordinates for this hit
    if (@fregion && $firstdescr) { # is this another region?
      unless (&ovlTest(@region, @fregion)) {
        #alternate partial annotation
        $alt_hid=$qid;
        $alt_hit=$descr;
        $alt_gid=$geneid;
        $alt_mcov=$mcov;
        $alt_qxgaps=$qgaps;
        }
      }
     else {
      @fregion=@region;
      }
   #----
   if ($mrna) { #mrna hit
      unless ($firstdescr) {
        $firsthid=$qid;
        $firstdescr=$descr; 
        $firstgid=$geneid;
        $first_mcov=$mcov;
        $first_qxgaps=$qgaps;
        }
      unless ($besthit) {
        if (&isInformative($descr)) {
             $besthid=$qid;
             $besthit=$descr;
             $best_gid=$geneid;
             $best_mcov=$mcov;
             $best_qxgaps=$qgaps;
             }
        }
      if ($ftophits && $hcount<5) {
        my @pex=@$xmapr;
        print MTOPHIT join("\t", $gchr, $track, 'mRNA', $pex[0]->[0], $pex[-1]->[1], '.',
                $gstrand, '.', "ID=$hid;Name=$hid");
        print MTOPHIT ";cov=$cov" if ($cov);
        print MTOPHIT ";gene=$geneid" if $geneid; # shouldn't this be Name instead?
        print MTOPHIT ";qxgap=$qgaps" if $qgaps;
        print MTOPHIT ";info=\"$descr\"" if $descr;
        
        print MTOPHIT "\n";
        my $c=0;
        foreach my $px (@pex) {
         print MTOPHIT join("\t", $gchr, $track, 'exon', $px->[0], $px->[1], $px->[2],
                $gstrand, '.', "Parent=$hid")."\n";
         $c++;
         }
        $hcount++; 
        }
      } #mRNA
     else { #protein hit
      my $defline = $descr;
      if ($protcdb && !$defline) {
        $defline=`cdbyank -F -a '$qid' $protcdb`;
        printf STDERR "  (%04d) %s", $ovlscore, $defline if $debug;
        chomp($defline);
        $defline=~s/^\S+\s+//;
        $defline=~s/^\[\d+\]\s+//;#my old Uniref entries
        $defline=~s/( \- )/ \[$hid\] /;
        }
      unless ($firstdescr) {
        $firsthid=$qid;
        $firstdescr=$defline; 
        $firstgid=$geneid;
        $first_mcov=$mcov;
        $first_qxgaps=$qgaps;
        }
      unless ($besthit) {
        if (&isInformative($defline)) {
             $besthid=$qid;
             $besthit=$defline;
             $best_gid=$geneid;
             $best_mcov=$mcov;
             $best_qxgaps=$qgaps;
             }
        }
      if ($ftophits && $hcount<5) {
        my @pex=@$xmapr;
        print TOPHIT join("\t", $gchr, $track, 'mRNA', $pex[0]->[0], $pex[-1]->[1], '.',
                $gstrand, '.', "ID=$hid;Name=$hid");

        print TOPHIT ";cov=$cov" if $cov;        
        print TOPHIT ";gene=$geneid" if $geneid;
        print TOPHIT ";qxgap=$qgaps" if $qgaps;
        print TOPHIT ";info=\"$defline\"" if $defline;
        print TOPHIT "\n";
        my $c=0;
        foreach my $px (@pex) {
         print TOPHIT join("\t", $gchr, $track, 'exon', $px->[0], $px->[1], $px->[2],
               # $gstrand, '.', "ID=$prot.e$c;Parent=$prot")."\n";
                $gstrand, '.', "Parent=$hid")."\n"; #optimized for argo applet
         $c++;
         }
        $hcount++; 
        }
       }
 } #foreach hit
 unless ($besthit) {
  $besthit=$firstdescr;
  $besthid=$firsthid;
  $best_gid=$firstgid;
  $best_mcov=$first_mcov;
  $best_qxgaps=$first_qxgaps;
  }
 $alt_hit=$alt_hid.' '.$alt_hit if $alt_hit;
 $besthit=$besthid.' '.$besthit if $besthit;
 return ($besthit, $best_gid, $best_mcov, $best_qxgaps,
        $alt_hit, $alt_gid, $alt_mcov, $alt_qxgaps);
}


#===============================================
# bool isInformative($description) 
# expects only the descripts - not the accession
#===============================================
sub isInformative {
 local $_=$_[0];
 s/^\s+//g;s/\s+$//g;
 return 0 if length($_)<2;
 foreach my $pat (@uninformative) {
   if (m/$pat/i) {
     #&flog("uninformative by /$pat/i : '$_'") if ($debug);
     return 0;
     }
   }
return 1;
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years ago) by gpertea
File size:	17342 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	my $usage=q/Usage:
5	gffann.pl [-o <outpuf.gff3>] [-t <fprefix>] [-R] [-m <mrna_gmap.iit>]
6	-p <uniref_pmap.iit> [-P <uniref.fa.cidx> <models.gff>]
7
8	Create a gff file with the same entries like <models.gff> but with
9	an annotation attribute ("TopHit=...") added to the last field, based
10	on the best pmap alignment with UniRef proteins.
11
12	Prior to running this, the pmap output should be converted with
13	gff2iit to the required input file <uniref_pmap.iit>.
14
15	If -t option was given, the top 5 protein mappings will be written in gff3
16	format into the file <fprefix>.top5pmap.gff3; if -m option was also given
17	an equivalent gff file with the best mRNA\/EST hits will be created as
18	<fprefix>.top5gmap.gff3
19
20
21	/;
22	umask 0002;
23	getopts('Dp:m:P:o:t:') \|\| die($usage."\n");
24
25	my $ftophits=$Getopt::Std::opt_t;
26	my $outfile=$Getopt::Std::opt_o;
27	my $debug=$Getopt::Std::opt_D;
28	my $iitfile=$Getopt::Std::opt_p;
29	my $miitfile=$Getopt::Std::opt_m;
30	my $mrefseq=$Getopt::Std::opt_M; #use the -m file as refseq (strong evidence) for annotation
31	my $protcdb=$Getopt::Std::opt_P;
32	my $gff=shift(@ARGV) \|\| die("$usage\n");
33
34	#my %dup; #to avoid duplicate pmap/gmap entries
35
36	#-------- local array holdin uniformative annotation patterns:
37	my @uninformative=(
38	'\bunknown\b',
39	#'unknown protein\b',
40	'\bhypothetical\b',
41	'[A-Z]+\d+ cDNA sequence',
42	'cDNA sequence [A-Z]+\d+',
43	'uncharacterized protein',
44	'unnamed protein product',
45	'open reading frame',
46	'\borf\b',
47	'\bputative\b',
48	'\bhomologue\b',
49	'\bsimilar to',
50	'^expressed sequence \S+$',
51	'\bHA\d{4}\b',
52	'\bDKFZP\S+\b',
53	'PROTEIN FOR MGC:\d+',
54	'PROTEIN FOR IMAGE:\d+',
55	'\bR\d{5}\_\d\b',
56	'\bPRO\d{4}\b',
57	# 'KIAA\d+ GENE PRODUCT',
58	#'KIAA\d+ PROTEIN',
59	#'\bKIAA\d+\b',
60	'\bHSPC\d+\b',
61	# HSPC\d+ PROTEIN
62	#'\bC\d+ORF\d+\b',
63	'FLJ\d+ PROTEIN',
64	'\bDJ\d+[A-Z]\d+(\.\d+)*',
65	'NOVEL PROTEIN',
66	'CG\d+ PROTEIN',
67	'CG\d+ GENE PRODUCT',
68	'^\sCG\d+\s$',
69	'CGI\-\d+ PROTEIN',
70	'CGI\-\d+',
71	'CDNA:? FLJ\d+ FIS, CLONE \w+',
72	'BA\d+[A-Z]\d+[A-Z]?\.\d(\.\d)?',
73	#'\bRIKEN CDNA .{10} GENE\b',
74	'\bRIKEN.+?CDNA\b',
75	'MRNA, COMPLETE CDS, CLONE:\d+(\+\d[A-Z])?\-\d+',
76	'MRNA, COMPLETE CDS, CLONE:SMAP\d+\-\w+',
77	'BRAIN CDNA, CLONE MNCB-\d+',
78	'.{10}RIK PROTEIN',
79	'^MY\d{3}\s*$',
80	'MY\d{3} PROTEIN^',
81	'^probable\b',
82	'BRAIN MY\d{3}$',
83	'NPD\d{3} PROTEIN',
84	'[A-Z]\d{2}[A-Z0-9]+\.\d+ PROTEIN',
85	'WUGSC:H_\w+\.\w+ PROTEIN',
86	#'DNA SEGMENT, CHR [0-9XY]+, WAYNE STATE UNIVERSITY \d+, EXPRESSED',
87	#'DNA SEGMENT, CHR [0-9XY]+, KL MOHLKE \d+',
88	#'DNA SEGMENT, CHR [0-9XY]+, BAYLOR \d+',
89	'\bDNA SEGMENT\b',
90	'PROTEIN HSPC\d+',
91	#'HYPOTHETICAL [\.\d]+\s*KDA PROTEIN \S+ IN CHROMOSOME \S+',
92	'EG:[0-9A-Z\.]+ PROTEIN',
93	'GENOMIC DNA, CHROMOSOME \d+, P1 CLONE:\S+',
94	'[^,]+, RIKEN FULL-LENGTH ENRICHED LIBRARY, CLONE:.{10}, FULL INSERT SEQUENCE',
95	'ZK\d+\.\d+ PROTEIN',
96	'\bEST \w+',
97	'B2 ELEMENT'
98	);
99
100	open(GFF, $gff) \|\| die ("Error: cannot open $gff\n");
101
102	#expects the models to be given in the proper order
103	# 'CDS' exons are the only ones used;
104	# the first 'mRNA' entry is going to be annotated
105	my @linebuf; #current gff gene/mRNA line buffer
106	my ($gchr, $gstrand, @m_cds, $m_cdslen) ; #current predicted mRNA info
107	my $curmodel;
108
109	if ($outfile) {
110	open(OUTF, '>'.$outfile) \|\| die ("Error creating $outfile\n");
111	select OUTF;
112	}
113
114	if ($ftophits) {
115	open(TOPHIT, ">$ftophits.top5pmap.gff3") \|\| die ("Error creating $ftophits.top5pmap.gff3\n");
116	if ($miitfile) {
117	open(MTOPHIT, ">$ftophits.top5gmap.gff3") \|\| die ("Error creating $ftophits.top5gmap.gff3\n");
118	}
119	}
120	#print STDERR "gffann.pl processing $gff ..\n";
121	my $gffline=1;
122	while ($gffline) {
123	$gffline=<GFF>;
124	if (defined($gffline) && ($gffline=~m/^\s*#/ \|\| $gffline=~m/^\s+$/ \|\| length($gffline)<4)) {
125	#print $_;
126	next;
127	};
128	chomp($gffline);
129	my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $info)=split(/\t/, $gffline);
130	next if ($f eq 'gene');# skip useless 'gene' entries and other nonsensical lines
131	if ($f eq 'mRNA' \|\| !defined($gffline)) { #new record starting, or EOF
132	my $line=$gffline;
133	if (@m_cds>0) { #previous model's exons are loaded
134	@m_cds= sort { $main::a->[0] <=> $main::b->[0] } @m_cds;
135	my @phits;
136	my ($model_id)=($linebuf[0]=~m/ID=([^;]+)/);
137	my ($pann, $pgeneid, $p_mcov, $p_xgaps, $alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps)
138	=annModel($iitfile, \@phits); #protein annotations
139	my @mhits;
140	my ($mann, $mgeneid, $m_mcov, $m_xgaps, $alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps)
141	=annModel($miitfile, \@mhits, 1) if $miitfile; #mrna/refseq annotations
142	if ($p_mcov<2) {
143	if ($m_mcov<2) {
144	print STDERR "Warning (gffann.pl $gff): model $model_id has no protein or mRNA coverage!\n";
145	}
146	else {
147	print STDERR "Warning (gffann.pl $gff): model $model_id has no protein coverage!\n";
148	}
149	}
150
151	#--
152	my ($ann, $geneid, $mcov, $qxgaps, $altann, $altgid, $altcov, $altxgaps) =
153	($mgeneid && ($m_mcov>=90 \|\| $p_mcov-$m_mcov<10)) ?
154	# if we have a decent RefSeq annotation, use it
155	($mann, $mgeneid, $m_mcov, $m_xgaps, $alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) :
156	($pann, $pgeneid, $p_mcov, $p_xgaps, $alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps);
157	unless ($altann) {
158	($altann, $altgid, $altcov, $altxgaps) = $alt_mann ? ($alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) :
159	($alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps);
160	}
161	# if ($mcov<95) {
162	# if ($mcov<$m_mcov) {
163	# $ann=$mann; $mcov=$m_mcov; $qxgaps=$m_xgaps;
164	# }
165	# elsif ($mcov<$p_mcov) {
166	# $ann=$pann;$mcov=$p_mcov; $qxgaps=$p_xgaps;
167	# }
168	print STDERR "Warning (gffann.pl $gff): possible merge? model $model_id coverage by best hit is only $mcov\%!\n"
169	if $mcov<80 && $mcov>1;
170	# }
171	if ($mcov>10) {
172	$linebuf[0].=";mcov=$mcov";
173	$linebuf[0].=";geneId=$geneid" if $geneid;
174	$linebuf[0].=";qxgap=$qxgaps" if $qxgaps;
175	if ($ann) {
176	my @s=split(/\x01/, $ann);
177	$ann=$s[0] if @s>1;
178	$linebuf[0].=";tophit=\"$ann\"";
179	}
180	$linebuf[0].=";altGeneId=$altgid" if $altgid;
181	if ($altann) {
182	my @s=split(/\x01/, $altann);
183	$altann=$s[0] if @s>1;
184	$linebuf[0].=";altTopHit=\"$altann\"";
185	$linebuf[0].=";altCov=\"$altcov\"";
186	$linebuf[0].=";altQXGap=\"$altxgaps\"" if $altxgaps;
187	}
188	}
189	# -- check the model exons values for exons with 0 evidence count
190	foreach my $d (@m_cds) {
191	#if ($$d[3]==0) {
192	$linebuf[$$d[4]].=';mappingEvCount='.int($$d[3]);
193	#if ($$d[3]==0) {
194	# print STDERR "Warning: exon $$d[0]-$$d[1] of model $model_id ($geneid) has no mapping evidence registered!\n";
195	# }
196	}
197	print join("\n",@linebuf)."\n";
198
199	@m_cds=();$m_cdslen=0;
200	@linebuf=();
201	@phits=();
202	@mhits=();
203	$gstrand=$strand;
204	$gchr=$chr;
205	}
206	last unless $gffline;
207	chomp($line);
208	push(@linebuf, $line);
209	next;
210	}
211	chomp($gffline);
212	if ($f eq 'CDS') { # model exon
213	($curmodel)=($info=~m/Parent=(['"\:\w\\|\-\.]+)/);
214	$gstrand=$strand;
215	$gchr=$chr;
216	($fstart, $fend)=($fend, $fstart) if ($fstart>$fend);
217	push(@m_cds, [$fstart, $fend, $frame, 0, scalar(@linebuf)]);
218	$m_cdslen+=$fend-$fstart+1;
219	}
220	push(@linebuf, $gffline);
221	} #while <GFF>
222
223	close(GFF);
224
225	# if (@m_cds>0) { #final record
226	# my ($ann, $geneid, $mcov)=annModel($iitfile);
227	# annModel($miitfile, 1) if $miitfile; #discard mrna annotation, just build the top5 file
228	# $linebuf[0].=";GeneId=\"$geneid\"" if $geneid;
229	# $linebuf[0].=";TopHit=\"$ann\"" if $ann;
230	# print join("\n",@linebuf)."\n";
231	# }
232
233
234	sub annModel {
235	my ($iitdb, $hits, $mrna)=@_;
236	return '' unless (@m_cds>0);
237	my ($mstart, $mend)=($m_cds[0]->[0], $m_cds[-1]->[1]);
238	#get all overlapping features from iit database
239	print STDERR "iit_get $iitdb $mstart $mend\n" if $debug;
240	open(IITGET, "iit_get $iitdb $mstart $mend \|") \|\|
241	die("Error opening pipe from: iit_get $iitdb $mstart $mend \n");
242	#my $maxscore=0;
243	#map { $maxscore+= $_->[1]-$_->[0]+1 } @m_cds;
244	my ($l_id, $l_exon, $l_cds, $l_info, $l_cov, $qgaps, $track);
245	my $line=1;
246	while ($line) {
247	$line=<IITGET>;
248	if (!$line \|\| $line=~m/^>\S+\s/) {
249	# header line -- new record
250
251	if ($l_id) { ### process the previous record's lines
252	$l_exon=$l_cds unless $l_exon;
253	my @intv=split(/\,/, $l_exon);
254	my @xdata;
255	my ($ovlscore, $ovlen)=checkOverlap(\@intv, $l_id, \@xdata, $l_exon);
256	if ($ovlscore>0) { #has exon overlap
257	my $pid=sprintf('%4.1f',($ovlscore*100.0)/$ovlen);
258	my $mcov=sprintf('%d', ($ovlen*100)/$m_cdslen); #model coverage by this hit
259	my @cdata; #cds overlap data, if any
260	#if ($l_cds) {
261	# my @cintv=split(/\,/,$l_cds);
262	# my ($cdsovlscore, $cdsovlen)=checkOverlap(\@cintv, $l_id, \@cdata, $l_cds);
263	# @cdata=() unless $cdsovlen>10;
264	# }
265	my $geneid;
266	if ($l_info=~m/(?:gid\|Gene)\:(\S+)/) {
267	$geneid=$1;
268	$geneid=~s/\]$//;
269	}
270	else {
271	my $n=$l_id;
272	$n=~s/\.[pmrnaexobltsi]+\d+$//;
273	#$n=~s/\.mrna\d+$//;
274	$geneid=$1 if $n=~m/\\|gid\\|(.+)$/;
275	}
276	push(@$hits, [$l_id, $ovlscore, [@xdata], [@cdata], $l_cov, $l_info, $pid, $geneid, $mcov, $track, $qgaps]);
277	print STDERR "..adding hit: $l_id, $ovlscore, $l_cov, $l_info, $pid, $geneid, $mcov, $track, $qgaps\n" if $debug;
278	}
279	#-----
280	}
281	last unless $line; #end of <IITGET> ?
282	($l_exon, $l_cds, $l_info, $l_cov, $track, $qgaps)=();
283	my @d=split(/\s+/, $line);
284	if (substr($d[3], -1) ne $gstrand) {
285	$l_id=undef;
286	next; #not a valid match (opposite strand)
287	}
288	# get the name of this
289	$track=$d[-1];
290	$track=~s/gblat/blat/;
291	($l_id)=($line=~m/^>(\S+)/);
292	next;
293	} #>header line
294	next unless $l_id; #skip record
295	print STDERR ">parsing data for hit $l_id :\n" if $debug;
296	print STDERR " $line\n" if $debug;
297	if ($line=~m/^([iCvx])\:(.+)/) {
298	my ($t, $d)=($1,$2);
299	if ($t eq 'C') { $l_cds=$d; }
300	elsif ($t eq 'v') { $l_cov=$d; }
301	elsif ($t eq 'i') { $l_info=$d;
302	$l_info=~s/(Rec\|Sub)Name: Full=//;
303	$l_info=~s/\. Flags: [^\{]+/. /;
304	$l_info=~s/ EC=[\d\-\._]+//g;
305	$l_info=~s/\. AltName: [^\{]+/. /;
306	$l_info=~s/\. Short=[^\{]+/. /;
307	}
308	elsif ($t eq 'x') { $qgaps=$d; # print STDERR "added qxgaps: $qgaps\n" if $debug;}
309	}
310	}
311	elsif ($line=~m/^\d+\-\d+/){
312	$l_exon=$line;
313	}
314	} #while <IITGET>
315
316	close(IITGET);
317	return getBestHits($hits, $mrna);
318	}
319
320	if ($outfile) {
321	select STDOUT;
322	close(OUTF);
323	}
324
325	if ($ftophits) {
326	close(TOPHIT);
327	close(MTOPHIT) if $miitfile;
328	}
329
330	sub checkOverlap {
331	my ($intv, $id, $xdata, $l)=@_;
332	my ($ovlscore, $ovlen)=(0,0);
333	foreach my $iv (@$intv) {
334	my ($xstart, $xend, $xscore)=($iv=~m/^(\d+)\-(\d+)\:(\d+)/);
335	#$xscore='89' if $xscore eq '.';
336	die ("Error parsing $id exon line: $l\n") unless $xstart>0 && $xstart<=$xend;
337	my ($ovlsc, $ovll)=&checkExonOvl($xstart, $xend, $xscore);
338	push(@$xdata, [$xstart, $xend, $xscore, $ovll]);
339	$ovlscore+=$ovlsc;
340	$ovlen+=$ovll;
341	}
342	return ($ovlscore, $ovlen);
343	}
344
345	#---
346	sub checkExonOvl {
347	my ($cstart, $cend, $cscore)=@_;
348	# uses global variable @cds for the current model's exons to scan
349	# returns the score sum of (overlap_length * $cscore/100 for each exon)
350	my $ovlscore=0;
351	my $covlen=0; #cumulative bases coverage
352	foreach my $d (@m_cds) { #for each model's exon
353	last if $$d[0]>$cend;
354	my $covl;
355	#get the overlap size:
356	if ($$d[0]<$cstart) {
357	if ($cstart<$$d[1]) {
358	$covl=($cend>$$d[1])? ($$d[1]-$cstart+1) : ($cend-$cstart+1);
359	}
360	}
361	else {
362	if ($$d[0]<$cend) {
363	$covl=($cend<$$d[1])? $cend-$$d[0]+1 : $$d[1]-$$d[0]+1;
364	}
365	}
366	if ($covl) { #there was overlap with this exon
367	$$d[3]++; # increase evidence overlap counter for this model's exon
368	$ovlscore += (($covl*$cscore)/100.00);
369	$covlen+=$covl;
370	}
371	}# for each exon of the model
372	return ($ovlscore, $covlen);
373	}
374
375
376	sub ovlTest {
377	my ($a1, $a2, $b1, $b2)=@_;
378	return ($a1<$b1) ? $b1<$a2 : $a1<$b2;
379	}
380
381	sub getBestHits {
382	my ($href, $mrna)=@_;
383	#sort by decreasing score
384	my @hits = sort { $main::b->[1] <=> $main::a->[1] } @$href;
385	$href=[@hits];
386	#print STDERR ">$curmodel $gstrand ($mstart .. $mend) :\n" if $debug && !$mrna;
387	my ($firsthid, $firstdescr, $firstgid, $first_mcov, $first_qxgaps);
388	# uses cdbyank -F -a $pID $protcdb to get the defline of the protein $$href[0]->[0]
389	# skips to next best hit until isInformative confirms it
390	my ($besthid, $besthit, $best_gid, $best_mcov, $best_qxgaps,
391	$alt_hid, $alt_hit, $alt_gid, $alt_mcov, $alt_qxgaps);
392	my $hcount=0;
393	my @fregion; #first region covered by the best hit
394	foreach my $h (@hits) {
395	#my ($prot, $pscore, $pmapr, $cov, $descr, $mcov) = @$h;
396	my ($hid, $ovlscore, $xmapr, $cmapr, $cov, $descr, $pid, $geneid, $mcov, $track, $qgaps)=@$h;
397	my $qid=$hid;
398	$qid=~s/\.[mrnapbltexosi]+\d+$//;
399	#$qid=~s/\.p[mp]\d+$//;
400	my @region=($xmapr->[0]->[0], $xmapr->[-1]->[1]); #start-end coordinates for this hit
401	if (@fregion && $firstdescr) { # is this another region?
402	unless (&ovlTest(@region, @fregion)) {
403	#alternate partial annotation
404	$alt_hid=$qid;
405	$alt_hit=$descr;
406	$alt_gid=$geneid;
407	$alt_mcov=$mcov;
408	$alt_qxgaps=$qgaps;
409	}
410	}
411	else {
412	@fregion=@region;
413	}
414	#----
415	if ($mrna) { #mrna hit
416	unless ($firstdescr) {
417	$firsthid=$qid;
418	$firstdescr=$descr;
419	$firstgid=$geneid;
420	$first_mcov=$mcov;
421	$first_qxgaps=$qgaps;
422	}
423	unless ($besthit) {
424	if (&isInformative($descr)) {
425	$besthid=$qid;
426	$besthit=$descr;
427	$best_gid=$geneid;
428	$best_mcov=$mcov;
429	$best_qxgaps=$qgaps;
430	}
431	}
432	if ($ftophits && $hcount<5) {
433	my @pex=@$xmapr;
434	print MTOPHIT join("\t", $gchr, $track, 'mRNA', $pex[0]->[0], $pex[-1]->[1], '.',
435	$gstrand, '.', "ID=$hid;Name=$hid");
436	print MTOPHIT ";cov=$cov" if ($cov);
437	print MTOPHIT ";gene=$geneid" if $geneid; # shouldn't this be Name instead?
438	print MTOPHIT ";qxgap=$qgaps" if $qgaps;
439	print MTOPHIT ";info=\"$descr\"" if $descr;
440
441	print MTOPHIT "\n";
442	my $c=0;
443	foreach my $px (@pex) {
444	print MTOPHIT join("\t", $gchr, $track, 'exon', $px->[0], $px->[1], $px->[2],
445	$gstrand, '.', "Parent=$hid")."\n";
446	$c++;
447	}
448	$hcount++;
449	}
450	} #mRNA
451	else { #protein hit
452	my $defline = $descr;
453	if ($protcdb && !$defline) {
454	$defline=`cdbyank -F -a '$qid' $protcdb`;
455	printf STDERR " (%04d) %s", $ovlscore, $defline if $debug;
456	chomp($defline);
457	$defline=~s/^\S+\s+//;
458	$defline=~s/^\[\d+\]\s+//;#my old Uniref entries
459	$defline=~s/( \- )/ \[$hid\] /;
460	}
461	unless ($firstdescr) {
462	$firsthid=$qid;
463	$firstdescr=$defline;
464	$firstgid=$geneid;
465	$first_mcov=$mcov;
466	$first_qxgaps=$qgaps;
467	}
468	unless ($besthit) {
469	if (&isInformative($defline)) {
470	$besthid=$qid;
471	$besthit=$defline;
472	$best_gid=$geneid;
473	$best_mcov=$mcov;
474	$best_qxgaps=$qgaps;
475	}
476	}
477	if ($ftophits && $hcount<5) {
478	my @pex=@$xmapr;
479	print TOPHIT join("\t", $gchr, $track, 'mRNA', $pex[0]->[0], $pex[-1]->[1], '.',
480	$gstrand, '.', "ID=$hid;Name=$hid");
481
482	print TOPHIT ";cov=$cov" if $cov;
483	print TOPHIT ";gene=$geneid" if $geneid;
484	print TOPHIT ";qxgap=$qgaps" if $qgaps;
485	print TOPHIT ";info=\"$defline\"" if $defline;
486	print TOPHIT "\n";
487	my $c=0;
488	foreach my $px (@pex) {
489	print TOPHIT join("\t", $gchr, $track, 'exon', $px->[0], $px->[1], $px->[2],
490	# $gstrand, '.', "ID=$prot.e$c;Parent=$prot")."\n";
491	$gstrand, '.', "Parent=$hid")."\n"; #optimized for argo applet
492	$c++;
493	}
494	$hcount++;
495	}
496	}
497	} #foreach hit
498	unless ($besthit) {
499	$besthit=$firstdescr;
500	$besthid=$firsthid;
501	$best_gid=$firstgid;
502	$best_mcov=$first_mcov;
503	$best_qxgaps=$first_qxgaps;
504	}
505	$alt_hit=$alt_hid.' '.$alt_hit if $alt_hit;
506	$besthit=$besthid.' '.$besthit if $besthit;
507	return ($besthit, $best_gid, $best_mcov, $best_qxgaps,
508	$alt_hit, $alt_gid, $alt_mcov, $alt_qxgaps);
509	}
510
511
512
513	#===============================================
514	# bool isInformative($description)
515	# expects only the descripts - not the accession
516	#===============================================
517	sub isInformative {
518	local $_=$_[0];
519	s/^\s+//g;s/\s+$//g;
520	return 0 if length($_)<2;
521	foreach my $pat (@uninformative) {
522	if (m/$pat/i) {
523	#&flog("uninformative by /$pat/i : '$_'") if ($debug);
524	return 0;
525	}
526	}
527	return 1;
528	}