1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
my $usage=q/Usage: |
5 |
gffann.pl [-o <outpuf.gff3>] [-t <fprefix>] [-R] [-m <mrna_gmap.iit>] |
6 |
-p <uniref_pmap.iit> [-P <uniref.fa.cidx> <models.gff>] |
7 |
|
8 |
Create a gff file with the same entries like <models.gff> but with |
9 |
an annotation attribute ("TopHit=...") added to the last field, based |
10 |
on the best pmap alignment with UniRef proteins. |
11 |
|
12 |
Prior to running this, the pmap output should be converted with |
13 |
gff2iit to the required input file <uniref_pmap.iit>. |
14 |
|
15 |
If -t option was given, the top 5 protein mappings will be written in gff3 |
16 |
format into the file <fprefix>.top5pmap.gff3; if -m option was also given |
17 |
an equivalent gff file with the best mRNA\/EST hits will be created as |
18 |
<fprefix>.top5gmap.gff3 |
19 |
|
20 |
|
21 |
/; |
22 |
umask 0002; |
23 |
getopts('Dp:m:P:o:t:') || die($usage."\n"); |
24 |
|
25 |
my $ftophits=$Getopt::Std::opt_t; |
26 |
my $outfile=$Getopt::Std::opt_o; |
27 |
my $debug=$Getopt::Std::opt_D; |
28 |
my $iitfile=$Getopt::Std::opt_p; |
29 |
my $miitfile=$Getopt::Std::opt_m; |
30 |
my $mrefseq=$Getopt::Std::opt_M; #use the -m file as refseq (strong evidence) for annotation |
31 |
my $protcdb=$Getopt::Std::opt_P; |
32 |
my $gff=shift(@ARGV) || die("$usage\n"); |
33 |
|
34 |
#my %dup; #to avoid duplicate pmap/gmap entries |
35 |
|
36 |
#-------- local array holdin uniformative annotation patterns: |
37 |
my @uninformative=( |
38 |
'\bunknown\b', |
39 |
#'unknown protein\b', |
40 |
'\bhypothetical\b', |
41 |
'[A-Z]+\d+ cDNA sequence', |
42 |
'cDNA sequence [A-Z]+\d+', |
43 |
'uncharacterized protein', |
44 |
'unnamed protein product', |
45 |
'open reading frame', |
46 |
'\borf\b', |
47 |
'\bputative\b', |
48 |
'\bhomologue\b', |
49 |
'\bsimilar to', |
50 |
'^expressed sequence \S+$', |
51 |
'\bHA\d{4}\b', |
52 |
'\bDKFZP\S+\b', |
53 |
'PROTEIN FOR MGC:\d+', |
54 |
'PROTEIN FOR IMAGE:\d+', |
55 |
'\bR\d{5}\_\d\b', |
56 |
'\bPRO\d{4}\b', |
57 |
# 'KIAA\d+ GENE PRODUCT', |
58 |
#'KIAA\d+ PROTEIN', |
59 |
#'\bKIAA\d+\b', |
60 |
'\bHSPC\d+\b', |
61 |
# HSPC\d+ PROTEIN |
62 |
#'\bC\d+ORF\d+\b', |
63 |
'FLJ\d+ PROTEIN', |
64 |
'\bDJ\d+[A-Z]\d+(\.\d+)*', |
65 |
'NOVEL PROTEIN', |
66 |
'CG\d+ PROTEIN', |
67 |
'CG\d+ GENE PRODUCT', |
68 |
'^\s*CG\d+\s*$', |
69 |
'CGI\-\d+ PROTEIN', |
70 |
'CGI\-\d+', |
71 |
'CDNA:? FLJ\d+ FIS, CLONE \w+', |
72 |
'BA\d+[A-Z]\d+[A-Z]?\.\d(\.\d)?', |
73 |
#'\bRIKEN CDNA .{10} GENE\b', |
74 |
'\bRIKEN.+?CDNA\b', |
75 |
'MRNA, COMPLETE CDS, CLONE:\d+(\+\d[A-Z])?\-\d+', |
76 |
'MRNA, COMPLETE CDS, CLONE:SMAP\d+\-\w+', |
77 |
'BRAIN CDNA, CLONE MNCB-\d+', |
78 |
'.{10}RIK PROTEIN', |
79 |
'^MY\d{3}\s*$', |
80 |
'MY\d{3} PROTEIN^', |
81 |
'^probable\b', |
82 |
'BRAIN MY\d{3}$', |
83 |
'NPD\d{3} PROTEIN', |
84 |
'[A-Z]\d{2}[A-Z0-9]+\.\d+ PROTEIN', |
85 |
'WUGSC:H_\w+\.\w+ PROTEIN', |
86 |
#'DNA SEGMENT, CHR [0-9XY]+, WAYNE STATE UNIVERSITY \d+, EXPRESSED', |
87 |
#'DNA SEGMENT, CHR [0-9XY]+, KL MOHLKE \d+', |
88 |
#'DNA SEGMENT, CHR [0-9XY]+, BAYLOR \d+', |
89 |
'\bDNA SEGMENT\b', |
90 |
'PROTEIN HSPC\d+', |
91 |
#'HYPOTHETICAL [\.\d]+\s*KDA PROTEIN \S+ IN CHROMOSOME \S+', |
92 |
'EG:[0-9A-Z\.]+ PROTEIN', |
93 |
'GENOMIC DNA, CHROMOSOME \d+, P1 CLONE:\S+', |
94 |
'[^,]+, RIKEN FULL-LENGTH ENRICHED LIBRARY, CLONE:.{10}, FULL INSERT SEQUENCE', |
95 |
'ZK\d+\.\d+ PROTEIN', |
96 |
'\bEST \w+', |
97 |
'B2 ELEMENT' |
98 |
); |
99 |
|
100 |
open(GFF, $gff) || die ("Error: cannot open $gff\n"); |
101 |
|
102 |
#expects the models to be given in the proper order |
103 |
# 'CDS' exons are the only ones used; |
104 |
# the first 'mRNA' entry is going to be annotated |
105 |
my @linebuf; #current gff gene/mRNA line buffer |
106 |
my ($gchr, $gstrand, @m_cds, $m_cdslen) ; #current predicted mRNA info |
107 |
my $curmodel; |
108 |
|
109 |
if ($outfile) { |
110 |
open(OUTF, '>'.$outfile) || die ("Error creating $outfile\n"); |
111 |
select OUTF; |
112 |
} |
113 |
|
114 |
if ($ftophits) { |
115 |
open(TOPHIT, ">$ftophits.top5pmap.gff3") || die ("Error creating $ftophits.top5pmap.gff3\n"); |
116 |
if ($miitfile) { |
117 |
open(MTOPHIT, ">$ftophits.top5gmap.gff3") || die ("Error creating $ftophits.top5gmap.gff3\n"); |
118 |
} |
119 |
} |
120 |
#print STDERR "gffann.pl processing $gff ..\n"; |
121 |
my $gffline=1; |
122 |
while ($gffline) { |
123 |
$gffline=<GFF>; |
124 |
if (defined($gffline) && ($gffline=~m/^\s*#/ || $gffline=~m/^\s+$/ || length($gffline)<4)) { |
125 |
#print $_; |
126 |
next; |
127 |
}; |
128 |
chomp($gffline); |
129 |
my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $info)=split(/\t/, $gffline); |
130 |
next if ($f eq 'gene');# skip useless 'gene' entries and other nonsensical lines |
131 |
if ($f eq 'mRNA' || !defined($gffline)) { #new record starting, or EOF |
132 |
my $line=$gffline; |
133 |
if (@m_cds>0) { #previous model's exons are loaded |
134 |
@m_cds= sort { $main::a->[0] <=> $main::b->[0] } @m_cds; |
135 |
my @phits; |
136 |
my ($model_id)=($linebuf[0]=~m/ID=([^;]+)/); |
137 |
my ($pann, $pgeneid, $p_mcov, $p_xgaps, $alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps) |
138 |
=annModel($iitfile, \@phits); #protein annotations |
139 |
my @mhits; |
140 |
my ($mann, $mgeneid, $m_mcov, $m_xgaps, $alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) |
141 |
=annModel($miitfile, \@mhits, 1) if $miitfile; #mrna/refseq annotations |
142 |
if ($p_mcov<2) { |
143 |
if ($m_mcov<2) { |
144 |
print STDERR "Warning (gffann.pl $gff): model $model_id has no protein or mRNA coverage!\n"; |
145 |
} |
146 |
else { |
147 |
print STDERR "Warning (gffann.pl $gff): model $model_id has no protein coverage!\n"; |
148 |
} |
149 |
} |
150 |
|
151 |
#-- |
152 |
my ($ann, $geneid, $mcov, $qxgaps, $altann, $altgid, $altcov, $altxgaps) = |
153 |
($mgeneid && ($m_mcov>=90 || $p_mcov-$m_mcov<10)) ? |
154 |
# if we have a decent RefSeq annotation, use it |
155 |
($mann, $mgeneid, $m_mcov, $m_xgaps, $alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) : |
156 |
($pann, $pgeneid, $p_mcov, $p_xgaps, $alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps); |
157 |
unless ($altann) { |
158 |
($altann, $altgid, $altcov, $altxgaps) = $alt_mann ? ($alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) : |
159 |
($alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps); |
160 |
} |
161 |
# if ($mcov<95) { |
162 |
# if ($mcov<$m_mcov) { |
163 |
# $ann=$mann; $mcov=$m_mcov; $qxgaps=$m_xgaps; |
164 |
# } |
165 |
# elsif ($mcov<$p_mcov) { |
166 |
# $ann=$pann;$mcov=$p_mcov; $qxgaps=$p_xgaps; |
167 |
# } |
168 |
print STDERR "Warning (gffann.pl $gff): possible merge? model $model_id coverage by best hit is only $mcov\%!\n" |
169 |
if $mcov<80 && $mcov>1; |
170 |
# } |
171 |
if ($mcov>10) { |
172 |
$linebuf[0].=";mcov=$mcov"; |
173 |
$linebuf[0].=";geneId=$geneid" if $geneid; |
174 |
$linebuf[0].=";qxgap=$qxgaps" if $qxgaps; |
175 |
if ($ann) { |
176 |
my @s=split(/\x01/, $ann); |
177 |
$ann=$s[0] if @s>1; |
178 |
$linebuf[0].=";tophit=\"$ann\""; |
179 |
} |
180 |
$linebuf[0].=";altGeneId=$altgid" if $altgid; |
181 |
if ($altann) { |
182 |
my @s=split(/\x01/, $altann); |
183 |
$altann=$s[0] if @s>1; |
184 |
$linebuf[0].=";altTopHit=\"$altann\""; |
185 |
$linebuf[0].=";altCov=\"$altcov\""; |
186 |
$linebuf[0].=";altQXGap=\"$altxgaps\"" if $altxgaps; |
187 |
} |
188 |
} |
189 |
# -- check the model exons values for exons with 0 evidence count |
190 |
foreach my $d (@m_cds) { |
191 |
#if ($$d[3]==0) { |
192 |
$linebuf[$$d[4]].=';mappingEvCount='.int($$d[3]); |
193 |
#if ($$d[3]==0) { |
194 |
# print STDERR "Warning: exon $$d[0]-$$d[1] of model $model_id ($geneid) has no mapping evidence registered!\n"; |
195 |
# } |
196 |
} |
197 |
print join("\n",@linebuf)."\n"; |
198 |
|
199 |
@m_cds=();$m_cdslen=0; |
200 |
@linebuf=(); |
201 |
@phits=(); |
202 |
@mhits=(); |
203 |
$gstrand=$strand; |
204 |
$gchr=$chr; |
205 |
} |
206 |
last unless $gffline; |
207 |
chomp($line); |
208 |
push(@linebuf, $line); |
209 |
next; |
210 |
} |
211 |
chomp($gffline); |
212 |
if ($f eq 'CDS') { # model exon |
213 |
($curmodel)=($info=~m/Parent=(['"\:\w\|\-\.]+)/); |
214 |
$gstrand=$strand; |
215 |
$gchr=$chr; |
216 |
($fstart, $fend)=($fend, $fstart) if ($fstart>$fend); |
217 |
push(@m_cds, [$fstart, $fend, $frame, 0, scalar(@linebuf)]); |
218 |
$m_cdslen+=$fend-$fstart+1; |
219 |
} |
220 |
push(@linebuf, $gffline); |
221 |
} #while <GFF> |
222 |
|
223 |
close(GFF); |
224 |
|
225 |
# if (@m_cds>0) { #final record |
226 |
# my ($ann, $geneid, $mcov)=annModel($iitfile); |
227 |
# annModel($miitfile, 1) if $miitfile; #discard mrna annotation, just build the top5 file |
228 |
# $linebuf[0].=";GeneId=\"$geneid\"" if $geneid; |
229 |
# $linebuf[0].=";TopHit=\"$ann\"" if $ann; |
230 |
# print join("\n",@linebuf)."\n"; |
231 |
# } |
232 |
|
233 |
|
234 |
sub annModel { |
235 |
my ($iitdb, $hits, $mrna)=@_; |
236 |
return '' unless (@m_cds>0); |
237 |
my ($mstart, $mend)=($m_cds[0]->[0], $m_cds[-1]->[1]); |
238 |
#get all overlapping features from iit database |
239 |
print STDERR "iit_get $iitdb $mstart $mend\n" if $debug; |
240 |
open(IITGET, "iit_get $iitdb $mstart $mend |") || |
241 |
die("Error opening pipe from: iit_get $iitdb $mstart $mend \n"); |
242 |
#my $maxscore=0; |
243 |
#map { $maxscore+= $_->[1]-$_->[0]+1 } @m_cds; |
244 |
my ($l_id, $l_exon, $l_cds, $l_info, $l_cov, $qgaps, $track); |
245 |
my $line=1; |
246 |
while ($line) { |
247 |
$line=<IITGET>; |
248 |
if (!$line || $line=~m/^>\S+\s/) { |
249 |
# header line -- new record |
250 |
|
251 |
if ($l_id) { ### process the previous record's lines |
252 |
$l_exon=$l_cds unless $l_exon; |
253 |
my @intv=split(/\,/, $l_exon); |
254 |
my @xdata; |
255 |
my ($ovlscore, $ovlen)=checkOverlap(\@intv, $l_id, \@xdata, $l_exon); |
256 |
if ($ovlscore>0) { #has exon overlap |
257 |
my $pid=sprintf('%4.1f',($ovlscore*100.0)/$ovlen); |
258 |
my $mcov=sprintf('%d', ($ovlen*100)/$m_cdslen); #model coverage by this hit |
259 |
my @cdata; #cds overlap data, if any |
260 |
#if ($l_cds) { |
261 |
# my @cintv=split(/\,/,$l_cds); |
262 |
# my ($cdsovlscore, $cdsovlen)=checkOverlap(\@cintv, $l_id, \@cdata, $l_cds); |
263 |
# @cdata=() unless $cdsovlen>10; |
264 |
# } |
265 |
my $geneid; |
266 |
if ($l_info=~m/(?:gid|Gene)\:(\S+)/) { |
267 |
$geneid=$1; |
268 |
$geneid=~s/\]$//; |
269 |
} |
270 |
else { |
271 |
my $n=$l_id; |
272 |
$n=~s/\.[pmrnaexobltsi]+\d+$//; |
273 |
#$n=~s/\.mrna\d+$//; |
274 |
$geneid=$1 if $n=~m/\|gid\|(.+)$/; |
275 |
} |
276 |
push(@$hits, [$l_id, $ovlscore, [@xdata], [@cdata], $l_cov, $l_info, $pid, $geneid, $mcov, $track, $qgaps]); |
277 |
print STDERR "..adding hit: $l_id, $ovlscore, $l_cov, $l_info, $pid, $geneid, $mcov, $track, $qgaps\n" if $debug; |
278 |
} |
279 |
#----- |
280 |
} |
281 |
last unless $line; #end of <IITGET> ? |
282 |
($l_exon, $l_cds, $l_info, $l_cov, $track, $qgaps)=(); |
283 |
my @d=split(/\s+/, $line); |
284 |
if (substr($d[3], -1) ne $gstrand) { |
285 |
$l_id=undef; |
286 |
next; #not a valid match (opposite strand) |
287 |
} |
288 |
# get the name of this |
289 |
$track=$d[-1]; |
290 |
$track=~s/gblat/blat/; |
291 |
($l_id)=($line=~m/^>(\S+)/); |
292 |
next; |
293 |
} #>header line |
294 |
next unless $l_id; #skip record |
295 |
print STDERR ">parsing data for hit $l_id :\n" if $debug; |
296 |
print STDERR " $line\n" if $debug; |
297 |
if ($line=~m/^([iCvx])\:(.+)/) { |
298 |
my ($t, $d)=($1,$2); |
299 |
if ($t eq 'C') { $l_cds=$d; } |
300 |
elsif ($t eq 'v') { $l_cov=$d; } |
301 |
elsif ($t eq 'i') { $l_info=$d; |
302 |
$l_info=~s/(Rec|Sub)Name: Full=//; |
303 |
$l_info=~s/\. Flags: [^\{]+/. /; |
304 |
$l_info=~s/ EC=[\d\-\._]+//g; |
305 |
$l_info=~s/\. AltName: [^\{]+/. /; |
306 |
$l_info=~s/\. Short=[^\{]+/. /; |
307 |
} |
308 |
elsif ($t eq 'x') { $qgaps=$d; # print STDERR "added qxgaps: $qgaps\n" if $debug;} |
309 |
} |
310 |
} |
311 |
elsif ($line=~m/^\d+\-\d+/){ |
312 |
$l_exon=$line; |
313 |
} |
314 |
} #while <IITGET> |
315 |
|
316 |
close(IITGET); |
317 |
return getBestHits($hits, $mrna); |
318 |
} |
319 |
|
320 |
if ($outfile) { |
321 |
select STDOUT; |
322 |
close(OUTF); |
323 |
} |
324 |
|
325 |
if ($ftophits) { |
326 |
close(TOPHIT); |
327 |
close(MTOPHIT) if $miitfile; |
328 |
} |
329 |
|
330 |
sub checkOverlap { |
331 |
my ($intv, $id, $xdata, $l)=@_; |
332 |
my ($ovlscore, $ovlen)=(0,0); |
333 |
foreach my $iv (@$intv) { |
334 |
my ($xstart, $xend, $xscore)=($iv=~m/^(\d+)\-(\d+)\:(\d+)/); |
335 |
#$xscore='89' if $xscore eq '.'; |
336 |
die ("Error parsing $id exon line: $l\n") unless $xstart>0 && $xstart<=$xend; |
337 |
my ($ovlsc, $ovll)=&checkExonOvl($xstart, $xend, $xscore); |
338 |
push(@$xdata, [$xstart, $xend, $xscore, $ovll]); |
339 |
$ovlscore+=$ovlsc; |
340 |
$ovlen+=$ovll; |
341 |
} |
342 |
return ($ovlscore, $ovlen); |
343 |
} |
344 |
|
345 |
#--- |
346 |
sub checkExonOvl { |
347 |
my ($cstart, $cend, $cscore)=@_; |
348 |
# uses global variable @cds for the current model's exons to scan |
349 |
# returns the score sum of (overlap_length * $cscore/100 for each exon) |
350 |
my $ovlscore=0; |
351 |
my $covlen=0; #cumulative bases coverage |
352 |
foreach my $d (@m_cds) { #for each model's exon |
353 |
last if $$d[0]>$cend; |
354 |
my $covl; |
355 |
#get the overlap size: |
356 |
if ($$d[0]<$cstart) { |
357 |
if ($cstart<$$d[1]) { |
358 |
$covl=($cend>$$d[1])? ($$d[1]-$cstart+1) : ($cend-$cstart+1); |
359 |
} |
360 |
} |
361 |
else { |
362 |
if ($$d[0]<$cend) { |
363 |
$covl=($cend<$$d[1])? $cend-$$d[0]+1 : $$d[1]-$$d[0]+1; |
364 |
} |
365 |
} |
366 |
if ($covl) { #there was overlap with this exon |
367 |
$$d[3]++; # increase evidence overlap counter for this model's exon |
368 |
$ovlscore += (($covl*$cscore)/100.00); |
369 |
$covlen+=$covl; |
370 |
} |
371 |
}# for each exon of the model |
372 |
return ($ovlscore, $covlen); |
373 |
} |
374 |
|
375 |
|
376 |
sub ovlTest { |
377 |
my ($a1, $a2, $b1, $b2)=@_; |
378 |
return ($a1<$b1) ? $b1<$a2 : $a1<$b2; |
379 |
} |
380 |
|
381 |
sub getBestHits { |
382 |
my ($href, $mrna)=@_; |
383 |
#sort by decreasing score |
384 |
my @hits = sort { $main::b->[1] <=> $main::a->[1] } @$href; |
385 |
$href=[@hits]; |
386 |
#print STDERR ">$curmodel $gstrand ($mstart .. $mend) :\n" if $debug && !$mrna; |
387 |
my ($firsthid, $firstdescr, $firstgid, $first_mcov, $first_qxgaps); |
388 |
# uses cdbyank -F -a $pID $protcdb to get the defline of the protein $$href[0]->[0] |
389 |
# skips to next best hit until isInformative confirms it |
390 |
my ($besthid, $besthit, $best_gid, $best_mcov, $best_qxgaps, |
391 |
$alt_hid, $alt_hit, $alt_gid, $alt_mcov, $alt_qxgaps); |
392 |
my $hcount=0; |
393 |
my @fregion; #first region covered by the best hit |
394 |
foreach my $h (@hits) { |
395 |
#my ($prot, $pscore, $pmapr, $cov, $descr, $mcov) = @$h; |
396 |
my ($hid, $ovlscore, $xmapr, $cmapr, $cov, $descr, $pid, $geneid, $mcov, $track, $qgaps)=@$h; |
397 |
my $qid=$hid; |
398 |
$qid=~s/\.[mrnapbltexosi]+\d+$//; |
399 |
#$qid=~s/\.p[mp]\d+$//; |
400 |
my @region=($xmapr->[0]->[0], $xmapr->[-1]->[1]); #start-end coordinates for this hit |
401 |
if (@fregion && $firstdescr) { # is this another region? |
402 |
unless (&ovlTest(@region, @fregion)) { |
403 |
#alternate partial annotation |
404 |
$alt_hid=$qid; |
405 |
$alt_hit=$descr; |
406 |
$alt_gid=$geneid; |
407 |
$alt_mcov=$mcov; |
408 |
$alt_qxgaps=$qgaps; |
409 |
} |
410 |
} |
411 |
else { |
412 |
@fregion=@region; |
413 |
} |
414 |
#---- |
415 |
if ($mrna) { #mrna hit |
416 |
unless ($firstdescr) { |
417 |
$firsthid=$qid; |
418 |
$firstdescr=$descr; |
419 |
$firstgid=$geneid; |
420 |
$first_mcov=$mcov; |
421 |
$first_qxgaps=$qgaps; |
422 |
} |
423 |
unless ($besthit) { |
424 |
if (&isInformative($descr)) { |
425 |
$besthid=$qid; |
426 |
$besthit=$descr; |
427 |
$best_gid=$geneid; |
428 |
$best_mcov=$mcov; |
429 |
$best_qxgaps=$qgaps; |
430 |
} |
431 |
} |
432 |
if ($ftophits && $hcount<5) { |
433 |
my @pex=@$xmapr; |
434 |
print MTOPHIT join("\t", $gchr, $track, 'mRNA', $pex[0]->[0], $pex[-1]->[1], '.', |
435 |
$gstrand, '.', "ID=$hid;Name=$hid"); |
436 |
print MTOPHIT ";cov=$cov" if ($cov); |
437 |
print MTOPHIT ";gene=$geneid" if $geneid; # shouldn't this be Name instead? |
438 |
print MTOPHIT ";qxgap=$qgaps" if $qgaps; |
439 |
print MTOPHIT ";info=\"$descr\"" if $descr; |
440 |
|
441 |
print MTOPHIT "\n"; |
442 |
my $c=0; |
443 |
foreach my $px (@pex) { |
444 |
print MTOPHIT join("\t", $gchr, $track, 'exon', $px->[0], $px->[1], $px->[2], |
445 |
$gstrand, '.', "Parent=$hid")."\n"; |
446 |
$c++; |
447 |
} |
448 |
$hcount++; |
449 |
} |
450 |
} #mRNA |
451 |
else { #protein hit |
452 |
my $defline = $descr; |
453 |
if ($protcdb && !$defline) { |
454 |
$defline=`cdbyank -F -a '$qid' $protcdb`; |
455 |
printf STDERR " (%04d) %s", $ovlscore, $defline if $debug; |
456 |
chomp($defline); |
457 |
$defline=~s/^\S+\s+//; |
458 |
$defline=~s/^\[\d+\]\s+//;#my old Uniref entries |
459 |
$defline=~s/( \- )/ \[$hid\] /; |
460 |
} |
461 |
unless ($firstdescr) { |
462 |
$firsthid=$qid; |
463 |
$firstdescr=$defline; |
464 |
$firstgid=$geneid; |
465 |
$first_mcov=$mcov; |
466 |
$first_qxgaps=$qgaps; |
467 |
} |
468 |
unless ($besthit) { |
469 |
if (&isInformative($defline)) { |
470 |
$besthid=$qid; |
471 |
$besthit=$defline; |
472 |
$best_gid=$geneid; |
473 |
$best_mcov=$mcov; |
474 |
$best_qxgaps=$qgaps; |
475 |
} |
476 |
} |
477 |
if ($ftophits && $hcount<5) { |
478 |
my @pex=@$xmapr; |
479 |
print TOPHIT join("\t", $gchr, $track, 'mRNA', $pex[0]->[0], $pex[-1]->[1], '.', |
480 |
$gstrand, '.', "ID=$hid;Name=$hid"); |
481 |
|
482 |
print TOPHIT ";cov=$cov" if $cov; |
483 |
print TOPHIT ";gene=$geneid" if $geneid; |
484 |
print TOPHIT ";qxgap=$qgaps" if $qgaps; |
485 |
print TOPHIT ";info=\"$defline\"" if $defline; |
486 |
print TOPHIT "\n"; |
487 |
my $c=0; |
488 |
foreach my $px (@pex) { |
489 |
print TOPHIT join("\t", $gchr, $track, 'exon', $px->[0], $px->[1], $px->[2], |
490 |
# $gstrand, '.', "ID=$prot.e$c;Parent=$prot")."\n"; |
491 |
$gstrand, '.', "Parent=$hid")."\n"; #optimized for argo applet |
492 |
$c++; |
493 |
} |
494 |
$hcount++; |
495 |
} |
496 |
} |
497 |
} #foreach hit |
498 |
unless ($besthit) { |
499 |
$besthit=$firstdescr; |
500 |
$besthid=$firsthid; |
501 |
$best_gid=$firstgid; |
502 |
$best_mcov=$first_mcov; |
503 |
$best_qxgaps=$first_qxgaps; |
504 |
} |
505 |
$alt_hit=$alt_hid.' '.$alt_hit if $alt_hit; |
506 |
$besthit=$besthid.' '.$besthit if $besthit; |
507 |
return ($besthit, $best_gid, $best_mcov, $best_qxgaps, |
508 |
$alt_hit, $alt_gid, $alt_mcov, $alt_qxgaps); |
509 |
} |
510 |
|
511 |
|
512 |
|
513 |
#=============================================== |
514 |
# bool isInformative($description) |
515 |
# expects only the descripts - not the accession |
516 |
#=============================================== |
517 |
sub isInformative { |
518 |
local $_=$_[0]; |
519 |
s/^\s+//g;s/\s+$//g; |
520 |
return 0 if length($_)<2; |
521 |
foreach my $pat (@uninformative) { |
522 |
if (m/$pat/i) { |
523 |
#&flog("uninformative by /$pat/i : '$_'") if ($debug); |
524 |
return 0; |
525 |
} |
526 |
} |
527 |
return 1; |
528 |
} |