ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gffann.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years ago) by gpertea
File size: 17342 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 my $usage=q/Usage:
5 gffann.pl [-o <outpuf.gff3>] [-t <fprefix>] [-R] [-m <mrna_gmap.iit>]
6 -p <uniref_pmap.iit> [-P <uniref.fa.cidx> <models.gff>]
7
8 Create a gff file with the same entries like <models.gff> but with
9 an annotation attribute ("TopHit=...") added to the last field, based
10 on the best pmap alignment with UniRef proteins.
11
12 Prior to running this, the pmap output should be converted with
13 gff2iit to the required input file <uniref_pmap.iit>.
14
15 If -t option was given, the top 5 protein mappings will be written in gff3
16 format into the file <fprefix>.top5pmap.gff3; if -m option was also given
17 an equivalent gff file with the best mRNA\/EST hits will be created as
18 <fprefix>.top5gmap.gff3
19
20
21 /;
22 umask 0002;
23 getopts('Dp:m:P:o:t:') || die($usage."\n");
24
25 my $ftophits=$Getopt::Std::opt_t;
26 my $outfile=$Getopt::Std::opt_o;
27 my $debug=$Getopt::Std::opt_D;
28 my $iitfile=$Getopt::Std::opt_p;
29 my $miitfile=$Getopt::Std::opt_m;
30 my $mrefseq=$Getopt::Std::opt_M; #use the -m file as refseq (strong evidence) for annotation
31 my $protcdb=$Getopt::Std::opt_P;
32 my $gff=shift(@ARGV) || die("$usage\n");
33
34 #my %dup; #to avoid duplicate pmap/gmap entries
35
36 #-------- local array holdin uniformative annotation patterns:
37 my @uninformative=(
38 '\bunknown\b',
39 #'unknown protein\b',
40 '\bhypothetical\b',
41 '[A-Z]+\d+ cDNA sequence',
42 'cDNA sequence [A-Z]+\d+',
43 'uncharacterized protein',
44 'unnamed protein product',
45 'open reading frame',
46 '\borf\b',
47 '\bputative\b',
48 '\bhomologue\b',
49 '\bsimilar to',
50 '^expressed sequence \S+$',
51 '\bHA\d{4}\b',
52 '\bDKFZP\S+\b',
53 'PROTEIN FOR MGC:\d+',
54 'PROTEIN FOR IMAGE:\d+',
55 '\bR\d{5}\_\d\b',
56 '\bPRO\d{4}\b',
57 # 'KIAA\d+ GENE PRODUCT',
58 #'KIAA\d+ PROTEIN',
59 #'\bKIAA\d+\b',
60 '\bHSPC\d+\b',
61 # HSPC\d+ PROTEIN
62 #'\bC\d+ORF\d+\b',
63 'FLJ\d+ PROTEIN',
64 '\bDJ\d+[A-Z]\d+(\.\d+)*',
65 'NOVEL PROTEIN',
66 'CG\d+ PROTEIN',
67 'CG\d+ GENE PRODUCT',
68 '^\s*CG\d+\s*$',
69 'CGI\-\d+ PROTEIN',
70 'CGI\-\d+',
71 'CDNA:? FLJ\d+ FIS, CLONE \w+',
72 'BA\d+[A-Z]\d+[A-Z]?\.\d(\.\d)?',
73 #'\bRIKEN CDNA .{10} GENE\b',
74 '\bRIKEN.+?CDNA\b',
75 'MRNA, COMPLETE CDS, CLONE:\d+(\+\d[A-Z])?\-\d+',
76 'MRNA, COMPLETE CDS, CLONE:SMAP\d+\-\w+',
77 'BRAIN CDNA, CLONE MNCB-\d+',
78 '.{10}RIK PROTEIN',
79 '^MY\d{3}\s*$',
80 'MY\d{3} PROTEIN^',
81 '^probable\b',
82 'BRAIN MY\d{3}$',
83 'NPD\d{3} PROTEIN',
84 '[A-Z]\d{2}[A-Z0-9]+\.\d+ PROTEIN',
85 'WUGSC:H_\w+\.\w+ PROTEIN',
86 #'DNA SEGMENT, CHR [0-9XY]+, WAYNE STATE UNIVERSITY \d+, EXPRESSED',
87 #'DNA SEGMENT, CHR [0-9XY]+, KL MOHLKE \d+',
88 #'DNA SEGMENT, CHR [0-9XY]+, BAYLOR \d+',
89 '\bDNA SEGMENT\b',
90 'PROTEIN HSPC\d+',
91 #'HYPOTHETICAL [\.\d]+\s*KDA PROTEIN \S+ IN CHROMOSOME \S+',
92 'EG:[0-9A-Z\.]+ PROTEIN',
93 'GENOMIC DNA, CHROMOSOME \d+, P1 CLONE:\S+',
94 '[^,]+, RIKEN FULL-LENGTH ENRICHED LIBRARY, CLONE:.{10}, FULL INSERT SEQUENCE',
95 'ZK\d+\.\d+ PROTEIN',
96 '\bEST \w+',
97 'B2 ELEMENT'
98 );
99
100 open(GFF, $gff) || die ("Error: cannot open $gff\n");
101
102 #expects the models to be given in the proper order
103 # 'CDS' exons are the only ones used;
104 # the first 'mRNA' entry is going to be annotated
105 my @linebuf; #current gff gene/mRNA line buffer
106 my ($gchr, $gstrand, @m_cds, $m_cdslen) ; #current predicted mRNA info
107 my $curmodel;
108
109 if ($outfile) {
110 open(OUTF, '>'.$outfile) || die ("Error creating $outfile\n");
111 select OUTF;
112 }
113
114 if ($ftophits) {
115 open(TOPHIT, ">$ftophits.top5pmap.gff3") || die ("Error creating $ftophits.top5pmap.gff3\n");
116 if ($miitfile) {
117 open(MTOPHIT, ">$ftophits.top5gmap.gff3") || die ("Error creating $ftophits.top5gmap.gff3\n");
118 }
119 }
120 #print STDERR "gffann.pl processing $gff ..\n";
121 my $gffline=1;
122 while ($gffline) {
123 $gffline=<GFF>;
124 if (defined($gffline) && ($gffline=~m/^\s*#/ || $gffline=~m/^\s+$/ || length($gffline)<4)) {
125 #print $_;
126 next;
127 };
128 chomp($gffline);
129 my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $info)=split(/\t/, $gffline);
130 next if ($f eq 'gene');# skip useless 'gene' entries and other nonsensical lines
131 if ($f eq 'mRNA' || !defined($gffline)) { #new record starting, or EOF
132 my $line=$gffline;
133 if (@m_cds>0) { #previous model's exons are loaded
134 @m_cds= sort { $main::a->[0] <=> $main::b->[0] } @m_cds;
135 my @phits;
136 my ($model_id)=($linebuf[0]=~m/ID=([^;]+)/);
137 my ($pann, $pgeneid, $p_mcov, $p_xgaps, $alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps)
138 =annModel($iitfile, \@phits); #protein annotations
139 my @mhits;
140 my ($mann, $mgeneid, $m_mcov, $m_xgaps, $alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps)
141 =annModel($miitfile, \@mhits, 1) if $miitfile; #mrna/refseq annotations
142 if ($p_mcov<2) {
143 if ($m_mcov<2) {
144 print STDERR "Warning (gffann.pl $gff): model $model_id has no protein or mRNA coverage!\n";
145 }
146 else {
147 print STDERR "Warning (gffann.pl $gff): model $model_id has no protein coverage!\n";
148 }
149 }
150
151 #--
152 my ($ann, $geneid, $mcov, $qxgaps, $altann, $altgid, $altcov, $altxgaps) =
153 ($mgeneid && ($m_mcov>=90 || $p_mcov-$m_mcov<10)) ?
154 # if we have a decent RefSeq annotation, use it
155 ($mann, $mgeneid, $m_mcov, $m_xgaps, $alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) :
156 ($pann, $pgeneid, $p_mcov, $p_xgaps, $alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps);
157 unless ($altann) {
158 ($altann, $altgid, $altcov, $altxgaps) = $alt_mann ? ($alt_mann, $alt_mgeneid, $alt_mcov, $alt_mxgaps) :
159 ($alt_pann, $alt_pgeneid, $alt_pcov, $alt_pxgaps);
160 }
161 # if ($mcov<95) {
162 # if ($mcov<$m_mcov) {
163 # $ann=$mann; $mcov=$m_mcov; $qxgaps=$m_xgaps;
164 # }
165 # elsif ($mcov<$p_mcov) {
166 # $ann=$pann;$mcov=$p_mcov; $qxgaps=$p_xgaps;
167 # }
168 print STDERR "Warning (gffann.pl $gff): possible merge? model $model_id coverage by best hit is only $mcov\%!\n"
169 if $mcov<80 && $mcov>1;
170 # }
171 if ($mcov>10) {
172 $linebuf[0].=";mcov=$mcov";
173 $linebuf[0].=";geneId=$geneid" if $geneid;
174 $linebuf[0].=";qxgap=$qxgaps" if $qxgaps;
175 if ($ann) {
176 my @s=split(/\x01/, $ann);
177 $ann=$s[0] if @s>1;
178 $linebuf[0].=";tophit=\"$ann\"";
179 }
180 $linebuf[0].=";altGeneId=$altgid" if $altgid;
181 if ($altann) {
182 my @s=split(/\x01/, $altann);
183 $altann=$s[0] if @s>1;
184 $linebuf[0].=";altTopHit=\"$altann\"";
185 $linebuf[0].=";altCov=\"$altcov\"";
186 $linebuf[0].=";altQXGap=\"$altxgaps\"" if $altxgaps;
187 }
188 }
189 # -- check the model exons values for exons with 0 evidence count
190 foreach my $d (@m_cds) {
191 #if ($$d[3]==0) {
192 $linebuf[$$d[4]].=';mappingEvCount='.int($$d[3]);
193 #if ($$d[3]==0) {
194 # print STDERR "Warning: exon $$d[0]-$$d[1] of model $model_id ($geneid) has no mapping evidence registered!\n";
195 # }
196 }
197 print join("\n",@linebuf)."\n";
198
199 @m_cds=();$m_cdslen=0;
200 @linebuf=();
201 @phits=();
202 @mhits=();
203 $gstrand=$strand;
204 $gchr=$chr;
205 }
206 last unless $gffline;
207 chomp($line);
208 push(@linebuf, $line);
209 next;
210 }
211 chomp($gffline);
212 if ($f eq 'CDS') { # model exon
213 ($curmodel)=($info=~m/Parent=(['"\:\w\|\-\.]+)/);
214 $gstrand=$strand;
215 $gchr=$chr;
216 ($fstart, $fend)=($fend, $fstart) if ($fstart>$fend);
217 push(@m_cds, [$fstart, $fend, $frame, 0, scalar(@linebuf)]);
218 $m_cdslen+=$fend-$fstart+1;
219 }
220 push(@linebuf, $gffline);
221 } #while <GFF>
222
223 close(GFF);
224
225 # if (@m_cds>0) { #final record
226 # my ($ann, $geneid, $mcov)=annModel($iitfile);
227 # annModel($miitfile, 1) if $miitfile; #discard mrna annotation, just build the top5 file
228 # $linebuf[0].=";GeneId=\"$geneid\"" if $geneid;
229 # $linebuf[0].=";TopHit=\"$ann\"" if $ann;
230 # print join("\n",@linebuf)."\n";
231 # }
232
233
234 sub annModel {
235 my ($iitdb, $hits, $mrna)=@_;
236 return '' unless (@m_cds>0);
237 my ($mstart, $mend)=($m_cds[0]->[0], $m_cds[-1]->[1]);
238 #get all overlapping features from iit database
239 print STDERR "iit_get $iitdb $mstart $mend\n" if $debug;
240 open(IITGET, "iit_get $iitdb $mstart $mend |") ||
241 die("Error opening pipe from: iit_get $iitdb $mstart $mend \n");
242 #my $maxscore=0;
243 #map { $maxscore+= $_->[1]-$_->[0]+1 } @m_cds;
244 my ($l_id, $l_exon, $l_cds, $l_info, $l_cov, $qgaps, $track);
245 my $line=1;
246 while ($line) {
247 $line=<IITGET>;
248 if (!$line || $line=~m/^>\S+\s/) {
249 # header line -- new record
250
251 if ($l_id) { ### process the previous record's lines
252 $l_exon=$l_cds unless $l_exon;
253 my @intv=split(/\,/, $l_exon);
254 my @xdata;
255 my ($ovlscore, $ovlen)=checkOverlap(\@intv, $l_id, \@xdata, $l_exon);
256 if ($ovlscore>0) { #has exon overlap
257 my $pid=sprintf('%4.1f',($ovlscore*100.0)/$ovlen);
258 my $mcov=sprintf('%d', ($ovlen*100)/$m_cdslen); #model coverage by this hit
259 my @cdata; #cds overlap data, if any
260 #if ($l_cds) {
261 # my @cintv=split(/\,/,$l_cds);
262 # my ($cdsovlscore, $cdsovlen)=checkOverlap(\@cintv, $l_id, \@cdata, $l_cds);
263 # @cdata=() unless $cdsovlen>10;
264 # }
265 my $geneid;
266 if ($l_info=~m/(?:gid|Gene)\:(\S+)/) {
267 $geneid=$1;
268 $geneid=~s/\]$//;
269 }
270 else {
271 my $n=$l_id;
272 $n=~s/\.[pmrnaexobltsi]+\d+$//;
273 #$n=~s/\.mrna\d+$//;
274 $geneid=$1 if $n=~m/\|gid\|(.+)$/;
275 }
276 push(@$hits, [$l_id, $ovlscore, [@xdata], [@cdata], $l_cov, $l_info, $pid, $geneid, $mcov, $track, $qgaps]);
277 print STDERR "..adding hit: $l_id, $ovlscore, $l_cov, $l_info, $pid, $geneid, $mcov, $track, $qgaps\n" if $debug;
278 }
279 #-----
280 }
281 last unless $line; #end of <IITGET> ?
282 ($l_exon, $l_cds, $l_info, $l_cov, $track, $qgaps)=();
283 my @d=split(/\s+/, $line);
284 if (substr($d[3], -1) ne $gstrand) {
285 $l_id=undef;
286 next; #not a valid match (opposite strand)
287 }
288 # get the name of this
289 $track=$d[-1];
290 $track=~s/gblat/blat/;
291 ($l_id)=($line=~m/^>(\S+)/);
292 next;
293 } #>header line
294 next unless $l_id; #skip record
295 print STDERR ">parsing data for hit $l_id :\n" if $debug;
296 print STDERR " $line\n" if $debug;
297 if ($line=~m/^([iCvx])\:(.+)/) {
298 my ($t, $d)=($1,$2);
299 if ($t eq 'C') { $l_cds=$d; }
300 elsif ($t eq 'v') { $l_cov=$d; }
301 elsif ($t eq 'i') { $l_info=$d;
302 $l_info=~s/(Rec|Sub)Name: Full=//;
303 $l_info=~s/\. Flags: [^\{]+/. /;
304 $l_info=~s/ EC=[\d\-\._]+//g;
305 $l_info=~s/\. AltName: [^\{]+/. /;
306 $l_info=~s/\. Short=[^\{]+/. /;
307 }
308 elsif ($t eq 'x') { $qgaps=$d; # print STDERR "added qxgaps: $qgaps\n" if $debug;}
309 }
310 }
311 elsif ($line=~m/^\d+\-\d+/){
312 $l_exon=$line;
313 }
314 } #while <IITGET>
315
316 close(IITGET);
317 return getBestHits($hits, $mrna);
318 }
319
320 if ($outfile) {
321 select STDOUT;
322 close(OUTF);
323 }
324
325 if ($ftophits) {
326 close(TOPHIT);
327 close(MTOPHIT) if $miitfile;
328 }
329
330 sub checkOverlap {
331 my ($intv, $id, $xdata, $l)=@_;
332 my ($ovlscore, $ovlen)=(0,0);
333 foreach my $iv (@$intv) {
334 my ($xstart, $xend, $xscore)=($iv=~m/^(\d+)\-(\d+)\:(\d+)/);
335 #$xscore='89' if $xscore eq '.';
336 die ("Error parsing $id exon line: $l\n") unless $xstart>0 && $xstart<=$xend;
337 my ($ovlsc, $ovll)=&checkExonOvl($xstart, $xend, $xscore);
338 push(@$xdata, [$xstart, $xend, $xscore, $ovll]);
339 $ovlscore+=$ovlsc;
340 $ovlen+=$ovll;
341 }
342 return ($ovlscore, $ovlen);
343 }
344
345 #---
346 sub checkExonOvl {
347 my ($cstart, $cend, $cscore)=@_;
348 # uses global variable @cds for the current model's exons to scan
349 # returns the score sum of (overlap_length * $cscore/100 for each exon)
350 my $ovlscore=0;
351 my $covlen=0; #cumulative bases coverage
352 foreach my $d (@m_cds) { #for each model's exon
353 last if $$d[0]>$cend;
354 my $covl;
355 #get the overlap size:
356 if ($$d[0]<$cstart) {
357 if ($cstart<$$d[1]) {
358 $covl=($cend>$$d[1])? ($$d[1]-$cstart+1) : ($cend-$cstart+1);
359 }
360 }
361 else {
362 if ($$d[0]<$cend) {
363 $covl=($cend<$$d[1])? $cend-$$d[0]+1 : $$d[1]-$$d[0]+1;
364 }
365 }
366 if ($covl) { #there was overlap with this exon
367 $$d[3]++; # increase evidence overlap counter for this model's exon
368 $ovlscore += (($covl*$cscore)/100.00);
369 $covlen+=$covl;
370 }
371 }# for each exon of the model
372 return ($ovlscore, $covlen);
373 }
374
375
376 sub ovlTest {
377 my ($a1, $a2, $b1, $b2)=@_;
378 return ($a1<$b1) ? $b1<$a2 : $a1<$b2;
379 }
380
381 sub getBestHits {
382 my ($href, $mrna)=@_;
383 #sort by decreasing score
384 my @hits = sort { $main::b->[1] <=> $main::a->[1] } @$href;
385 $href=[@hits];
386 #print STDERR ">$curmodel $gstrand ($mstart .. $mend) :\n" if $debug && !$mrna;
387 my ($firsthid, $firstdescr, $firstgid, $first_mcov, $first_qxgaps);
388 # uses cdbyank -F -a $pID $protcdb to get the defline of the protein $$href[0]->[0]
389 # skips to next best hit until isInformative confirms it
390 my ($besthid, $besthit, $best_gid, $best_mcov, $best_qxgaps,
391 $alt_hid, $alt_hit, $alt_gid, $alt_mcov, $alt_qxgaps);
392 my $hcount=0;
393 my @fregion; #first region covered by the best hit
394 foreach my $h (@hits) {
395 #my ($prot, $pscore, $pmapr, $cov, $descr, $mcov) = @$h;
396 my ($hid, $ovlscore, $xmapr, $cmapr, $cov, $descr, $pid, $geneid, $mcov, $track, $qgaps)=@$h;
397 my $qid=$hid;
398 $qid=~s/\.[mrnapbltexosi]+\d+$//;
399 #$qid=~s/\.p[mp]\d+$//;
400 my @region=($xmapr->[0]->[0], $xmapr->[-1]->[1]); #start-end coordinates for this hit
401 if (@fregion && $firstdescr) { # is this another region?
402 unless (&ovlTest(@region, @fregion)) {
403 #alternate partial annotation
404 $alt_hid=$qid;
405 $alt_hit=$descr;
406 $alt_gid=$geneid;
407 $alt_mcov=$mcov;
408 $alt_qxgaps=$qgaps;
409 }
410 }
411 else {
412 @fregion=@region;
413 }
414 #----
415 if ($mrna) { #mrna hit
416 unless ($firstdescr) {
417 $firsthid=$qid;
418 $firstdescr=$descr;
419 $firstgid=$geneid;
420 $first_mcov=$mcov;
421 $first_qxgaps=$qgaps;
422 }
423 unless ($besthit) {
424 if (&isInformative($descr)) {
425 $besthid=$qid;
426 $besthit=$descr;
427 $best_gid=$geneid;
428 $best_mcov=$mcov;
429 $best_qxgaps=$qgaps;
430 }
431 }
432 if ($ftophits && $hcount<5) {
433 my @pex=@$xmapr;
434 print MTOPHIT join("\t", $gchr, $track, 'mRNA', $pex[0]->[0], $pex[-1]->[1], '.',
435 $gstrand, '.', "ID=$hid;Name=$hid");
436 print MTOPHIT ";cov=$cov" if ($cov);
437 print MTOPHIT ";gene=$geneid" if $geneid; # shouldn't this be Name instead?
438 print MTOPHIT ";qxgap=$qgaps" if $qgaps;
439 print MTOPHIT ";info=\"$descr\"" if $descr;
440
441 print MTOPHIT "\n";
442 my $c=0;
443 foreach my $px (@pex) {
444 print MTOPHIT join("\t", $gchr, $track, 'exon', $px->[0], $px->[1], $px->[2],
445 $gstrand, '.', "Parent=$hid")."\n";
446 $c++;
447 }
448 $hcount++;
449 }
450 } #mRNA
451 else { #protein hit
452 my $defline = $descr;
453 if ($protcdb && !$defline) {
454 $defline=`cdbyank -F -a '$qid' $protcdb`;
455 printf STDERR " (%04d) %s", $ovlscore, $defline if $debug;
456 chomp($defline);
457 $defline=~s/^\S+\s+//;
458 $defline=~s/^\[\d+\]\s+//;#my old Uniref entries
459 $defline=~s/( \- )/ \[$hid\] /;
460 }
461 unless ($firstdescr) {
462 $firsthid=$qid;
463 $firstdescr=$defline;
464 $firstgid=$geneid;
465 $first_mcov=$mcov;
466 $first_qxgaps=$qgaps;
467 }
468 unless ($besthit) {
469 if (&isInformative($defline)) {
470 $besthid=$qid;
471 $besthit=$defline;
472 $best_gid=$geneid;
473 $best_mcov=$mcov;
474 $best_qxgaps=$qgaps;
475 }
476 }
477 if ($ftophits && $hcount<5) {
478 my @pex=@$xmapr;
479 print TOPHIT join("\t", $gchr, $track, 'mRNA', $pex[0]->[0], $pex[-1]->[1], '.',
480 $gstrand, '.', "ID=$hid;Name=$hid");
481
482 print TOPHIT ";cov=$cov" if $cov;
483 print TOPHIT ";gene=$geneid" if $geneid;
484 print TOPHIT ";qxgap=$qgaps" if $qgaps;
485 print TOPHIT ";info=\"$defline\"" if $defline;
486 print TOPHIT "\n";
487 my $c=0;
488 foreach my $px (@pex) {
489 print TOPHIT join("\t", $gchr, $track, 'exon', $px->[0], $px->[1], $px->[2],
490 # $gstrand, '.', "ID=$prot.e$c;Parent=$prot")."\n";
491 $gstrand, '.', "Parent=$hid")."\n"; #optimized for argo applet
492 $c++;
493 }
494 $hcount++;
495 }
496 }
497 } #foreach hit
498 unless ($besthit) {
499 $besthit=$firstdescr;
500 $besthid=$firsthid;
501 $best_gid=$firstgid;
502 $best_mcov=$first_mcov;
503 $best_qxgaps=$first_qxgaps;
504 }
505 $alt_hit=$alt_hid.' '.$alt_hit if $alt_hit;
506 $besthit=$besthid.' '.$besthit if $besthit;
507 return ($besthit, $best_gid, $best_mcov, $best_qxgaps,
508 $alt_hit, $alt_gid, $alt_mcov, $alt_qxgaps);
509 }
510
511
512
513 #===============================================
514 # bool isInformative($description)
515 # expects only the descripts - not the accession
516 #===============================================
517 sub isInformative {
518 local $_=$_[0];
519 s/^\s+//g;s/\s+$//g;
520 return 0 if length($_)<2;
521 foreach my $pat (@uninformative) {
522 if (m/$pat/i) {
523 #&flog("uninformative by /$pat/i : '$_'") if ($debug);
524 return 0;
525 }
526 }
527 return 1;
528 }

Properties

Name Value
svn:executable *