ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff_add_descr.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 1476 byte(s)
Log Message:
Line File contents
#!/usr/bin/perl
#
# gff_add_descr.pl
#
# Filters GFF lines from the files named on the command line (or STDIN):
#   * comment lines (starting with '#') are echoed unchanged;
#   * lines without a 9th (attributes) column are dropped;
#   * mRNA/gene lines get their ID's trailing ".<digits>" rewritten to
#     ".m<digits>" and a descr="..." attribute appended, the text being
#     looked up through fetchDescr() using the ID without that suffix;
#   * every other line gets a trailing "Parent=<id>.<digits>" rewritten to
#     "Parent=<id>.m<digits>".
# mRNA/gene lines that already carry a Descr/Info value longer than 6
# characters are echoed unchanged.
use strict;
use warnings;

my $usage=q/
gff_add_descr.pl <cdbyank_file.cidx> <gfflines..>

 Adds an extra "descr" attribute to the main gff lines
 (mRNA or gene) of the input, by querying the ID
 from the <cdbyank_file.cidx> file.

/;

my %cache;      # ID => descr, shared with fetchDescr()
my $cdbidx=shift(@ARGV) || die("$usage\nError: no cidx file given\n");
my $skipped=0;  # mRNA/gene lines passed through untouched (tallied, not reported)

while (<>) {
    # Comments and directives pass through untouched.
    if (m/^\s*#/) { print $_; next; }

    my @t=split(/\t/);
    next unless($t[8]); # can't be a valid gff3 line

    my $f=lc($t[2]);
    if ($f eq 'mrna' || $f eq 'gene') {
        # Drop the end-of-line from the last field before any matching, so
        # it cannot leak into a captured ID or description (an ID given as
        # the last attribute would otherwise be looked up with a trailing
        # newline and never be found).
        $t[-1]=~s/[\r\n]+\z//;

        if ($t[8]=~m/(?:Descr|Info)\s*=\s*"?([^;"]*)/i) {
            if (length($1)>6) {
                # Already has a usable description: keep the line as is.
                $skipped++;
                print $_;
                next;
            }
            # Too short (or empty) to be useful: remove the whole
            # attribute, including a closing quote, then tidy up the
            # separators left behind.
            $t[8]=~s/(?:Descr|Info)\s*=\s*"?[^;"]*"?//i;
            $t[8]=~tr/;/;/s;
            $t[8]=~s/^;+//;
        } # has descr/info already

        # -- retrieve the description here..
        my ($id)=($t[8]=~m/\bID\s*=\s*"?([^;" ]+)/i);
        die("Error: no ID found for gff $f line: $_\n") unless $id;

        # abc.1 => abc.m1 in the output line ...
        $id=~s/\.(\d+)$/.m$1/;
        # (/i keeps this in step with the match above, so "id=" is
        # rewritten as well)
        $t[8]=~s/\bID\s*=\s*"?[^;" ]+"?/ID=$id/i;
        # ... while the lookup key is the ID without the .m<digits> suffix.
        $id=~s/\.[a-z]*\d+$//;

        my $descr=fetchDescr($id);
        $t[8]=~s/;+\z//; # avoid ";;" when the attributes end with ';'
        $t[8].=';descr="'.$descr.'"';
        $_=join("\t",@t)."\n";
    } # mRNA/gene line
    else {
        s/(Parent=[\w\|]+)\.(\d+)$/$1.m$2/;
    }
    print $_;
} # while
48
# fetchDescr($id)
#
# Returns the description text for $id, obtained by running
#   cdbyank -a $id -F $cdbidx
# and discarding the first whitespace-delimited token of its output (the
# record ID). Returns an empty string when cdbyank prints nothing.
#
# Results, including empty ones, are stored in the file-level %cache so
# that each ID is queried at most once. If cdbyank cannot be started, a
# warning is printed, an empty string is returned and nothing is cached.
sub fetchDescr {
    my ($id)=@_;
    return $cache{$id} if exists $cache{$id};

    # List-form pipe open: cdbyank is executed directly, without a shell,
    # so quotes or other metacharacters in the ID or in the index path
    # cannot alter the command.
    my $yank;
    unless (open($yank, '-|', 'cdbyank', '-a', $id, '-F', $cdbidx)) {
        warn("Warning: cannot run cdbyank for '$id': $!\n");
        return '';
    }
    my $def=do { local $/; <$yank> }; # slurp the whole output
    # Exit status is deliberately ignored; an empty output already means
    # "no description available".
    close($yank);

    $def='' unless defined $def;
    chomp($def);
    $def=~s/^\S+\s*//; # remove first token (the ID)
    $cache{$id}=$def;
    return $def;
}

Properties

Name Value
svn:executable *