1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
|
5 |
my $usage = q/Usage: |
6 |
fltgff4gv [-t <outtrackname>] [-o <outgff>] [-i <max_intron>] [-b <out_bookmarks>] |
7 |
[-I] <gmap_or_pmap_gff3..> |
8 |
|
9 |
Use the -I option to preserve the full original IDs found in |
10 |
the input gff stream. |
11 |
|
12 |
/; |
13 |
my %gdup; #prevents the duplicate mappings in the input.. |
14 |
umask 0002; |
15 |
getopts('i:It:o:b:') || die($usage."\n"); |
16 |
my $track=$Getopt::Std::opt_t; |
17 |
my $fullid=$Getopt::Std::opt_I; |
18 |
my $bfile=$Getopt::Std::opt_b; |
19 |
my $maxintron=$Getopt::Std::opt_i || 320000; |
20 |
if ($ARGV[0] && (-s $ARGV[0] < 60)) { |
21 |
exit 0; |
22 |
} |
23 |
if ($bfile) { |
24 |
open(BFILE, '>'.$bfile) || die ("Error creating bookmark file $bfile!\n"); |
25 |
} |
26 |
|
27 |
my $outfile=$Getopt::Std::opt_o; |
28 |
if ($outfile) { |
29 |
open(OUTFILE, '>'.$outfile) || die ("Error creating file $outfile!\n"); |
30 |
select(OUTFILE); |
31 |
} |
32 |
|
33 |
my $skiprec=0; |
34 |
my $parentID=''; |
35 |
my @mbuf; #buffer mRNA lines so we can filter out some |
36 |
my @mbex; #buffered mRNA exons |
37 |
while (<>) { |
38 |
next if m/^\s*#/; |
39 |
chomp; |
40 |
my ($chr, $tr, $ftype, $fstart, $fend, $fscore, |
41 |
$fstrand, $frame, $ndata)=split(/\t/); |
42 |
$tr=$track if $track; |
43 |
if ($ftype eq 'mRNA') { |
44 |
flushmbuf(); |
45 |
my ($info)=($ndata=~m/Info="([^"]+)/i); |
46 |
unless ($info) { |
47 |
($info)=($ndata=~m/TopHit="([^"]+)/i); |
48 |
unless ($info) { |
49 |
($info)=($ndata=~m/Descr="([^"]+)/i); |
50 |
} |
51 |
if ($info) { |
52 |
my @nri=split(/\x01/, $info); |
53 |
$info=$nri[0] if (@nri>1); |
54 |
} |
55 |
} |
56 |
my ($cov)=($ndata=~m/Cov=([\d\.]+)/i); |
57 |
my ($pi)=($ndata=~m/Identity=([\d\.]+)/i); |
58 |
my ($id)=($ndata=~m/ID=([^;]+)/); |
59 |
my ($gffname)=($ndata=~m/Name=([^;]+)/); |
60 |
my ($gid, $xid, $name); |
61 |
if (m/GeneId=([\w\.\-]+)/) { |
62 |
$gid=$1; |
63 |
} |
64 |
elsif ($tr=~/jigsaw/ && $id ne $gffname) { |
65 |
$gid=$gffname; |
66 |
} |
67 |
if ($fullid) { |
68 |
$xid=$id; |
69 |
$name=$id; |
70 |
#$info=~s/^\w+\|(\w+)/$1/; |
71 |
$info=~s/gid\://; |
72 |
$info=~s/CDS:\d+\-\d+\s*//; |
73 |
} |
74 |
else { |
75 |
if ($id=~m/^UP/) { |
76 |
if ($id=~m/\|gid\|([\w\-]+)/) { |
77 |
$gid=$1; |
78 |
} |
79 |
else { ($gid)=($id=~m/^UP\w*\|(\w+)/) } |
80 |
$info=~s/^UniRef\w+\s*//; |
81 |
} |
82 |
else { |
83 |
if ($info=~s/gid:(\S+)\s*//) { |
84 |
$gid=$1; |
85 |
} |
86 |
$info=~s/CDS:\d+\-\d+\s*//; |
87 |
} |
88 |
my ($suffixnum)=($id=~m/\.([pbmrnag]*\d+)$/); |
89 |
my ($num)=($suffixnum=~m/(\d+)$/); |
90 |
my ($suffix)=($suffixnum=~m/^([^\d]+)/); |
91 |
#$suffix='m' unless $suffix; |
92 |
$suffix='m'; |
93 |
$num=1 unless $num; |
94 |
($xid)=($id=~m/^\w+\|([\w]+)/); |
95 |
if ($xid) { |
96 |
$xid=$tr.'|'.$xid.'.'.$suffix.$num; |
97 |
} |
98 |
else { #jigsaw, or anything simple |
99 |
#$xid=$tr.'|'.$id; |
100 |
$xid=$id; |
101 |
} |
102 |
$name=$gid || $xid; |
103 |
} |
104 |
|
105 |
#skip duplicates: same ID, track and same start-end coords |
106 |
$skiprec = ($gdup{"$xid.$tr.$fstart.$fend"}++); |
107 |
# $gdup{"$xid.$fstart.$fend"}++; |
108 |
if ($skiprec) { |
109 |
#print "skipping record $xid.$tr.$fstart.$fend\n"; |
110 |
$parentID=''; |
111 |
next; |
112 |
} |
113 |
my $xinfo; |
114 |
$xinfo='cov:'.$cov.'%' if $cov; |
115 |
$xinfo.=', ident:'.$pi.'%' if $pi; |
116 |
$xinfo=~s/^\, //; |
117 |
$info.=' ('.$xinfo.')' if $xinfo; |
118 |
if ($tr =~ m/jigsaw/i) { |
119 |
$xid.='|'.$gid if $gid; |
120 |
$ndata='ID='.$xid.';Name='.$name.';info="'.$info.'"'; |
121 |
} |
122 |
else { |
123 |
my $s=(index(uc($info), uc($name))>=0)? $info : $name.' '.$info; |
124 |
$ndata='ID='.$xid.';Name='.$name.';info="'.$s.'"'; |
125 |
} |
126 |
$parentID=$xid; |
127 |
if ($bfile) { |
128 |
my $bid=$xid; |
129 |
$bid.="|$gid" if $gid && $xid!~m/\Q|$gid/; |
130 |
print BFILE join("\t",$bid,$fstart)."\n"; |
131 |
} |
132 |
} #mRNA line processing until here |
133 |
elsif ($ftype eq 'exon' || $ftype eq 'CDS') { |
134 |
next if $skiprec; |
135 |
next unless $parentID; |
136 |
# my ($pid)=($ndata=~m/Parent=([^;]+)/); |
137 |
# my $pxid; |
138 |
#if ($fullid) { |
139 |
# $pxid=$pid; |
140 |
# } |
141 |
# else { |
142 |
# ($pxid)=($pid=~m/^\w+\|([\w]+)/); |
143 |
# if ($pxid) { |
144 |
# my ($pnum)=($pid=~m/\.mrna(\d+)$/); |
145 |
# $pxid.='.m'.$pnum; |
146 |
# } |
147 |
# else { $pxid=$pid; } |
148 |
# $ndata='Parent='.$pxid; |
149 |
# } |
150 |
$ndata='Parent='.$parentID; |
151 |
push(@mbex, ($fend<$fstart) ? [ $fend, $fstart ] : [$fstart, $fend] ); |
152 |
} |
153 |
else { #next; keep any unrecognized features/markers (e.g. seqgap) |
154 |
flushmbuf(); |
155 |
print join("\t",$chr, $tr, $ftype, $fstart, $fend, $fscore, |
156 |
$fstrand, $frame, $ndata)."\n"; |
157 |
next; |
158 |
} |
159 |
# mRNA, exon/CDS go through here: |
160 |
push(@mbuf, join("\t",$chr, $tr, $ftype, $fstart, $fend, $fscore, |
161 |
$fstrand, $frame, $ndata)); |
162 |
} |
163 |
|
164 |
flushmbuf(); |
165 |
|
166 |
close(BFILE) if $bfile; |
167 |
if ($outfile) { |
168 |
select STDOUT; |
169 |
close(OUTFILE); |
170 |
} |
171 |
|
172 |
sub flushmbuf { |
173 |
return unless @mbuf>0 && @mbex>0; |
174 |
if ($mbuf[0]=~m/\tjigsaw/) { |
175 |
goto NO_INTRONCHECK; |
176 |
} |
177 |
@mbex=sort { $main::a->[0]<=>$main::b->[0] } @mbex; |
178 |
my $maxi=0; |
179 |
if (scalar(@mbex)!=scalar(@mbuf)-1) { |
180 |
die("Error: fltgff4gv: invalid mbuf vs mbex counts for $mbuf[0]!\n"); |
181 |
} |
182 |
for (my $i=1;$i<@mbex;$i++) { |
183 |
my $isize=$mbex[$i]->[0]-$mbex[$i-1]->[1]-1; |
184 |
$maxi=$isize if $maxi<$isize; |
185 |
last if ($maxi>$maxintron); |
186 |
} |
187 |
NO_INTRONCHECK: |
188 |
print join("\n",@mbuf)."\n" unless ($maxi>$maxintron); |
189 |
@mbuf=(); |
190 |
@mbex=(); |
191 |
} |