1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
#use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
my $usage = q/Usage: |
7 |
btab2cov.pl [-T] [-p <min_ps>] [-c <min_cov>] [-n <max_hits>] <input_btab> |
8 |
|
9 |
WARNING: it assumes that the input btatb is sorted by <qry_name>! |
10 |
|
11 |
It will consider all overlaps of at least <min_ps> percent similarity |
12 |
(default: 70) to assess the coverage of the query sequences |
13 |
|
14 |
Outputs coverage information like this, for any query sequence with |
15 |
an overall percentual coverage of at least <min_cov>% (default 50): |
16 |
|
17 |
<qry_seq> <strand> <qry_cov%> <top5hits> |
18 |
|
19 |
..where: |
20 |
<qry_cov%> is the percentual coverage of <qry_seq> length; |
21 |
<top5hits> is a comma delimited list of max. 5 subj. sequences with the best |
22 |
overall alignment scores |
23 |
|
24 |
/; |
25 |
#umask 0002; |
26 |
getopts('Tn:p:c:') || die($usage."\n"); |
27 |
my $mincov=$Getopt::Std::opt_c || 50; |
28 |
my $minpsim=$Getopt::Std::opt_p || 70; |
29 |
my $tophits=$Getopt::Std::opt_n; |
30 |
my $mgtab=$Getopt::Std::opt_T; |
31 |
my @qhp; #HSPs on positive strand: list of [ql, qr, sname] |
32 |
my @qhm; #HSPs on negative strand:list of [ql, qr, sname] |
33 |
my @hitsp; # list of [score, sname] for positive strand; |
34 |
my @hitsm; # list of [score, sname] for negative strand; |
35 |
my %topm; # names of top n hits on plus strand |
36 |
my %topp; # names of top n hits on minus strand |
37 |
my ($curquery, $curlen); |
38 |
if ($mgtab) { #mgblast TAB format |
39 |
while (<>) { |
40 |
next if m/^\s*#/; |
41 |
chomp; |
42 |
# 0 1 2 3 4 5 6 7 8 9 |
43 |
my ($qname, $qlen, $q_5, $q_3, $sname, $slen, $s_5, $s_3, $pid, $score, |
44 |
# 10 11 12 13 |
45 |
$e_val, $strand, $gapdataq, $gapdatas)=split(/\t/); |
46 |
if ($curquery ne $qname) { |
47 |
flushQ() if $curquery; |
48 |
$curquery=$qname; |
49 |
$curlen=$qlen; |
50 |
@qhp=(); |
51 |
@qhm=(); |
52 |
@hitsp=(); |
53 |
@hitsm=(); |
54 |
} |
55 |
next unless $pid>=$minpsim; |
56 |
if ($strand eq '-') { |
57 |
#minus strand: |
58 |
push(@hitsm, [$score, $sname]); |
59 |
($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3; |
60 |
push(@qhm, [$q_5, $q_3, $sname]); |
61 |
} |
62 |
else { |
63 |
#plus strand: |
64 |
push(@hitsp, [$score, $sname]); |
65 |
($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3; |
66 |
push(@qhp, [$q_5, $q_3, $sname]); |
67 |
} |
68 |
} #while readline |
69 |
} #mgblast TAB format |
70 |
|
71 |
else { # BTAB format |
72 |
|
73 |
while (<>) { |
74 |
next if m/^\s*#/; |
75 |
chomp; |
76 |
# 0 1 2 3 4 5 6 7 8 9 |
77 |
my ($qname, $date, $qlen, $method, $db, $sname, $q_5, $q_3, $s_5, $s_3, |
78 |
# 10 11 12 13 14 15 16 17 18 |
79 |
$pid, $psim, $score, $fofsb, $fofse, $sdescr, $frame, $strand, $slen, |
80 |
# 19 20 21 |
81 |
$e_val, $scov, $hsps)=split(/\t/); |
82 |
if ($curquery ne $qname) { |
83 |
flushQ() if $curquery; |
84 |
$curquery=$qname; |
85 |
$curlen=$qlen; |
86 |
@qhp=(); |
87 |
@qhm=(); |
88 |
@hitsp=(); |
89 |
@hitsm=(); |
90 |
} |
91 |
my @hsp=split(/\~/,$hsps); |
92 |
if ($strand eq '-' || lc($strand) eq 'minus') { |
93 |
#minus strand: |
94 |
push(@hitsm, [$score, $sname]); |
95 |
foreach my $h (@hsp) { |
96 |
my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\|([\d\.]+)/); |
97 |
next unless $p>=$minpsim; |
98 |
die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1; |
99 |
($q5, $q3)=($q3,$q5) if $q5>$q3; |
100 |
push(@qhm, [$q5, $q3, $sname]); |
101 |
} |
102 |
} |
103 |
else { |
104 |
#plus strand: |
105 |
push(@hitsp, [$score, $sname]); |
106 |
foreach my $h (@hsp) { |
107 |
my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\|([\d\.]+)/); |
108 |
next unless $p>=$minpsim; |
109 |
die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1; |
110 |
#($q5, $q3)=($q3,$q5) if $q5>$q3; |
111 |
push(@qhp, [$q5, $q3, $sname]); |
112 |
} |
113 |
} |
114 |
} |
115 |
} #BTAB case |
116 |
|
117 |
flushQ() if $curquery; |
118 |
|
119 |
sub flushQ { |
120 |
#compute coverage per strand |
121 |
my ($cp, $cm)=(0,0); |
122 |
my $rhits; #points to either @hitsp or @hitsm |
123 |
undef %topp; |
124 |
undef %topm; |
125 |
if (@hitsp) { |
126 |
@hitsp = sort { $main::b->[0]<=>$main::a->[0] } @hitsp; #sort hits by score |
127 |
if ($tophits) { |
128 |
foreach my $hd (@hitsp) { |
129 |
$topp{$hd->[1]}=1; |
130 |
last if keys(%topp)>=$tophits; |
131 |
} |
132 |
} |
133 |
} |
134 |
if (@hitsm) { |
135 |
@hitsm = sort { $main::b->[0]<=>$main::a->[0] } @hitsm; |
136 |
if ($tophits) { |
137 |
foreach my $hd (@hitsm) { |
138 |
$topm{$hd->[1]}=1; |
139 |
last if keys(%topm)>=$tophits; |
140 |
} |
141 |
} |
142 |
} |
143 |
if (@qhp) { |
144 |
if ($tophits) { @qhp = grep { exists($topp{$_->[2]}) } @qhp }; |
145 |
#compute coverage on positive strand |
146 |
$cp=getCov($curlen, \@qhp); |
147 |
} |
148 |
if (@qhm) { |
149 |
if ($tophits) { @qhm = grep { exists($topm{$_->[2]}) } @qhm }; |
150 |
#compute coverage on negative strand |
151 |
$cm=getCov($curlen, \@qhm); |
152 |
} |
153 |
if ($cm>=$mincov) { |
154 |
my @h; |
155 |
foreach my $p (@hitsm) { if ($tophits) { |
156 |
push(@h, $$p[1]) if exists($topm{$$p[1]}); |
157 |
} else { push(@h, $$p[1]); } |
158 |
last if @h>=5; |
159 |
} |
160 |
print join("\t",$curquery, '-', $cm, join(',',@h))."\n"; |
161 |
} |
162 |
if ($cp>=$mincov) { |
163 |
my @h; |
164 |
foreach my $p (@hitsp) { |
165 |
if ($tophits) { |
166 |
push(@h, $$p[1]) if exists($topp{$$p[1]}); |
167 |
} |
168 |
else { |
169 |
push(@h, $$p[1]); |
170 |
} |
171 |
last if @h>=5; |
172 |
} |
173 |
print join("\t",$curquery, '+', $cp, join(',',@h))."\n"; |
174 |
} |
175 |
} |
176 |
|
177 |
|
178 |
sub getCov { |
179 |
my ($len, $r)=@_; |
180 |
my @a=sort { $main::a->[0]<=>$main::b->[0] } @$r; |
181 |
my $wasovl=1; |
182 |
my @m = map { [$_->[0], $_->[1]] } @a; |
183 |
WASOVL: while ($wasovl) { |
184 |
$wasovl=0; |
185 |
for (my $i=0;$i<@m-1;$i++) { |
186 |
for (my $j=$i+1;$j<@m;$j++) { |
187 |
next if $i==$j; |
188 |
my ($l1, $r1)=($m[$i]->[0], $m[$i]->[1]); |
189 |
my ($l2, $r2)=($m[$j]->[0], $m[$j]->[1]); |
190 |
#we know that l2>=l1 for sure |
191 |
if ($l2<=$r1) { |
192 |
#intersection detected; |
193 |
$wasovl=1; |
194 |
$m[$i]->[1] = ($r1>$r2)?$r1:$r2; #max right coord |
195 |
splice(@m, $j); |
196 |
next WASOVL; |
197 |
} #intersection |
198 |
} #j |
199 |
} #i |
200 |
}#while wasovl |
201 |
my $cov=0; |
202 |
map { $cov+=$_->[1]-$_->[0]+1 } @m; |
203 |
return sprintf('%d',($cov*100.00)/$len); |
204 |
} |