ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/btab2cov.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 5922 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 #use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 btab2cov.pl [-T] [-p <min_ps>] [-c <min_cov>] [-n <max_hits>] <input_btab>
8
9 WARNING: it assumes that the input btatb is sorted by <qry_name>!
10
11 It will consider all overlaps of at least <min_ps> percent similarity
12 (default: 70) to assess the coverage of the query sequences
13
14 Outputs coverage information like this, for any query sequence with
15 an overall percentual coverage of at least <min_cov>% (default 50):
16
17 <qry_seq> <strand> <qry_cov%> <top5hits>
18
19 ..where:
20 <qry_cov%> is the percentual coverage of <qry_seq> length;
21 <top5hits> is a comma delimited list of max. 5 subj. sequences with the best
22 overall alignment scores
23
24 /;
25 #umask 0002;
26 getopts('Tn:p:c:') || die($usage."\n");
27 my $mincov=$Getopt::Std::opt_c || 50;
28 my $minpsim=$Getopt::Std::opt_p || 70;
29 my $tophits=$Getopt::Std::opt_n;
30 my $mgtab=$Getopt::Std::opt_T;
31 my @qhp; #HSPs on positive strand: list of [ql, qr, sname]
32 my @qhm; #HSPs on negative strand:list of [ql, qr, sname]
33 my @hitsp; # list of [score, sname] for positive strand;
34 my @hitsm; # list of [score, sname] for negative strand;
35 my %topm; # names of top n hits on plus strand
36 my %topp; # names of top n hits on minus strand
37 my ($curquery, $curlen);
38 if ($mgtab) { #mgblast TAB format
39 while (<>) {
40 next if m/^\s*#/;
41 chomp;
42 # 0 1 2 3 4 5 6 7 8 9
43 my ($qname, $qlen, $q_5, $q_3, $sname, $slen, $s_5, $s_3, $pid, $score,
44 # 10 11 12 13
45 $e_val, $strand, $gapdataq, $gapdatas)=split(/\t/);
46 if ($curquery ne $qname) {
47 flushQ() if $curquery;
48 $curquery=$qname;
49 $curlen=$qlen;
50 @qhp=();
51 @qhm=();
52 @hitsp=();
53 @hitsm=();
54 }
55 next unless $pid>=$minpsim;
56 if ($strand eq '-') {
57 #minus strand:
58 push(@hitsm, [$score, $sname]);
59 ($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3;
60 push(@qhm, [$q_5, $q_3, $sname]);
61 }
62 else {
63 #plus strand:
64 push(@hitsp, [$score, $sname]);
65 ($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3;
66 push(@qhp, [$q_5, $q_3, $sname]);
67 }
68 } #while readline
69 } #mgblast TAB format
70
71 else { # BTAB format
72
73 while (<>) {
74 next if m/^\s*#/;
75 chomp;
76 # 0 1 2 3 4 5 6 7 8 9
77 my ($qname, $date, $qlen, $method, $db, $sname, $q_5, $q_3, $s_5, $s_3,
78 # 10 11 12 13 14 15 16 17 18
79 $pid, $psim, $score, $fofsb, $fofse, $sdescr, $frame, $strand, $slen,
80 # 19 20 21
81 $e_val, $scov, $hsps)=split(/\t/);
82 if ($curquery ne $qname) {
83 flushQ() if $curquery;
84 $curquery=$qname;
85 $curlen=$qlen;
86 @qhp=();
87 @qhm=();
88 @hitsp=();
89 @hitsm=();
90 }
91 my @hsp=split(/\~/,$hsps);
92 if ($strand eq '-' || lc($strand) eq 'minus') {
93 #minus strand:
94 push(@hitsm, [$score, $sname]);
95 foreach my $h (@hsp) {
96 my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\|([\d\.]+)/);
97 next unless $p>=$minpsim;
98 die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1;
99 ($q5, $q3)=($q3,$q5) if $q5>$q3;
100 push(@qhm, [$q5, $q3, $sname]);
101 }
102 }
103 else {
104 #plus strand:
105 push(@hitsp, [$score, $sname]);
106 foreach my $h (@hsp) {
107 my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\|([\d\.]+)/);
108 next unless $p>=$minpsim;
109 die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1;
110 #($q5, $q3)=($q3,$q5) if $q5>$q3;
111 push(@qhp, [$q5, $q3, $sname]);
112 }
113 }
114 }
115 } #BTAB case
116
117 flushQ() if $curquery;
118
119 sub flushQ {
120 #compute coverage per strand
121 my ($cp, $cm)=(0,0);
122 my $rhits; #points to either @hitsp or @hitsm
123 undef %topp;
124 undef %topm;
125 if (@hitsp) {
126 @hitsp = sort { $main::b->[0]<=>$main::a->[0] } @hitsp; #sort hits by score
127 if ($tophits) {
128 foreach my $hd (@hitsp) {
129 $topp{$hd->[1]}=1;
130 last if keys(%topp)>=$tophits;
131 }
132 }
133 }
134 if (@hitsm) {
135 @hitsm = sort { $main::b->[0]<=>$main::a->[0] } @hitsm;
136 if ($tophits) {
137 foreach my $hd (@hitsm) {
138 $topm{$hd->[1]}=1;
139 last if keys(%topm)>=$tophits;
140 }
141 }
142 }
143 if (@qhp) {
144 if ($tophits) { @qhp = grep { exists($topp{$_->[2]}) } @qhp };
145 #compute coverage on positive strand
146 $cp=getCov($curlen, \@qhp);
147 }
148 if (@qhm) {
149 if ($tophits) { @qhm = grep { exists($topm{$_->[2]}) } @qhm };
150 #compute coverage on negative strand
151 $cm=getCov($curlen, \@qhm);
152 }
153 if ($cm>=$mincov) {
154 my @h;
155 foreach my $p (@hitsm) { if ($tophits) {
156 push(@h, $$p[1]) if exists($topm{$$p[1]});
157 } else { push(@h, $$p[1]); }
158 last if @h>=5;
159 }
160 print join("\t",$curquery, '-', $cm, join(',',@h))."\n";
161 }
162 if ($cp>=$mincov) {
163 my @h;
164 foreach my $p (@hitsp) {
165 if ($tophits) {
166 push(@h, $$p[1]) if exists($topp{$$p[1]});
167 }
168 else {
169 push(@h, $$p[1]);
170 }
171 last if @h>=5;
172 }
173 print join("\t",$curquery, '+', $cp, join(',',@h))."\n";
174 }
175 }
176
177
178 sub getCov {
179 my ($len, $r)=@_;
180 my @a=sort { $main::a->[0]<=>$main::b->[0] } @$r;
181 my $wasovl=1;
182 my @m = map { [$_->[0], $_->[1]] } @a;
183 WASOVL: while ($wasovl) {
184 $wasovl=0;
185 for (my $i=0;$i<@m-1;$i++) {
186 for (my $j=$i+1;$j<@m;$j++) {
187 next if $i==$j;
188 my ($l1, $r1)=($m[$i]->[0], $m[$i]->[1]);
189 my ($l2, $r2)=($m[$j]->[0], $m[$j]->[1]);
190 #we know that l2>=l1 for sure
191 if ($l2<=$r1) {
192 #intersection detected;
193 $wasovl=1;
194 $m[$i]->[1] = ($r1>$r2)?$r1:$r2; #max right coord
195 splice(@m, $j);
196 next WASOVL;
197 } #intersection
198 } #j
199 } #i
200 }#while wasovl
201 my $cov=0;
202 map { $cov+=$_->[1]-$_->[0]+1 } @m;
203 return sprintf('%d',($cov*100.00)/$len);
204 }

Properties

Name Value
svn:executable *