gclib/scripts/btab2cov.pl

#!/usr/bin/perl
use strict;
use Getopt::Std;
#use FindBin;use lib $FindBin::Bin;

my $usage = q/Usage:
 btab2cov.pl [-T] [-p <min_ps>] [-c <min_cov>] [-n <max_hits>] <input_btab>
 
 WARNING: it assumes that the input btatb is sorted by <qry_name>!
 
 It will consider all overlaps of at least <min_ps> percent similarity
 (default: 70) to assess the coverage of the query sequences
 
 Outputs coverage information like this, for any query sequence with 
 an overall percentual coverage of at least <min_cov>% (default 50):
 
 <qry_seq> <strand> <qry_cov%> <top5hits>
 
 ..where:
 <qry_cov%> is the percentual coverage of <qry_seq> length;
 <top5hits> is a comma delimited list of max. 5 subj. sequences with the best 
 overall alignment scores
 
/;
#umask 0002;
getopts('Tn:p:c:') || die($usage."\n");
my $mincov=$Getopt::Std::opt_c || 50;
my $minpsim=$Getopt::Std::opt_p || 70;
my $tophits=$Getopt::Std::opt_n;
my $mgtab=$Getopt::Std::opt_T;
my @qhp; #HSPs on positive strand: list of [ql, qr, sname]
my @qhm; #HSPs on negative strand:list of  [ql, qr, sname]
my @hitsp; # list of [score, sname] for positive strand;
my @hitsm; # list of [score, sname] for negative strand;
my %topm; # names of top n hits on plus strand
my %topp; # names of top n hits on minus strand
my ($curquery, $curlen);
if ($mgtab) { #mgblast TAB format
while (<>) {
 next if m/^\s*#/;
 chomp;
 #       0    1      2       3     4      5       6    7     8      9
 my ($qname, $qlen, $q_5,   $q_3, $sname, $slen, $s_5, $s_3, $pid, $score,
 #    10     11       12          13
    $e_val, $strand, $gapdataq, $gapdatas)=split(/\t/);
 if ($curquery ne $qname) {
   flushQ() if $curquery;
   $curquery=$qname;
   $curlen=$qlen;
   @qhp=();
   @qhm=();
   @hitsp=();
   @hitsm=();
   }
 next unless $pid>=$minpsim;
 if ($strand eq '-') {
   #minus strand:
   push(@hitsm, [$score, $sname]);   
   ($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3;
   push(@qhm, [$q_5, $q_3, $sname]);
   }
 else {
   #plus strand:
   push(@hitsp, [$score, $sname]);
   ($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3;
   push(@qhp, [$q_5, $q_3, $sname]);
   }
 } #while readline 
} #mgblast TAB format

else { # BTAB format

while (<>) {
 next if m/^\s*#/;
 chomp;
 #       0      1      2       3     4     5       6    7     8      9
 my ($qname, $date, $qlen, $method, $db, $sname, $q_5, $q_3, $s_5, $s_3,
 #    10    11     12      13      14    15        16       17       18
    $pid, $psim, $score, $fofsb, $fofse, $sdescr, $frame, $strand, $slen, 
 #    19    20      21
    $e_val, $scov, $hsps)=split(/\t/);
 if ($curquery ne $qname) {
   flushQ() if $curquery;
   $curquery=$qname;
   $curlen=$qlen;
   @qhp=();
   @qhm=();
   @hitsp=();
   @hitsm=();
   }
 my @hsp=split(/\~/,$hsps);
 if ($strand eq '-' || lc($strand) eq 'minus') {
   #minus strand:
   push(@hitsm, [$score, $sname]); 
   foreach my $h (@hsp) {
     my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\|([\d\.]+)/);
     next unless $p>=$minpsim;
     die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1;
     ($q5, $q3)=($q3,$q5) if $q5>$q3;
     push(@qhm, [$q5, $q3, $sname]);
     }
   }
 else {
   #plus strand:
   push(@hitsp, [$score, $sname]);
   foreach my $h (@hsp) {
     my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\|([\d\.]+)/);
     next unless $p>=$minpsim;
     die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1;
     #($q5, $q3)=($q3,$q5) if $q5>$q3;
     push(@qhp, [$q5, $q3, $sname]);
     }
   }
 }
} #BTAB case

flushQ() if $curquery;

sub flushQ {
 #compute coverage per strand
 my ($cp, $cm)=(0,0);
 my $rhits; #points to either @hitsp or @hitsm
 undef %topp;
 undef %topm;
 if (@hitsp) {
   @hitsp = sort { $main::b->[0]<=>$main::a->[0] } @hitsp; #sort hits by score
   if ($tophits) {
     foreach my $hd (@hitsp) {
        $topp{$hd->[1]}=1;
        last if keys(%topp)>=$tophits;
        }
     }
   }
 if (@hitsm) {
   @hitsm = sort { $main::b->[0]<=>$main::a->[0] } @hitsm;
    if ($tophits) {
      foreach my $hd (@hitsm) {
        $topm{$hd->[1]}=1;
        last if keys(%topm)>=$tophits;
        }
    }
   }
 if (@qhp) {
   if ($tophits) { @qhp = grep { exists($topp{$_->[2]}) } @qhp };
   #compute coverage on positive strand
   $cp=getCov($curlen, \@qhp);
   }
 if (@qhm) {
  if ($tophits) { @qhm = grep { exists($topm{$_->[2]}) } @qhm };
   #compute coverage on negative strand
   $cm=getCov($curlen, \@qhm);
   }
 if ($cm>=$mincov) {
   my @h;
   foreach my $p (@hitsm) { if ($tophits) { 
                                 push(@h, $$p[1]) if exists($topm{$$p[1]});
                             } else { push(@h, $$p[1]); }
                            last if @h>=5; 
                            }
   print join("\t",$curquery, '-', $cm, join(',',@h))."\n";
   }
 if ($cp>=$mincov) {
   my @h;
   foreach my $p (@hitsp) { 
                      if ($tophits) {
                        push(@h, $$p[1]) if exists($topp{$$p[1]});
                        }
                       else { 
                        push(@h, $$p[1]); 
                        }
                      last if @h>=5; 
                      }
   print join("\t",$curquery, '+', $cp, join(',',@h))."\n";
   }
}


sub getCov {
 my ($len, $r)=@_;
 my @a=sort { $main::a->[0]<=>$main::b->[0] } @$r;
 my $wasovl=1;
 my @m = map { [$_->[0], $_->[1]] } @a;
 WASOVL: while ($wasovl) {
  $wasovl=0;
  for (my $i=0;$i<@m-1;$i++) {
     for (my $j=$i+1;$j<@m;$j++) {
      next if $i==$j;
      my ($l1, $r1)=($m[$i]->[0], $m[$i]->[1]);
      my ($l2, $r2)=($m[$j]->[0], $m[$j]->[1]);
      #we know that l2>=l1 for sure
      if ($l2<=$r1) {
        #intersection detected;
        $wasovl=1;
        $m[$i]->[1] = ($r1>$r2)?$r1:$r2; #max right coord
        splice(@m, $j);
        next WASOVL;
        } #intersection
     } #j
   } #i
 }#while wasovl
 my $cov=0;
 map { $cov+=$_->[1]-$_->[0]+1 } @m;
 return sprintf('%d',($cov*100.00)/$len);
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size:	5922 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	#use FindBin;use lib $FindBin::Bin;
5
6	my $usage = q/Usage:
7	btab2cov.pl [-T] [-p <min_ps>] [-c <min_cov>] [-n <max_hits>] <input_btab>
8
9	WARNING: it assumes that the input btatb is sorted by <qry_name>!
10
11	It will consider all overlaps of at least <min_ps> percent similarity
12	(default: 70) to assess the coverage of the query sequences
13
14	Outputs coverage information like this, for any query sequence with
15	an overall percentual coverage of at least <min_cov>% (default 50):
16
17	<qry_seq> <strand> <qry_cov%> <top5hits>
18
19	..where:
20	<qry_cov%> is the percentual coverage of <qry_seq> length;
21	<top5hits> is a comma delimited list of max. 5 subj. sequences with the best
22	overall alignment scores
23
24	/;
25	#umask 0002;
26	getopts('Tn:p:c:') \|\| die($usage."\n");
27	my $mincov=$Getopt::Std::opt_c \|\| 50;
28	my $minpsim=$Getopt::Std::opt_p \|\| 70;
29	my $tophits=$Getopt::Std::opt_n;
30	my $mgtab=$Getopt::Std::opt_T;
31	my @qhp; #HSPs on positive strand: list of [ql, qr, sname]
32	my @qhm; #HSPs on negative strand:list of [ql, qr, sname]
33	my @hitsp; # list of [score, sname] for positive strand;
34	my @hitsm; # list of [score, sname] for negative strand;
35	my %topm; # names of top n hits on plus strand
36	my %topp; # names of top n hits on minus strand
37	my ($curquery, $curlen);
38	if ($mgtab) { #mgblast TAB format
39	while (<>) {
40	next if m/^\s*#/;
41	chomp;
42	# 0 1 2 3 4 5 6 7 8 9
43	my ($qname, $qlen, $q_5, $q_3, $sname, $slen, $s_5, $s_3, $pid, $score,
44	# 10 11 12 13
45	$e_val, $strand, $gapdataq, $gapdatas)=split(/\t/);
46	if ($curquery ne $qname) {
47	flushQ() if $curquery;
48	$curquery=$qname;
49	$curlen=$qlen;
50	@qhp=();
51	@qhm=();
52	@hitsp=();
53	@hitsm=();
54	}
55	next unless $pid>=$minpsim;
56	if ($strand eq '-') {
57	#minus strand:
58	push(@hitsm, [$score, $sname]);
59	($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3;
60	push(@qhm, [$q_5, $q_3, $sname]);
61	}
62	else {
63	#plus strand:
64	push(@hitsp, [$score, $sname]);
65	($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3;
66	push(@qhp, [$q_5, $q_3, $sname]);
67	}
68	} #while readline
69	} #mgblast TAB format
70
71	else { # BTAB format
72
73	while (<>) {
74	next if m/^\s*#/;
75	chomp;
76	# 0 1 2 3 4 5 6 7 8 9
77	my ($qname, $date, $qlen, $method, $db, $sname, $q_5, $q_3, $s_5, $s_3,
78	# 10 11 12 13 14 15 16 17 18
79	$pid, $psim, $score, $fofsb, $fofse, $sdescr, $frame, $strand, $slen,
80	# 19 20 21
81	$e_val, $scov, $hsps)=split(/\t/);
82	if ($curquery ne $qname) {
83	flushQ() if $curquery;
84	$curquery=$qname;
85	$curlen=$qlen;
86	@qhp=();
87	@qhm=();
88	@hitsp=();
89	@hitsm=();
90	}
91	my @hsp=split(/\~/,$hsps);
92	if ($strand eq '-' \|\| lc($strand) eq 'minus') {
93	#minus strand:
94	push(@hitsm, [$score, $sname]);
95	foreach my $h (@hsp) {
96	my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\\|([\d\.]+)/);
97	next unless $p>=$minpsim;
98	die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1;
99	($q5, $q3)=($q3,$q5) if $q5>$q3;
100	push(@qhm, [$q5, $q3, $sname]);
101	}
102	}
103	else {
104	#plus strand:
105	push(@hitsp, [$score, $sname]);
106	foreach my $h (@hsp) {
107	my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\\|([\d\.]+)/);
108	next unless $p>=$minpsim;
109	die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1;
110	#($q5, $q3)=($q3,$q5) if $q5>$q3;
111	push(@qhp, [$q5, $q3, $sname]);
112	}
113	}
114	}
115	} #BTAB case
116
117	flushQ() if $curquery;
118
119	sub flushQ {
120	#compute coverage per strand
121	my ($cp, $cm)=(0,0);
122	my $rhits; #points to either @hitsp or @hitsm
123	undef %topp;
124	undef %topm;
125	if (@hitsp) {
126	@hitsp = sort { $main::b->[0]<=>$main::a->[0] } @hitsp; #sort hits by score
127	if ($tophits) {
128	foreach my $hd (@hitsp) {
129	$topp{$hd->[1]}=1;
130	last if keys(%topp)>=$tophits;
131	}
132	}
133	}
134	if (@hitsm) {
135	@hitsm = sort { $main::b->[0]<=>$main::a->[0] } @hitsm;
136	if ($tophits) {
137	foreach my $hd (@hitsm) {
138	$topm{$hd->[1]}=1;
139	last if keys(%topm)>=$tophits;
140	}
141	}
142	}
143	if (@qhp) {
144	if ($tophits) { @qhp = grep { exists($topp{$_->[2]}) } @qhp };
145	#compute coverage on positive strand
146	$cp=getCov($curlen, \@qhp);
147	}
148	if (@qhm) {
149	if ($tophits) { @qhm = grep { exists($topm{$_->[2]}) } @qhm };
150	#compute coverage on negative strand
151	$cm=getCov($curlen, \@qhm);
152	}
153	if ($cm>=$mincov) {
154	my @h;
155	foreach my $p (@hitsm) { if ($tophits) {
156	push(@h, $$p[1]) if exists($topm{$$p[1]});
157	} else { push(@h, $$p[1]); }
158	last if @h>=5;
159	}
160	print join("\t",$curquery, '-', $cm, join(',',@h))."\n";
161	}
162	if ($cp>=$mincov) {
163	my @h;
164	foreach my $p (@hitsp) {
165	if ($tophits) {
166	push(@h, $$p[1]) if exists($topp{$$p[1]});
167	}
168	else {
169	push(@h, $$p[1]);
170	}
171	last if @h>=5;
172	}
173	print join("\t",$curquery, '+', $cp, join(',',@h))."\n";
174	}
175	}
176
177
178	sub getCov {
179	my ($len, $r)=@_;
180	my @a=sort { $main::a->[0]<=>$main::b->[0] } @$r;
181	my $wasovl=1;
182	my @m = map { [$_->[0], $_->[1]] } @a;
183	WASOVL: while ($wasovl) {
184	$wasovl=0;
185	for (my $i=0;$i<@m-1;$i++) {
186	for (my $j=$i+1;$j<@m;$j++) {
187	next if $i==$j;
188	my ($l1, $r1)=($m[$i]->[0], $m[$i]->[1]);
189	my ($l2, $r2)=($m[$j]->[0], $m[$j]->[1]);
190	#we know that l2>=l1 for sure
191	if ($l2<=$r1) {
192	#intersection detected;
193	$wasovl=1;
194	$m[$i]->[1] = ($r1>$r2)?$r1:$r2; #max right coord
195	splice(@m, $j);
196	next WASOVL;
197	} #intersection
198	} #j
199	} #i
200	}#while wasovl
201	my $cov=0;
202	map { $cov+=$_->[1]-$_->[0]+1 } @m;
203	return sprintf('%d',($cov*100.00)/$len);
204	}