1 |
#!/usr/bin/perl |
2 |
|
3 |
#some filtering can be performed on the output |
4 |
# |
5 |
# this should be rewritten in C/C++ to be faster |
6 |
|
7 |
use strict; |
8 |
|
9 |
my $usage = q{ |
10 |
blast2btab [HITLEN=nn[%]] [OVHANG=nn[%]] [PID=nn] [PSIM=nn] |
11 |
[SCORE=nnnn | E=nnnnn] |
12 |
|
13 |
Format and filter wu-blast2 output coming at stdin. The output consists of |
14 |
the following tab delimited fields: |
15 |
0 1 2 3 4 5 |
16 |
q_name, date, q_len, method, db_file, hit_name, |
17 |
6 7 8 9 10 11 12 |
18 |
q_n5, q_n3, hit_n5, hit_n3, %id, %sim, score, |
19 |
13 14 15 16 |
20 |
file_pos1, file_pos2, hit_description, Frame, |
21 |
17 18 19 [ 20 21 ] |
22 |
strand, hit_len, E-value [ hit-coverage HSPs ] |
23 |
|
24 |
Filtering: |
25 |
PID for minimum percent identity |
26 |
PSIM for minimum percent similarity |
27 |
SCORE for minimum alignment score |
28 |
E for maximum E value (Blast E parameter) |
29 |
HITLEN for minimum overlap length |
30 |
OVHANG for maximum overhang length (mismatched overhangs) |
31 |
|
32 |
If '%' is specified for HITLEN or OVHANG values, the actual cutoff |
33 |
lengths will be computed as percentages of the query sequence length; |
34 |
|
35 |
If -topcomboX option is used, each line will represent a group, with the individual |
36 |
segment hits in the 21th field, and the actual hit coverage length as the 20th field |
37 |
}; |
38 |
|
39 |
die($usage) if ($ARGV[0]=~/\-h|\-?\-help/i); |
40 |
my $_bignum=9999999; |
41 |
my %a=('HITLEN'=>0, |
42 |
'OVHANG'=>$_bignum, |
43 |
'PID'=>0, |
44 |
'PSIM'=>0, |
45 |
'SCORE'=>0, |
46 |
'E'=>$_bignum, |
47 |
'DEFLINE'=>'none', |
48 |
'PARSE'=>0 |
49 |
); |
50 |
# try to parse filtering options, if any; |
51 |
my ($is_lenperc, $is_ovperc)=(0,0); |
52 |
my $is_filtered=0; |
53 |
my $bytes_read=0; |
54 |
foreach (@ARGV) { |
55 |
my @param=split(/=/); |
56 |
$is_filtered=1 if ($param[0] ne 'DEFLINE'); |
57 |
die $usage if (@param<2 || not exists($a{$param[0]})); |
58 |
if ($param[0] eq 'HITLEN') { |
59 |
my $p=rindex($param[1],'%'); |
60 |
if ($p>0) { |
61 |
$is_lenperc=1; |
62 |
$param[1]=substr($param[1],0,$p); |
63 |
} |
64 |
} |
65 |
elsif ($param[0] eq 'OVHANG') { |
66 |
my $p=rindex($param[1],'%'); |
67 |
if ($p>=0) { |
68 |
$is_ovperc=1; |
69 |
$param[1]=substr($param[1],0,$p); |
70 |
} |
71 |
} |
72 |
$a{$param[0]}=$param[1]; |
73 |
} |
74 |
my $defln=lc($a{'DEFLINE'}); |
75 |
if ($defln eq 'none') { |
76 |
$defln=0; |
77 |
} |
78 |
elsif ($defln eq 'q') { |
79 |
$defln=1; |
80 |
} |
81 |
elsif ($defln eq 'h') { |
82 |
$defln=2; |
83 |
} |
84 |
elsif ($defln eq 'b') { |
85 |
$defln=3; |
86 |
} |
87 |
else { |
88 |
die "$usage. Error at DEFLINE option."; |
89 |
} |
90 |
|
91 |
$/ = "\n"; |
92 |
#$\ = "\n"; |
93 |
#$, = "\t"; |
94 |
my ($minhlen, $maxoh, $doparse, $minpid, $minpsim, $minscore, $maxE)=( |
95 |
$a{'HITLEN'}, |
96 |
$a{'OVHANG'}, |
97 |
$a{'PARSE'}, |
98 |
$a{'PID'}, |
99 |
$a{'PSIM'}, |
100 |
$a{'SCORE'}, |
101 |
$a{'E'}); |
102 |
$maxE='1'.$maxE if lc(substr($maxE,0,1)) eq 'e'; |
103 |
|
104 |
my $writeline= ($is_filtered) ? \&write_flt : \&write_noflt; |
105 |
my $curgrp; #current group # |
106 |
my @grpdata; #when groups are defined, this accumulates HSP data |
107 |
# list of [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame] |
108 |
#my ($grphspcount, $grplen, $grpids, $grpsims); |
109 |
#segments expected, segments collected already, |
110 |
# total bases, total identities, total positives |
111 |
|
112 |
#only one file is accepted here... |
113 |
my ($query, $qlen, $q5, $q3, $hit, $hitlen, $h5, $h3, $ids, $sims, $bcount, $pid, $psim, |
114 |
$score, $evalue, $strand, $frame, $hdefline, $searchdb, $fpos, $fend, $qdefline); |
115 |
my $flush=0; |
116 |
$fpos=-1; |
117 |
$fend=-1; |
118 |
my $method='prog'; |
119 |
$searchdb='n/a'; |
120 |
my @lcdate=localtime(); |
121 |
my $date=($lcdate[4]+1).'/'.$lcdate[3].'/'.(1900+$lcdate[5]); |
122 |
$frame='*'; |
123 |
BIGLOOP: |
124 |
while (<STDIN>) { |
125 |
die "Line $. doesn't appear to be a blast output:\n$_\n" unless (/^(T?BLAST[NXP]?)/); |
126 |
NEXT_QRYBLOCK: |
127 |
$bytes_read+=length($_); |
128 |
next if /^\s*$/; |
129 |
|
130 |
goto QRYFOUND if m/^Query=\s*(\S+)\s*(.*)/; |
131 |
|
132 |
$method=lc($1); |
133 |
do { |
134 |
$_=<STDIN>; $bytes_read+=length($_); |
135 |
|
136 |
last BIGLOOP unless $_; |
137 |
} until m/^Query=\s*(\S+)\s*(.*)/; |
138 |
QRYFOUND: |
139 |
&write_group() if $query && $curgrp; |
140 |
$query=$1; |
141 |
$qdefline=''; |
142 |
if ($defln==1 || $defln==3) { |
143 |
$qdefline=$2; |
144 |
#print STDERR "qdefline set to '$qdefline'\n"; |
145 |
$qdefline =~ tr/\n//d; |
146 |
} |
147 |
my $linepos=$.; |
148 |
do { |
149 |
$_=<STDIN>; $bytes_read+=length($_); |
150 |
last BIGLOOP unless $_; |
151 |
&err_parse("Cannot find 'Length' line for Query $query") |
152 |
if ($.-$linepos) > 10000; |
153 |
} until /\s+\(([\d\,]+) letters/; |
154 |
|
155 |
$qlen=$1; |
156 |
$qlen =~ tr/,//d; |
157 |
# parse all matches for this $query: |
158 |
$_=<STDIN>; $bytes_read+=length($_); |
159 |
my $was_param; |
160 |
while ($_ && !(($was_param=m/^Parameters:/) || m/^EXIT CODE/)) { |
161 |
chomp; |
162 |
if (m/^Database:\s+(\S+)/) { |
163 |
$searchdb=$1; |
164 |
($searchdb)=($searchdb =~ m/([^\/]+)$/); |
165 |
} |
166 |
# Hit section: |
167 |
if (/^>(\S+)\s*(.*)/) { |
168 |
#HSPs here |
169 |
$hit=$1; |
170 |
#print STDERR "hitline is: $_\n"; |
171 |
#if ($defln>=2) { |
172 |
$hdefline=$2; |
173 |
#print STDERR "hdefline set to '$hdefline'\n"; |
174 |
# } |
175 |
NEXT_SUBJ: |
176 |
my $savepos; |
177 |
#&err_parse("Did you use '-noseqs' option for this blast search?") |
178 |
# if ($hit && $score && !$h3); |
179 |
$savepos=$.; |
180 |
while (<STDIN>) { |
181 |
$bytes_read+=length($_); |
182 |
&err_parse("Cannot find 'Length' line for hit $hit") |
183 |
if ($.-$savepos) > 10000; |
184 |
if (/^\s+Length\s+=\s+([\d\,]+)/) { |
185 |
$hitlen=$1; $hitlen =~ tr/,//d; |
186 |
last; |
187 |
} |
188 |
chomp; |
189 |
s/^\s+//; |
190 |
$hdefline.=' '.$_ if $_; |
191 |
} |
192 |
$h3=0;$score=0;$strand='+'; |
193 |
$_=<STDIN>;$bytes_read+=length($_); |
194 |
$savepos=$.+80; |
195 |
my $withseqs=0; |
196 |
while ($_) { |
197 |
#search of all HSPs until Subject match or EOF is encountered |
198 |
#print "&&&&&& $query: found hit $hit, score=$score\n"; |
199 |
if (/^>(\S+)\s*(.*)/) { |
200 |
#print "prevscore= $score, prevhit=$hit, prevcoords=$h3\n"; |
201 |
#&err_parse("-- coordinates not found for hit $hit. Please be sure blastn is given -noseqs option!") |
202 |
# if ($hit && $score && !$h3); |
203 |
&write_group() if $curgrp; |
204 |
chomp; |
205 |
$hit=$1;$hdefline=$2; |
206 |
goto NEXT_SUBJ; |
207 |
} |
208 |
goto SKIP_EMPTYLN if ($_ eq "\n"); |
209 |
if (($was_param=m/^Parameters:/) || m/^EXIT CODE/) { |
210 |
&write_group() if $curgrp; |
211 |
goto NEXT_QUERY; |
212 |
} |
213 |
if (/^\s*Score[\s=]+(\d+)/) { |
214 |
$score=$1; |
215 |
$fpos=$bytes_read-length($_); |
216 |
($evalue)=(m/\,\s*Expect\s*=\s*([\de\.\-]+)/); |
217 |
#print("got e-value $evalue for hit $hit \n"); |
218 |
my ($group)=(m/Group\s+=\s+(\d+)/); |
219 |
&write_group() if ($curgrp && $curgrp!=$group); |
220 |
$curgrp=$group; |
221 |
$savepos=$.+80; |
222 |
} |
223 |
elsif (/^\s*Identities\s+=\s+(\d+)\/(\d+)\s+\(([\d\.]+)\%\)/) { |
224 |
($ids, $bcount, $pid)=($1,$2,$3); |
225 |
$psim=$pid; |
226 |
if (/Positives\s+=\s+(\d+)\/\d+\s+\(([\d\.]+)\%\)/) { |
227 |
($sims, $psim)=($1, $2) |
228 |
} |
229 |
$strand=/Minus/?'Minus':'Plus'; |
230 |
if (/Frame\s+\=\s+([\+\-\d]+)/) { |
231 |
$frame=$1; |
232 |
$strand=(substr($frame,0,1) eq '-') ? 'Minus':'Plus'; |
233 |
} |
234 |
} |
235 |
elsif (m/^Query:\s+/) { #parse query coordinates |
236 |
if (m/^Query:\s*(\d+)[\s]+\-\s+(\d+)/) { #noseqs case |
237 |
($q5,$q3)=($1,$2); |
238 |
$_=<STDIN>;$bytes_read+=length($_); |
239 |
if (m/^Sbjct:\s*(\d+)\s+\-\s+(\d+)/) { #noseqs case |
240 |
($h5,$h3)=($1,$2); |
241 |
$fend=$bytes_read; |
242 |
if ($curgrp) { |
243 |
push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]); |
244 |
#$grphspcount++; |
245 |
#$grpids+=$ids; |
246 |
#$grpsims+=$sims; |
247 |
#$grplen+=$bcount; |
248 |
} |
249 |
else { |
250 |
&$writeline; |
251 |
} |
252 |
} |
253 |
else { |
254 |
&err_parse("Cannot find Sbjct: line for [noseqs] output (query=$query, subj=$hit)"); |
255 |
} |
256 |
} |
257 |
else { #read/parse the whole alignment to get the start-end coordinates |
258 |
#for both query and sequence |
259 |
if (m/^Query:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) { |
260 |
($q5,$q3)=($1,$2); |
261 |
} |
262 |
else { |
263 |
&err_parse("Invalid start of Query: alignment line (query=$query, subj=$hit)"); |
264 |
} |
265 |
$withseqs=1; |
266 |
$_=<STDIN>;$bytes_read+=length($_); |
267 |
$_=<STDIN>;$bytes_read+=length($_); |
268 |
if (m/^Sbjct:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) { |
269 |
($h5, $h3) = ($1, $2); |
270 |
} |
271 |
else { |
272 |
&err_parse("Cannot find start Sbjct: line for alignment (query=$query, subj=$hit)"); |
273 |
} |
274 |
$_=<STDIN>; $bytes_read+=length($_);#skip empty line |
275 |
#$_=<STDIN>; |
276 |
#print STDERR "Alignment continuation is:\n$_\n\n"; |
277 |
#exit(0); |
278 |
while (defined($_=<STDIN>) && m/^Query:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) { |
279 |
$bytes_read+=length($_); |
280 |
#print STDERR "found alignment continuation\n"; |
281 |
$q3=$1; |
282 |
$_=<STDIN>;$bytes_read+=length($_); |
283 |
$_=<STDIN>;$bytes_read+=length($_); |
284 |
if (m/^Sbjct:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) { |
285 |
$h3=$1; |
286 |
} |
287 |
else { |
288 |
&err_parse("Cannot find end of Sbjct: line for alignment (query=$query, subj=$hit)"); |
289 |
} |
290 |
$fend = $bytes_read; |
291 |
$_=<STDIN>;$bytes_read+=length($_); |
292 |
} |
293 |
$fend=$bytes_read; |
294 |
if ($curgrp) { |
295 |
push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]); |
296 |
#$grphspcount++; |
297 |
#$grpids+=$ids; |
298 |
#$grpsims+=$sims; |
299 |
#$grplen+=$bcount; |
300 |
} |
301 |
else { |
302 |
&$writeline; |
303 |
} |
304 |
next; #skip to next HSP, as we might have read it |
305 |
} # alignment parsing |
306 |
} |
307 |
SKIP_EMPTYLN: |
308 |
$_=<STDIN>;$bytes_read+=length($_); |
309 |
&err_parse("--Timeout trying to find all HSPs for query $query and subj $hit") |
310 |
if ($.>$savepos+5000); |
311 |
} |
312 |
&err_parse("--Cannot get hit data for $hit") |
313 |
if ($score==0 || $h3==0); |
314 |
|
315 |
} #--Subject entry |
316 |
$_=<STDIN>;$bytes_read+=length($_); |
317 |
} |
318 |
NEXT_QUERY: |
319 |
do { |
320 |
$_=<STDIN>;$bytes_read+=length($_); |
321 |
last if eof(STDIN); |
322 |
#} until /^\x0C/; # record separator for multi-fasta wu-blast! |
323 |
} until (/^Query=/); |
324 |
goto NEXT_QRYBLOCK; |
325 |
} |
326 |
|
327 |
exit(0); |
328 |
|
329 |
#========================================== |
330 |
#to speed it up, use the variables directly from main program |
331 |
#(not very elegant, but more efficient) |
332 |
|
333 |
sub err_parse { |
334 |
die "Parsing Error at line $.:\n$_\n$_[0]\n"; |
335 |
} |
336 |
|
337 |
sub write_noflt { |
338 |
$hdefline =~ s/^>/\x01/; |
339 |
$hdefline =~ s/ >([\w\|]{6,})/\x01$1/g; |
340 |
my $xtra=$_[0]?"\t$_[0]":""; |
341 |
print join("\t", |
342 |
# 0 1 2 3 4 5 6 7 8 9 10 11 |
343 |
($query, $date, $qlen, $method, $searchdb, $hit, $q5, $q3, $h5, $h3, $pid, $psim, |
344 |
# 12 13 14 15 16 17 18 19 |
345 |
$score, $fpos, $fend, $hdefline, $frame, $strand, $hitlen, $evalue))."$xtra\n"; |
346 |
}; #clean write |
347 |
|
348 |
sub write_flt { |
349 |
#my ($minhlen, $maxoh, $minpid, $minsc, $maxE) |
350 |
my ($minmatch, $maxovhang)=($is_lenperc?(($minhlen*$qlen)/100):$minhlen, |
351 |
$is_ovperc?(($maxoh*$qlen)/100):$maxoh); |
352 |
|
353 |
my ($q_match, $q_beg_over, $q_end_over)=($q5 < $q3)? |
354 |
(($q3 - $q5) + 1, $q5 - 1, $qlen - $q3) : |
355 |
(($q5 - $q3) + 1, $qlen - $q5, $q3 - 1); |
356 |
my ($h_match, $h_beg_over, $h_end_over)= ($h5 < $h3)? |
357 |
( ($h3 - $h5) + 1, $h5 - 1, $hitlen - $h3) : |
358 |
( ($h5 - $h3) + 1, $hitlen - $h5, $h3 - 1); |
359 |
&write_noflt($_[0]) |
360 |
if (($query ne $hit) && |
361 |
(($h_match >=$minmatch) || ($q_match >= $minmatch)) |
362 |
&& (($q_beg_over <= $maxovhang) || ($h_beg_over <= $maxovhang)) |
363 |
&& (($q_end_over <= $maxovhang) || ($h_end_over <= $maxovhang)) |
364 |
&& ($pid>=$minpid) |
365 |
&& ($psim>=$minpsim) |
366 |
&& ($score>=$minscore) |
367 |
&& ($evalue<=$maxE)) |
368 |
} |
369 |
|
370 |
sub write_group { |
371 |
die "Error: write_group called with empty \@grpdata\n" |
372 |
if (@grpdata==0); |
373 |
my (@q5, @q3, @h5, @h3); |
374 |
@grpdata = sort { $main::a->[0] <=> $main::b->[0] } @grpdata; |
375 |
my @details; |
376 |
$score=0; |
377 |
my $hitcov; #non redundant hit coverage -- |
378 |
my @hitsegs; # non-overlapping hit segments |
379 |
my $frm; |
380 |
my ($grplen, $grpids, $grpsims)=(0,0,0); |
381 |
# also recompute $psim, $pid, $q5, $q3, $h5, $h3 |
382 |
# because psim/pid filter may have discarded some HSPs |
383 |
#my ($sumpid, $sumpsim, $sumlen, $minq, $maxq, $minh, $maxh); |
384 |
foreach my $d (@grpdata) { |
385 |
#$d=[$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $numids, $numsims, $numbases] |
386 |
# 0 1 2 3 4 5 6 7 8 9 10 |
387 |
#print STDERR "seg $$d[0]-$$d[1] : $$d[2]-$$d[3]\n"; |
388 |
next unless ($$d[4]>=$minpid && $$d[5]>=$minpsim); |
389 |
$grpids += $$d[8]; |
390 |
$grpsims += $$d[9]; |
391 |
$grplen += $$d[10]; |
392 |
my ($seg_start, $seg_end)=($$d[1]<$$d[0])?($$d[1],$$d[0]) :($$d[0],$$d[1]); |
393 |
my $seglen=$seg_end-$seg_start+1; |
394 |
#$sumpid+=$seglen*$$d[4]; |
395 |
#$sumpsim+=$seglen*$$d[5]; |
396 |
#$sumlen+=$seglen; |
397 |
#$minq=$seg_start if ($minq==0 || $minq>$seg_start); |
398 |
#$maxq=$seg_end if ($maxq==0 || $maxq<$seg_end); |
399 |
|
400 |
push(@q5,$$d[0]);push(@q3,$$d[1]); |
401 |
push(@h5,$$d[2]);push(@h3,$$d[3]); |
402 |
if ($$d[2]>$$d[3]) { |
403 |
#$minh=$$d[3] if ($minh==0 || $minh>$$d[3]); |
404 |
#$maxh=$$d[2] if ($maxh==0 || $maxh<$$d[2]); |
405 |
push(@hitsegs,[$$d[3],$$d[2]] ) |
406 |
} |
407 |
else { |
408 |
#$minh=$$d[3] if ($minh==0 || $minh>$$d[3]); |
409 |
#$maxh=$$d[2] if ($maxh==0 || $maxh<$$d[2]); |
410 |
push(@hitsegs,[$$d[2],$$d[3]]); |
411 |
} |
412 |
push(@details, "$$d[0]-$$d[1]\:$$d[2]-$$d[3]|$$d[5]"); |
413 |
$score+=$$d[6]; |
414 |
$frm=$$d[7] unless $frm; |
415 |
$frame='*' if ($$d[7] ne $frm); |
416 |
} #for each HSP |
417 |
#if ($sumlen>0) { |
418 |
# $psim=$sumpsim/$sumlen; |
419 |
# $pid=$sumpid/$sumlen; |
420 |
# } |
421 |
goto GO_OUT if ($grplen==0); |
422 |
$pid=sprintf('%d',$grpids*100/$grplen); |
423 |
$psim=sprintf('%d',$grpsims*100/$grplen); |
424 |
|
425 |
goto GO_OUT if ($psim<$minpsim || $pid <$minpid); |
426 |
|
427 |
$hitcov=&mergeSegs(\@hitsegs); |
428 |
|
429 |
#@q5 = sort { $main::a <=> $main::b } @q5; |
430 |
#@q3 = sort { $main::a <=> $main::b } @q3; |
431 |
#@h5 = sort { $main::a <=> $main::b } @h5; |
432 |
#@h3 = sort { $main::a <=> $main::b } @h3; |
433 |
#determine actual (non-overlapping) coverage of the hit |
434 |
($q5, $q3, $h5, $h3)=($q5[0], $q3[-1], $h5[0], $h3[-1]); |
435 |
if ($strand eq 'Minus' && ($q5[0]>$q3[0])) { # q5>q3 unless protein search |
436 |
($q5, $q3, $h5, $h3)=($q5[-1], $q3[0], $h5[0], $h3[-1]); |
437 |
@details=reverse(@details); |
438 |
} |
439 |
|
440 |
&$writeline("$hitcov\t".join('~',@details)); |
441 |
#-- clear |
442 |
GO_OUT: |
443 |
@grpdata=(); |
444 |
undef($curgrp); |
445 |
#$grplen=0;$grpids=0;$grpsims=0;$grphspcount=0; |
446 |
} |
447 |
|
448 |
sub mergeSegs { |
449 |
my $a=shift; # ref to list of [h5, h3] |
450 |
my $hcov=0; |
451 |
OVLCHECK: |
452 |
for (my $i=0;$i<(@$a-1); $i++) { |
453 |
for (my $j=$i+1;$j<@$a; $j++) { |
454 |
my ($li, $ri, $lj, $rj)= |
455 |
($$a[$i]->[0], $$a[$i]->[1], $$a[$j]->[0], $$a[$j]->[1]); |
456 |
if ($lj<=$ri && $lj>=$li) { |
457 |
$$a[$i]->[0]=$li; |
458 |
$$a[$i]->[1]=($rj>$ri) ? $rj : $ri; |
459 |
splice(@$a, $j,1); |
460 |
goto OVLCHECK; |
461 |
} |
462 |
elsif ($li<=$rj && $li>=$lj) { |
463 |
$$a[$i]->[0]=$lj; |
464 |
$$a[$i]->[1]=($rj>$ri) ? $rj : $ri; |
465 |
splice(@$a, $j,1); |
466 |
goto OVLCHECK; |
467 |
} |
468 |
} # for j |
469 |
} # for i |
470 |
foreach my $d (@$a) { |
471 |
$hcov+=($$d[1]-$$d[0]+1); |
472 |
} |
473 |
return $hcov; |
474 |
} |