ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/blast2btab
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years ago) by gpertea
File size: 16146 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2
3 #some filtering can be performed on the output
4 #
5 # this should be rewritten in C/C++ to be faster
6
7 use strict;
8
9 my $usage = q{
10 blast2btab [HITLEN=nn[%]] [OVHANG=nn[%]] [PID=nn] [PSIM=nn]
11 [SCORE=nnnn | E=nnnnn]
12
13 Format and filter wu-blast2 output coming at stdin. The output consists of
14 the following tab delimited fields:
15 0 1 2 3 4 5
16 q_name, date, q_len, method, db_file, hit_name,
17 6 7 8 9 10 11 12
18 q_n5, q_n3, hit_n5, hit_n3, %id, %sim, score,
19 13 14 15 16
20 file_pos1, file_pos2, hit_description, Frame,
21 17 18 19 [ 20 21 ]
22 strand, hit_len, E-value [ hit-coverage HSPs ]
23
24 Filtering:
25 PID for minimum percent identity
26 PSIM for minimum percent similarity
27 SCORE for minimum alignment score
28 E for maximum E value (Blast E parameter)
29 HITLEN for minimum overlap length
30 OVHANG for maximum overhang length (mismatched overhangs)
31
32 If '%' is specified for HITLEN or OVHANG values, the actual cutoff
33 lengths will be computed as percentages of the query sequence length;
34
35 If -topcomboX option is used, each line will represent a group, with the individual
36 segment hits in the 21th field, and the actual hit coverage length as the 20th field
37 };
38
39 die($usage) if ($ARGV[0]=~/\-h|\-?\-help/i);
40 my $_bignum=9999999;
41 my %a=('HITLEN'=>0,
42 'OVHANG'=>$_bignum,
43 'PID'=>0,
44 'PSIM'=>0,
45 'SCORE'=>0,
46 'E'=>$_bignum,
47 'DEFLINE'=>'none',
48 'PARSE'=>0
49 );
50 # try to parse filtering options, if any;
51 my ($is_lenperc, $is_ovperc)=(0,0);
52 my $is_filtered=0;
53 my $bytes_read=0;
54 foreach (@ARGV) {
55 my @param=split(/=/);
56 $is_filtered=1 if ($param[0] ne 'DEFLINE');
57 die $usage if (@param<2 || not exists($a{$param[0]}));
58 if ($param[0] eq 'HITLEN') {
59 my $p=rindex($param[1],'%');
60 if ($p>0) {
61 $is_lenperc=1;
62 $param[1]=substr($param[1],0,$p);
63 }
64 }
65 elsif ($param[0] eq 'OVHANG') {
66 my $p=rindex($param[1],'%');
67 if ($p>=0) {
68 $is_ovperc=1;
69 $param[1]=substr($param[1],0,$p);
70 }
71 }
72 $a{$param[0]}=$param[1];
73 }
74 my $defln=lc($a{'DEFLINE'});
75 if ($defln eq 'none') {
76 $defln=0;
77 }
78 elsif ($defln eq 'q') {
79 $defln=1;
80 }
81 elsif ($defln eq 'h') {
82 $defln=2;
83 }
84 elsif ($defln eq 'b') {
85 $defln=3;
86 }
87 else {
88 die "$usage. Error at DEFLINE option.";
89 }
90
91 $/ = "\n";
92 #$\ = "\n";
93 #$, = "\t";
94 my ($minhlen, $maxoh, $doparse, $minpid, $minpsim, $minscore, $maxE)=(
95 $a{'HITLEN'},
96 $a{'OVHANG'},
97 $a{'PARSE'},
98 $a{'PID'},
99 $a{'PSIM'},
100 $a{'SCORE'},
101 $a{'E'});
102 $maxE='1'.$maxE if lc(substr($maxE,0,1)) eq 'e';
103
104 my $writeline= ($is_filtered) ? \&write_flt : \&write_noflt;
105 my $curgrp; #current group #
106 my @grpdata; #when groups are defined, this accumulates HSP data
107 # list of [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame]
108 #my ($grphspcount, $grplen, $grpids, $grpsims);
109 #segments expected, segments collected already,
110 # total bases, total identities, total positives
111
112 #only one file is accepted here...
113 my ($query, $qlen, $q5, $q3, $hit, $hitlen, $h5, $h3, $ids, $sims, $bcount, $pid, $psim,
114 $score, $evalue, $strand, $frame, $hdefline, $searchdb, $fpos, $fend, $qdefline);
115 my $flush=0;
116 $fpos=-1;
117 $fend=-1;
118 my $method='prog';
119 $searchdb='n/a';
120 my @lcdate=localtime();
121 my $date=($lcdate[4]+1).'/'.$lcdate[3].'/'.(1900+$lcdate[5]);
122 $frame='*';
123 BIGLOOP:
124 while (<STDIN>) {
125 die "Line $. doesn't appear to be a blast output:\n$_\n" unless (/^(T?BLAST[NXP]?)/);
126 NEXT_QRYBLOCK:
127 $bytes_read+=length($_);
128 next if /^\s*$/;
129
130 goto QRYFOUND if m/^Query=\s*(\S+)\s*(.*)/;
131
132 $method=lc($1);
133 do {
134 $_=<STDIN>; $bytes_read+=length($_);
135
136 last BIGLOOP unless $_;
137 } until m/^Query=\s*(\S+)\s*(.*)/;
138 QRYFOUND:
139 &write_group() if $query && $curgrp;
140 $query=$1;
141 $qdefline='';
142 if ($defln==1 || $defln==3) {
143 $qdefline=$2;
144 #print STDERR "qdefline set to '$qdefline'\n";
145 $qdefline =~ tr/\n//d;
146 }
147 my $linepos=$.;
148 do {
149 $_=<STDIN>; $bytes_read+=length($_);
150 last BIGLOOP unless $_;
151 &err_parse("Cannot find 'Length' line for Query $query")
152 if ($.-$linepos) > 10000;
153 } until /\s+\(([\d\,]+) letters/;
154
155 $qlen=$1;
156 $qlen =~ tr/,//d;
157 # parse all matches for this $query:
158 $_=<STDIN>; $bytes_read+=length($_);
159 my $was_param;
160 while ($_ && !(($was_param=m/^Parameters:/) || m/^EXIT CODE/)) {
161 chomp;
162 if (m/^Database:\s+(\S+)/) {
163 $searchdb=$1;
164 ($searchdb)=($searchdb =~ m/([^\/]+)$/);
165 }
166 # Hit section:
167 if (/^>(\S+)\s*(.*)/) {
168 #HSPs here
169 $hit=$1;
170 #print STDERR "hitline is: $_\n";
171 #if ($defln>=2) {
172 $hdefline=$2;
173 #print STDERR "hdefline set to '$hdefline'\n";
174 # }
175 NEXT_SUBJ:
176 my $savepos;
177 #&err_parse("Did you use '-noseqs' option for this blast search?")
178 # if ($hit && $score && !$h3);
179 $savepos=$.;
180 while (<STDIN>) {
181 $bytes_read+=length($_);
182 &err_parse("Cannot find 'Length' line for hit $hit")
183 if ($.-$savepos) > 10000;
184 if (/^\s+Length\s+=\s+([\d\,]+)/) {
185 $hitlen=$1; $hitlen =~ tr/,//d;
186 last;
187 }
188 chomp;
189 s/^\s+//;
190 $hdefline.=' '.$_ if $_;
191 }
192 $h3=0;$score=0;$strand='+';
193 $_=<STDIN>;$bytes_read+=length($_);
194 $savepos=$.+80;
195 my $withseqs=0;
196 while ($_) {
197 #search of all HSPs until Subject match or EOF is encountered
198 #print "&&&&&& $query: found hit $hit, score=$score\n";
199 if (/^>(\S+)\s*(.*)/) {
200 #print "prevscore= $score, prevhit=$hit, prevcoords=$h3\n";
201 #&err_parse("-- coordinates not found for hit $hit. Please be sure blastn is given -noseqs option!")
202 # if ($hit && $score && !$h3);
203 &write_group() if $curgrp;
204 chomp;
205 $hit=$1;$hdefline=$2;
206 goto NEXT_SUBJ;
207 }
208 goto SKIP_EMPTYLN if ($_ eq "\n");
209 if (($was_param=m/^Parameters:/) || m/^EXIT CODE/) {
210 &write_group() if $curgrp;
211 goto NEXT_QUERY;
212 }
213 if (/^\s*Score[\s=]+(\d+)/) {
214 $score=$1;
215 $fpos=$bytes_read-length($_);
216 ($evalue)=(m/\,\s*Expect\s*=\s*([\de\.\-]+)/);
217 #print("got e-value $evalue for hit $hit \n");
218 my ($group)=(m/Group\s+=\s+(\d+)/);
219 &write_group() if ($curgrp && $curgrp!=$group);
220 $curgrp=$group;
221 $savepos=$.+80;
222 }
223 elsif (/^\s*Identities\s+=\s+(\d+)\/(\d+)\s+\(([\d\.]+)\%\)/) {
224 ($ids, $bcount, $pid)=($1,$2,$3);
225 $psim=$pid;
226 if (/Positives\s+=\s+(\d+)\/\d+\s+\(([\d\.]+)\%\)/) {
227 ($sims, $psim)=($1, $2)
228 }
229 $strand=/Minus/?'Minus':'Plus';
230 if (/Frame\s+\=\s+([\+\-\d]+)/) {
231 $frame=$1;
232 $strand=(substr($frame,0,1) eq '-') ? 'Minus':'Plus';
233 }
234 }
235 elsif (m/^Query:\s+/) { #parse query coordinates
236 if (m/^Query:\s*(\d+)[\s]+\-\s+(\d+)/) { #noseqs case
237 ($q5,$q3)=($1,$2);
238 $_=<STDIN>;$bytes_read+=length($_);
239 if (m/^Sbjct:\s*(\d+)\s+\-\s+(\d+)/) { #noseqs case
240 ($h5,$h3)=($1,$2);
241 $fend=$bytes_read;
242 if ($curgrp) {
243 push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]);
244 #$grphspcount++;
245 #$grpids+=$ids;
246 #$grpsims+=$sims;
247 #$grplen+=$bcount;
248 }
249 else {
250 &$writeline;
251 }
252 }
253 else {
254 &err_parse("Cannot find Sbjct: line for [noseqs] output (query=$query, subj=$hit)");
255 }
256 }
257 else { #read/parse the whole alignment to get the start-end coordinates
258 #for both query and sequence
259 if (m/^Query:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
260 ($q5,$q3)=($1,$2);
261 }
262 else {
263 &err_parse("Invalid start of Query: alignment line (query=$query, subj=$hit)");
264 }
265 $withseqs=1;
266 $_=<STDIN>;$bytes_read+=length($_);
267 $_=<STDIN>;$bytes_read+=length($_);
268 if (m/^Sbjct:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
269 ($h5, $h3) = ($1, $2);
270 }
271 else {
272 &err_parse("Cannot find start Sbjct: line for alignment (query=$query, subj=$hit)");
273 }
274 $_=<STDIN>; $bytes_read+=length($_);#skip empty line
275 #$_=<STDIN>;
276 #print STDERR "Alignment continuation is:\n$_\n\n";
277 #exit(0);
278 while (defined($_=<STDIN>) && m/^Query:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
279 $bytes_read+=length($_);
280 #print STDERR "found alignment continuation\n";
281 $q3=$1;
282 $_=<STDIN>;$bytes_read+=length($_);
283 $_=<STDIN>;$bytes_read+=length($_);
284 if (m/^Sbjct:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
285 $h3=$1;
286 }
287 else {
288 &err_parse("Cannot find end of Sbjct: line for alignment (query=$query, subj=$hit)");
289 }
290 $fend = $bytes_read;
291 $_=<STDIN>;$bytes_read+=length($_);
292 }
293 $fend=$bytes_read;
294 if ($curgrp) {
295 push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]);
296 #$grphspcount++;
297 #$grpids+=$ids;
298 #$grpsims+=$sims;
299 #$grplen+=$bcount;
300 }
301 else {
302 &$writeline;
303 }
304 next; #skip to next HSP, as we might have read it
305 } # alignment parsing
306 }
307 SKIP_EMPTYLN:
308 $_=<STDIN>;$bytes_read+=length($_);
309 &err_parse("--Timeout trying to find all HSPs for query $query and subj $hit")
310 if ($.>$savepos+5000);
311 }
312 &err_parse("--Cannot get hit data for $hit")
313 if ($score==0 || $h3==0);
314
315 } #--Subject entry
316 $_=<STDIN>;$bytes_read+=length($_);
317 }
318 NEXT_QUERY:
319 do {
320 $_=<STDIN>;$bytes_read+=length($_);
321 last if eof(STDIN);
322 #} until /^\x0C/; # record separator for multi-fasta wu-blast!
323 } until (/^Query=/);
324 goto NEXT_QRYBLOCK;
325 }
326
327 exit(0);
328
329 #==========================================
330 #to speed it up, use the variables directly from main program
331 #(not very elegant, but more efficient)
332
333 sub err_parse {
334 die "Parsing Error at line $.:\n$_\n$_[0]\n";
335 }
336
337 sub write_noflt {
338 $hdefline =~ s/^>/\x01/;
339 $hdefline =~ s/ >([\w\|]{6,})/\x01$1/g;
340 my $xtra=$_[0]?"\t$_[0]":"";
341 print join("\t",
342 # 0 1 2 3 4 5 6 7 8 9 10 11
343 ($query, $date, $qlen, $method, $searchdb, $hit, $q5, $q3, $h5, $h3, $pid, $psim,
344 # 12 13 14 15 16 17 18 19
345 $score, $fpos, $fend, $hdefline, $frame, $strand, $hitlen, $evalue))."$xtra\n";
346 }; #clean write
347
348 sub write_flt {
349 #my ($minhlen, $maxoh, $minpid, $minsc, $maxE)
350 my ($minmatch, $maxovhang)=($is_lenperc?(($minhlen*$qlen)/100):$minhlen,
351 $is_ovperc?(($maxoh*$qlen)/100):$maxoh);
352
353 my ($q_match, $q_beg_over, $q_end_over)=($q5 < $q3)?
354 (($q3 - $q5) + 1, $q5 - 1, $qlen - $q3) :
355 (($q5 - $q3) + 1, $qlen - $q5, $q3 - 1);
356 my ($h_match, $h_beg_over, $h_end_over)= ($h5 < $h3)?
357 ( ($h3 - $h5) + 1, $h5 - 1, $hitlen - $h3) :
358 ( ($h5 - $h3) + 1, $hitlen - $h5, $h3 - 1);
359 &write_noflt($_[0])
360 if (($query ne $hit) &&
361 (($h_match >=$minmatch) || ($q_match >= $minmatch))
362 && (($q_beg_over <= $maxovhang) || ($h_beg_over <= $maxovhang))
363 && (($q_end_over <= $maxovhang) || ($h_end_over <= $maxovhang))
364 && ($pid>=$minpid)
365 && ($psim>=$minpsim)
366 && ($score>=$minscore)
367 && ($evalue<=$maxE))
368 }
369
370 sub write_group {
371 die "Error: write_group called with empty \@grpdata\n"
372 if (@grpdata==0);
373 my (@q5, @q3, @h5, @h3);
374 @grpdata = sort { $main::a->[0] <=> $main::b->[0] } @grpdata;
375 my @details;
376 $score=0;
377 my $hitcov; #non redundant hit coverage --
378 my @hitsegs; # non-overlapping hit segments
379 my $frm;
380 my ($grplen, $grpids, $grpsims)=(0,0,0);
381 # also recompute $psim, $pid, $q5, $q3, $h5, $h3
382 # because psim/pid filter may have discarded some HSPs
383 #my ($sumpid, $sumpsim, $sumlen, $minq, $maxq, $minh, $maxh);
384 foreach my $d (@grpdata) {
385 #$d=[$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $numids, $numsims, $numbases]
386 # 0 1 2 3 4 5 6 7 8 9 10
387 #print STDERR "seg $$d[0]-$$d[1] : $$d[2]-$$d[3]\n";
388 next unless ($$d[4]>=$minpid && $$d[5]>=$minpsim);
389 $grpids += $$d[8];
390 $grpsims += $$d[9];
391 $grplen += $$d[10];
392 my ($seg_start, $seg_end)=($$d[1]<$$d[0])?($$d[1],$$d[0]) :($$d[0],$$d[1]);
393 my $seglen=$seg_end-$seg_start+1;
394 #$sumpid+=$seglen*$$d[4];
395 #$sumpsim+=$seglen*$$d[5];
396 #$sumlen+=$seglen;
397 #$minq=$seg_start if ($minq==0 || $minq>$seg_start);
398 #$maxq=$seg_end if ($maxq==0 || $maxq<$seg_end);
399
400 push(@q5,$$d[0]);push(@q3,$$d[1]);
401 push(@h5,$$d[2]);push(@h3,$$d[3]);
402 if ($$d[2]>$$d[3]) {
403 #$minh=$$d[3] if ($minh==0 || $minh>$$d[3]);
404 #$maxh=$$d[2] if ($maxh==0 || $maxh<$$d[2]);
405 push(@hitsegs,[$$d[3],$$d[2]] )
406 }
407 else {
408 #$minh=$$d[3] if ($minh==0 || $minh>$$d[3]);
409 #$maxh=$$d[2] if ($maxh==0 || $maxh<$$d[2]);
410 push(@hitsegs,[$$d[2],$$d[3]]);
411 }
412 push(@details, "$$d[0]-$$d[1]\:$$d[2]-$$d[3]|$$d[5]");
413 $score+=$$d[6];
414 $frm=$$d[7] unless $frm;
415 $frame='*' if ($$d[7] ne $frm);
416 } #for each HSP
417 #if ($sumlen>0) {
418 # $psim=$sumpsim/$sumlen;
419 # $pid=$sumpid/$sumlen;
420 # }
421 goto GO_OUT if ($grplen==0);
422 $pid=sprintf('%d',$grpids*100/$grplen);
423 $psim=sprintf('%d',$grpsims*100/$grplen);
424
425 goto GO_OUT if ($psim<$minpsim || $pid <$minpid);
426
427 $hitcov=&mergeSegs(\@hitsegs);
428
429 #@q5 = sort { $main::a <=> $main::b } @q5;
430 #@q3 = sort { $main::a <=> $main::b } @q3;
431 #@h5 = sort { $main::a <=> $main::b } @h5;
432 #@h3 = sort { $main::a <=> $main::b } @h3;
433 #determine actual (non-overlapping) coverage of the hit
434 ($q5, $q3, $h5, $h3)=($q5[0], $q3[-1], $h5[0], $h3[-1]);
435 if ($strand eq 'Minus' && ($q5[0]>$q3[0])) { # q5>q3 unless protein search
436 ($q5, $q3, $h5, $h3)=($q5[-1], $q3[0], $h5[0], $h3[-1]);
437 @details=reverse(@details);
438 }
439
440 &$writeline("$hitcov\t".join('~',@details));
441 #-- clear
442 GO_OUT:
443 @grpdata=();
444 undef($curgrp);
445 #$grplen=0;$grpids=0;$grpsims=0;$grphspcount=0;
446 }
447
448 sub mergeSegs {
449 my $a=shift; # ref to list of [h5, h3]
450 my $hcov=0;
451 OVLCHECK:
452 for (my $i=0;$i<(@$a-1); $i++) {
453 for (my $j=$i+1;$j<@$a; $j++) {
454 my ($li, $ri, $lj, $rj)=
455 ($$a[$i]->[0], $$a[$i]->[1], $$a[$j]->[0], $$a[$j]->[1]);
456 if ($lj<=$ri && $lj>=$li) {
457 $$a[$i]->[0]=$li;
458 $$a[$i]->[1]=($rj>$ri) ? $rj : $ri;
459 splice(@$a, $j,1);
460 goto OVLCHECK;
461 }
462 elsif ($li<=$rj && $li>=$lj) {
463 $$a[$i]->[0]=$lj;
464 $$a[$i]->[1]=($rj>$ri) ? $rj : $ri;
465 splice(@$a, $j,1);
466 goto OVLCHECK;
467 }
468 } # for j
469 } # for i
470 foreach my $d (@$a) {
471 $hcov+=($$d[1]-$$d[0]+1);
472 }
473 return $hcov;
474 }

Properties

Name Value
svn:executable *