ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/blast2btab
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 16146 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2    
3     #some filtering can be performed on the output
4     #
5     # this should be rewritten in C/C++ to be faster
6    
7     use strict;
8    
9     my $usage = q{
10     blast2btab [HITLEN=nn[%]] [OVHANG=nn[%]] [PID=nn] [PSIM=nn]
11     [SCORE=nnnn | E=nnnnn]
12    
13     Format and filter wu-blast2 output coming at stdin. The output consists of
14     the following tab delimited fields:
15     0 1 2 3 4 5
16     q_name, date, q_len, method, db_file, hit_name,
17     6 7 8 9 10 11 12
18     q_n5, q_n3, hit_n5, hit_n3, %id, %sim, score,
19     13 14 15 16
20     file_pos1, file_pos2, hit_description, Frame,
21     17 18 19 [ 20 21 ]
22     strand, hit_len, E-value [ hit-coverage HSPs ]
23    
24     Filtering:
25     PID for minimum percent identity
26     PSIM for minimum percent similarity
27     SCORE for minimum alignment score
28     E for maximum E value (Blast E parameter)
29     HITLEN for minimum overlap length
30     OVHANG for maximum overhang length (mismatched overhangs)
31    
32     If '%' is specified for HITLEN or OVHANG values, the actual cutoff
33     lengths will be computed as percentages of the query sequence length;
34    
35     If -topcomboX option is used, each line will represent a group, with the individual
36     segment hits in the 21th field, and the actual hit coverage length as the 20th field
37     };
38    
39     die($usage) if ($ARGV[0]=~/\-h|\-?\-help/i);
40     my $_bignum=9999999;
41     my %a=('HITLEN'=>0,
42     'OVHANG'=>$_bignum,
43     'PID'=>0,
44     'PSIM'=>0,
45     'SCORE'=>0,
46     'E'=>$_bignum,
47     'DEFLINE'=>'none',
48     'PARSE'=>0
49     );
50     # try to parse filtering options, if any;
51     my ($is_lenperc, $is_ovperc)=(0,0);
52     my $is_filtered=0;
53     my $bytes_read=0;
54     foreach (@ARGV) {
55     my @param=split(/=/);
56     $is_filtered=1 if ($param[0] ne 'DEFLINE');
57     die $usage if (@param<2 || not exists($a{$param[0]}));
58     if ($param[0] eq 'HITLEN') {
59     my $p=rindex($param[1],'%');
60     if ($p>0) {
61     $is_lenperc=1;
62     $param[1]=substr($param[1],0,$p);
63     }
64     }
65     elsif ($param[0] eq 'OVHANG') {
66     my $p=rindex($param[1],'%');
67     if ($p>=0) {
68     $is_ovperc=1;
69     $param[1]=substr($param[1],0,$p);
70     }
71     }
72     $a{$param[0]}=$param[1];
73     }
74     my $defln=lc($a{'DEFLINE'});
75     if ($defln eq 'none') {
76     $defln=0;
77     }
78     elsif ($defln eq 'q') {
79     $defln=1;
80     }
81     elsif ($defln eq 'h') {
82     $defln=2;
83     }
84     elsif ($defln eq 'b') {
85     $defln=3;
86     }
87     else {
88     die "$usage. Error at DEFLINE option.";
89     }
90    
91     $/ = "\n";
92     #$\ = "\n";
93     #$, = "\t";
94     my ($minhlen, $maxoh, $doparse, $minpid, $minpsim, $minscore, $maxE)=(
95     $a{'HITLEN'},
96     $a{'OVHANG'},
97     $a{'PARSE'},
98     $a{'PID'},
99     $a{'PSIM'},
100     $a{'SCORE'},
101     $a{'E'});
102     $maxE='1'.$maxE if lc(substr($maxE,0,1)) eq 'e';
103    
104     my $writeline= ($is_filtered) ? \&write_flt : \&write_noflt;
105     my $curgrp; #current group #
106     my @grpdata; #when groups are defined, this accumulates HSP data
107     # list of [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame]
108     #my ($grphspcount, $grplen, $grpids, $grpsims);
109     #segments expected, segments collected already,
110     # total bases, total identities, total positives
111    
112     #only one file is accepted here...
113     my ($query, $qlen, $q5, $q3, $hit, $hitlen, $h5, $h3, $ids, $sims, $bcount, $pid, $psim,
114     $score, $evalue, $strand, $frame, $hdefline, $searchdb, $fpos, $fend, $qdefline);
115     my $flush=0;
116     $fpos=-1;
117     $fend=-1;
118     my $method='prog';
119     $searchdb='n/a';
120     my @lcdate=localtime();
121     my $date=($lcdate[4]+1).'/'.$lcdate[3].'/'.(1900+$lcdate[5]);
122     $frame='*';
123     BIGLOOP:
124     while (<STDIN>) {
125     die "Line $. doesn't appear to be a blast output:\n$_\n" unless (/^(T?BLAST[NXP]?)/);
126     NEXT_QRYBLOCK:
127     $bytes_read+=length($_);
128     next if /^\s*$/;
129    
130     goto QRYFOUND if m/^Query=\s*(\S+)\s*(.*)/;
131    
132     $method=lc($1);
133     do {
134     $_=<STDIN>; $bytes_read+=length($_);
135    
136     last BIGLOOP unless $_;
137     } until m/^Query=\s*(\S+)\s*(.*)/;
138     QRYFOUND:
139     &write_group() if $query && $curgrp;
140     $query=$1;
141     $qdefline='';
142     if ($defln==1 || $defln==3) {
143     $qdefline=$2;
144     #print STDERR "qdefline set to '$qdefline'\n";
145     $qdefline =~ tr/\n//d;
146     }
147     my $linepos=$.;
148     do {
149     $_=<STDIN>; $bytes_read+=length($_);
150     last BIGLOOP unless $_;
151     &err_parse("Cannot find 'Length' line for Query $query")
152     if ($.-$linepos) > 10000;
153     } until /\s+\(([\d\,]+) letters/;
154    
155     $qlen=$1;
156     $qlen =~ tr/,//d;
157     # parse all matches for this $query:
158     $_=<STDIN>; $bytes_read+=length($_);
159     my $was_param;
160     while ($_ && !(($was_param=m/^Parameters:/) || m/^EXIT CODE/)) {
161     chomp;
162     if (m/^Database:\s+(\S+)/) {
163     $searchdb=$1;
164     ($searchdb)=($searchdb =~ m/([^\/]+)$/);
165     }
166     # Hit section:
167     if (/^>(\S+)\s*(.*)/) {
168     #HSPs here
169     $hit=$1;
170     #print STDERR "hitline is: $_\n";
171     #if ($defln>=2) {
172     $hdefline=$2;
173     #print STDERR "hdefline set to '$hdefline'\n";
174     # }
175     NEXT_SUBJ:
176     my $savepos;
177     #&err_parse("Did you use '-noseqs' option for this blast search?")
178     # if ($hit && $score && !$h3);
179     $savepos=$.;
180     while (<STDIN>) {
181     $bytes_read+=length($_);
182     &err_parse("Cannot find 'Length' line for hit $hit")
183     if ($.-$savepos) > 10000;
184     if (/^\s+Length\s+=\s+([\d\,]+)/) {
185     $hitlen=$1; $hitlen =~ tr/,//d;
186     last;
187     }
188     chomp;
189     s/^\s+//;
190     $hdefline.=' '.$_ if $_;
191     }
192     $h3=0;$score=0;$strand='+';
193     $_=<STDIN>;$bytes_read+=length($_);
194     $savepos=$.+80;
195     my $withseqs=0;
196     while ($_) {
197     #search of all HSPs until Subject match or EOF is encountered
198     #print "&&&&&& $query: found hit $hit, score=$score\n";
199     if (/^>(\S+)\s*(.*)/) {
200     #print "prevscore= $score, prevhit=$hit, prevcoords=$h3\n";
201     #&err_parse("-- coordinates not found for hit $hit. Please be sure blastn is given -noseqs option!")
202     # if ($hit && $score && !$h3);
203     &write_group() if $curgrp;
204     chomp;
205     $hit=$1;$hdefline=$2;
206     goto NEXT_SUBJ;
207     }
208     goto SKIP_EMPTYLN if ($_ eq "\n");
209     if (($was_param=m/^Parameters:/) || m/^EXIT CODE/) {
210     &write_group() if $curgrp;
211     goto NEXT_QUERY;
212     }
213     if (/^\s*Score[\s=]+(\d+)/) {
214     $score=$1;
215     $fpos=$bytes_read-length($_);
216     ($evalue)=(m/\,\s*Expect\s*=\s*([\de\.\-]+)/);
217     #print("got e-value $evalue for hit $hit \n");
218     my ($group)=(m/Group\s+=\s+(\d+)/);
219     &write_group() if ($curgrp && $curgrp!=$group);
220     $curgrp=$group;
221     $savepos=$.+80;
222     }
223     elsif (/^\s*Identities\s+=\s+(\d+)\/(\d+)\s+\(([\d\.]+)\%\)/) {
224     ($ids, $bcount, $pid)=($1,$2,$3);
225     $psim=$pid;
226     if (/Positives\s+=\s+(\d+)\/\d+\s+\(([\d\.]+)\%\)/) {
227     ($sims, $psim)=($1, $2)
228     }
229     $strand=/Minus/?'Minus':'Plus';
230     if (/Frame\s+\=\s+([\+\-\d]+)/) {
231     $frame=$1;
232     $strand=(substr($frame,0,1) eq '-') ? 'Minus':'Plus';
233     }
234     }
235     elsif (m/^Query:\s+/) { #parse query coordinates
236     if (m/^Query:\s*(\d+)[\s]+\-\s+(\d+)/) { #noseqs case
237     ($q5,$q3)=($1,$2);
238     $_=<STDIN>;$bytes_read+=length($_);
239     if (m/^Sbjct:\s*(\d+)\s+\-\s+(\d+)/) { #noseqs case
240     ($h5,$h3)=($1,$2);
241     $fend=$bytes_read;
242     if ($curgrp) {
243     push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]);
244     #$grphspcount++;
245     #$grpids+=$ids;
246     #$grpsims+=$sims;
247     #$grplen+=$bcount;
248     }
249     else {
250     &$writeline;
251     }
252     }
253     else {
254     &err_parse("Cannot find Sbjct: line for [noseqs] output (query=$query, subj=$hit)");
255     }
256     }
257     else { #read/parse the whole alignment to get the start-end coordinates
258     #for both query and sequence
259     if (m/^Query:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
260     ($q5,$q3)=($1,$2);
261     }
262     else {
263     &err_parse("Invalid start of Query: alignment line (query=$query, subj=$hit)");
264     }
265     $withseqs=1;
266     $_=<STDIN>;$bytes_read+=length($_);
267     $_=<STDIN>;$bytes_read+=length($_);
268     if (m/^Sbjct:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
269     ($h5, $h3) = ($1, $2);
270     }
271     else {
272     &err_parse("Cannot find start Sbjct: line for alignment (query=$query, subj=$hit)");
273     }
274     $_=<STDIN>; $bytes_read+=length($_);#skip empty line
275     #$_=<STDIN>;
276     #print STDERR "Alignment continuation is:\n$_\n\n";
277     #exit(0);
278     while (defined($_=<STDIN>) && m/^Query:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
279     $bytes_read+=length($_);
280     #print STDERR "found alignment continuation\n";
281     $q3=$1;
282     $_=<STDIN>;$bytes_read+=length($_);
283     $_=<STDIN>;$bytes_read+=length($_);
284     if (m/^Sbjct:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
285     $h3=$1;
286     }
287     else {
288     &err_parse("Cannot find end of Sbjct: line for alignment (query=$query, subj=$hit)");
289     }
290     $fend = $bytes_read;
291     $_=<STDIN>;$bytes_read+=length($_);
292     }
293     $fend=$bytes_read;
294     if ($curgrp) {
295     push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]);
296     #$grphspcount++;
297     #$grpids+=$ids;
298     #$grpsims+=$sims;
299     #$grplen+=$bcount;
300     }
301     else {
302     &$writeline;
303     }
304     next; #skip to next HSP, as we might have read it
305     } # alignment parsing
306     }
307     SKIP_EMPTYLN:
308     $_=<STDIN>;$bytes_read+=length($_);
309     &err_parse("--Timeout trying to find all HSPs for query $query and subj $hit")
310     if ($.>$savepos+5000);
311     }
312     &err_parse("--Cannot get hit data for $hit")
313     if ($score==0 || $h3==0);
314    
315     } #--Subject entry
316     $_=<STDIN>;$bytes_read+=length($_);
317     }
318     NEXT_QUERY:
319     do {
320     $_=<STDIN>;$bytes_read+=length($_);
321     last if eof(STDIN);
322     #} until /^\x0C/; # record separator for multi-fasta wu-blast!
323     } until (/^Query=/);
324     goto NEXT_QRYBLOCK;
325     }
326    
327     exit(0);
328    
329     #==========================================
330     #to speed it up, use the variables directly from main program
331     #(not very elegant, but more efficient)
332    
333     sub err_parse {
334     die "Parsing Error at line $.:\n$_\n$_[0]\n";
335     }
336    
337     sub write_noflt {
338     $hdefline =~ s/^>/\x01/;
339     $hdefline =~ s/ >([\w\|]{6,})/\x01$1/g;
340     my $xtra=$_[0]?"\t$_[0]":"";
341     print join("\t",
342     # 0 1 2 3 4 5 6 7 8 9 10 11
343     ($query, $date, $qlen, $method, $searchdb, $hit, $q5, $q3, $h5, $h3, $pid, $psim,
344     # 12 13 14 15 16 17 18 19
345     $score, $fpos, $fend, $hdefline, $frame, $strand, $hitlen, $evalue))."$xtra\n";
346     }; #clean write
347    
348     sub write_flt {
349     #my ($minhlen, $maxoh, $minpid, $minsc, $maxE)
350     my ($minmatch, $maxovhang)=($is_lenperc?(($minhlen*$qlen)/100):$minhlen,
351     $is_ovperc?(($maxoh*$qlen)/100):$maxoh);
352    
353     my ($q_match, $q_beg_over, $q_end_over)=($q5 < $q3)?
354     (($q3 - $q5) + 1, $q5 - 1, $qlen - $q3) :
355     (($q5 - $q3) + 1, $qlen - $q5, $q3 - 1);
356     my ($h_match, $h_beg_over, $h_end_over)= ($h5 < $h3)?
357     ( ($h3 - $h5) + 1, $h5 - 1, $hitlen - $h3) :
358     ( ($h5 - $h3) + 1, $hitlen - $h5, $h3 - 1);
359     &write_noflt($_[0])
360     if (($query ne $hit) &&
361     (($h_match >=$minmatch) || ($q_match >= $minmatch))
362     && (($q_beg_over <= $maxovhang) || ($h_beg_over <= $maxovhang))
363     && (($q_end_over <= $maxovhang) || ($h_end_over <= $maxovhang))
364     && ($pid>=$minpid)
365     && ($psim>=$minpsim)
366     && ($score>=$minscore)
367     && ($evalue<=$maxE))
368     }
369    
370     sub write_group {
371     die "Error: write_group called with empty \@grpdata\n"
372     if (@grpdata==0);
373     my (@q5, @q3, @h5, @h3);
374     @grpdata = sort { $main::a->[0] <=> $main::b->[0] } @grpdata;
375     my @details;
376     $score=0;
377     my $hitcov; #non redundant hit coverage --
378     my @hitsegs; # non-overlapping hit segments
379     my $frm;
380     my ($grplen, $grpids, $grpsims)=(0,0,0);
381     # also recompute $psim, $pid, $q5, $q3, $h5, $h3
382     # because psim/pid filter may have discarded some HSPs
383     #my ($sumpid, $sumpsim, $sumlen, $minq, $maxq, $minh, $maxh);
384     foreach my $d (@grpdata) {
385     #$d=[$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $numids, $numsims, $numbases]
386     # 0 1 2 3 4 5 6 7 8 9 10
387     #print STDERR "seg $$d[0]-$$d[1] : $$d[2]-$$d[3]\n";
388     next unless ($$d[4]>=$minpid && $$d[5]>=$minpsim);
389     $grpids += $$d[8];
390     $grpsims += $$d[9];
391     $grplen += $$d[10];
392     my ($seg_start, $seg_end)=($$d[1]<$$d[0])?($$d[1],$$d[0]) :($$d[0],$$d[1]);
393     my $seglen=$seg_end-$seg_start+1;
394     #$sumpid+=$seglen*$$d[4];
395     #$sumpsim+=$seglen*$$d[5];
396     #$sumlen+=$seglen;
397     #$minq=$seg_start if ($minq==0 || $minq>$seg_start);
398     #$maxq=$seg_end if ($maxq==0 || $maxq<$seg_end);
399    
400     push(@q5,$$d[0]);push(@q3,$$d[1]);
401     push(@h5,$$d[2]);push(@h3,$$d[3]);
402     if ($$d[2]>$$d[3]) {
403     #$minh=$$d[3] if ($minh==0 || $minh>$$d[3]);
404     #$maxh=$$d[2] if ($maxh==0 || $maxh<$$d[2]);
405     push(@hitsegs,[$$d[3],$$d[2]] )
406     }
407     else {
408     #$minh=$$d[3] if ($minh==0 || $minh>$$d[3]);
409     #$maxh=$$d[2] if ($maxh==0 || $maxh<$$d[2]);
410     push(@hitsegs,[$$d[2],$$d[3]]);
411     }
412     push(@details, "$$d[0]-$$d[1]\:$$d[2]-$$d[3]|$$d[5]");
413     $score+=$$d[6];
414     $frm=$$d[7] unless $frm;
415     $frame='*' if ($$d[7] ne $frm);
416     } #for each HSP
417     #if ($sumlen>0) {
418     # $psim=$sumpsim/$sumlen;
419     # $pid=$sumpid/$sumlen;
420     # }
421     goto GO_OUT if ($grplen==0);
422     $pid=sprintf('%d',$grpids*100/$grplen);
423     $psim=sprintf('%d',$grpsims*100/$grplen);
424    
425     goto GO_OUT if ($psim<$minpsim || $pid <$minpid);
426    
427     $hitcov=&mergeSegs(\@hitsegs);
428    
429     #@q5 = sort { $main::a <=> $main::b } @q5;
430     #@q3 = sort { $main::a <=> $main::b } @q3;
431     #@h5 = sort { $main::a <=> $main::b } @h5;
432     #@h3 = sort { $main::a <=> $main::b } @h3;
433     #determine actual (non-overlapping) coverage of the hit
434     ($q5, $q3, $h5, $h3)=($q5[0], $q3[-1], $h5[0], $h3[-1]);
435     if ($strand eq 'Minus' && ($q5[0]>$q3[0])) { # q5>q3 unless protein search
436     ($q5, $q3, $h5, $h3)=($q5[-1], $q3[0], $h5[0], $h3[-1]);
437     @details=reverse(@details);
438     }
439    
440     &$writeline("$hitcov\t".join('~',@details));
441     #-- clear
442     GO_OUT:
443     @grpdata=();
444     undef($curgrp);
445     #$grplen=0;$grpids=0;$grpsims=0;$grphspcount=0;
446     }
447    
448     sub mergeSegs {
449     my $a=shift; # ref to list of [h5, h3]
450     my $hcov=0;
451     OVLCHECK:
452     for (my $i=0;$i<(@$a-1); $i++) {
453     for (my $j=$i+1;$j<@$a; $j++) {
454     my ($li, $ri, $lj, $rj)=
455     ($$a[$i]->[0], $$a[$i]->[1], $$a[$j]->[0], $$a[$j]->[1]);
456     if ($lj<=$ri && $lj>=$li) {
457     $$a[$i]->[0]=$li;
458     $$a[$i]->[1]=($rj>$ri) ? $rj : $ri;
459     splice(@$a, $j,1);
460     goto OVLCHECK;
461     }
462     elsif ($li<=$rj && $li>=$lj) {
463     $$a[$i]->[0]=$lj;
464     $$a[$i]->[1]=($rj>$ri) ? $rj : $ri;
465     splice(@$a, $j,1);
466     goto OVLCHECK;
467     }
468     } # for j
469     } # for i
470     foreach my $d (@$a) {
471     $hcov+=($$d[1]-$$d[0]+1);
472     }
473     return $hcov;
474     }

Properties

Name Value
svn:executable *