[ViewVC] Annotation of: gclib/scripts/blast2btab

#!/usr/bin/perl

#some filtering can be performed on the output
#
# this should be rewritten in C/C++ to be faster

use strict;

my $usage = q{
blast2btab [HITLEN=nn[%]] [OVHANG=nn[%]] [PID=nn] [PSIM=nn]
 [SCORE=nnnn | E=nnnnn]

Format and filter wu-blast2 output coming at stdin. The output consists of 
the following tab delimited fields:
0        1      2         3      4       5        
q_name, date, q_len, method, db_file, hit_name, 
6        7      8         9      10     11        12
q_n5, q_n3,   hit_n5, hit_n3,   %id,   %sim,    score,
13             14        15             16  
file_pos1, file_pos2, hit_description, Frame, 
 17            18        19      [      20        21   ]
strand,    hit_len,    E-value   [ hit-coverage  HSPs  ]

Filtering:
   PID    for minimum percent identity
   PSIM   for minimum percent similarity
   SCORE  for minimum alignment score   
   E      for maximum E value (Blast E parameter)
   HITLEN for minimum overlap length
   OVHANG for maximum overhang length (mismatched overhangs)

   If '%' is specified for HITLEN or OVHANG values, the actual cutoff
   lengths will be computed as percentages of the query sequence length;

If -topcomboX option is used, each line will represent a group, with the individual
segment hits in the 21th field, and the actual hit coverage length as the 20th field
};

die($usage) if ($ARGV[0]=~/\-h|\-?\-help/i);
my $_bignum=9999999;
my %a=('HITLEN'=>0, 
      'OVHANG'=>$_bignum, 
      'PID'=>0,
      'PSIM'=>0,
      'SCORE'=>0,
      'E'=>$_bignum,
      'DEFLINE'=>'none',
      'PARSE'=>0
      );
# try to parse filtering options, if any;
my ($is_lenperc, $is_ovperc)=(0,0);
my $is_filtered=0;
my $bytes_read=0;
foreach (@ARGV) {
  my @param=split(/=/);
  $is_filtered=1 if ($param[0] ne 'DEFLINE');
  die $usage if (@param<2 || not exists($a{$param[0]}));
  if ($param[0] eq 'HITLEN') {
     my $p=rindex($param[1],'%');
     if ($p>0) {
       $is_lenperc=1;
       $param[1]=substr($param[1],0,$p);
       }
     }
   elsif ($param[0] eq 'OVHANG') {
     my $p=rindex($param[1],'%');
     if ($p>=0) {
       $is_ovperc=1;
       $param[1]=substr($param[1],0,$p);
       }
     }
  $a{$param[0]}=$param[1];  
  }
my $defln=lc($a{'DEFLINE'});
if ($defln eq 'none') {
   $defln=0;
   }
  elsif ($defln eq 'q') {
   $defln=1; 
   }
  elsif ($defln eq 'h') {
   $defln=2;
   }
  elsif ($defln eq 'b') {
   $defln=3;
   }
  else {
    die "$usage. Error at DEFLINE option.";
   }

$/ = "\n";
#$\ = "\n";
#$, = "\t";
my ($minhlen, $maxoh, $doparse, $minpid, $minpsim, $minscore, $maxE)=(
       $a{'HITLEN'},
       $a{'OVHANG'},
       $a{'PARSE'},
       $a{'PID'},
       $a{'PSIM'},
       $a{'SCORE'},
       $a{'E'});
$maxE='1'.$maxE if lc(substr($maxE,0,1)) eq 'e';

my $writeline= ($is_filtered) ? \&write_flt : \&write_noflt;
my $curgrp; #current group #
my @grpdata; #when groups are defined, this accumulates HSP data
# list of [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame]
#my ($grphspcount, $grplen, $grpids, $grpsims); 
#segments expected, segments collected already, 
#  total bases, total identities, total positives

#only one file is accepted here...
my ($query, $qlen, $q5, $q3, $hit, $hitlen, $h5, $h3, $ids, $sims, $bcount, $pid, $psim, 
    $score, $evalue, $strand, $frame, $hdefline, $searchdb, $fpos, $fend, $qdefline);
my $flush=0;
$fpos=-1;
$fend=-1;
my $method='prog';
$searchdb='n/a';
my @lcdate=localtime();
my $date=($lcdate[4]+1).'/'.$lcdate[3].'/'.(1900+$lcdate[5]);
$frame='*';
BIGLOOP:
while (<STDIN>) {
  die "Line $. doesn't appear to be a blast output:\n$_\n" unless (/^(T?BLAST[NXP]?)/);
  NEXT_QRYBLOCK:
   $bytes_read+=length($_);
   next if /^\s*$/;
  
  goto QRYFOUND if m/^Query=\s*(\S+)\s*(.*)/;
  
  $method=lc($1);
  do {
   $_=<STDIN>; $bytes_read+=length($_);

   last BIGLOOP unless $_;
   } until m/^Query=\s*(\S+)\s*(.*)/;
  QRYFOUND: 
  &write_group() if $query && $curgrp;
  $query=$1;
  $qdefline='';
  if ($defln==1 || $defln==3) {
     $qdefline=$2;
     #print STDERR "qdefline set to '$qdefline'\n";
     $qdefline =~ tr/\n//d;
     }
  my $linepos=$.;
  do {
   $_=<STDIN>;   $bytes_read+=length($_);
   last BIGLOOP unless $_;
   &err_parse("Cannot find 'Length' line for Query $query") 
       if ($.-$linepos) > 10000;
   } until /\s+\(([\d\,]+) letters/;
  
  $qlen=$1;
  $qlen =~ tr/,//d;
  # parse all matches for this $query:
  $_=<STDIN>; $bytes_read+=length($_);
  my $was_param;
  while ($_ && !(($was_param=m/^Parameters:/) || m/^EXIT CODE/)) {
     chomp;
     if (m/^Database:\s+(\S+)/) {
        $searchdb=$1;
        ($searchdb)=($searchdb =~ m/([^\/]+)$/);
        }
     # Hit section:   
     if (/^>(\S+)\s*(.*)/) {
       #HSPs here
       $hit=$1;
       #print STDERR "hitline is: $_\n";
       #if ($defln>=2) {
        $hdefline=$2;
        #print STDERR "hdefline set to '$hdefline'\n";
       #    }
     NEXT_SUBJ:
       my $savepos;
       #&err_parse("Did you use '-noseqs' option for this blast search?")
       #       if ($hit && $score && !$h3);
       $savepos=$.;
       while (<STDIN>) {
         $bytes_read+=length($_);
         &err_parse("Cannot find 'Length' line for hit $hit") 
            if ($.-$savepos) > 10000;
         if (/^\s+Length\s+=\s+([\d\,]+)/) {   
           $hitlen=$1; $hitlen =~ tr/,//d;
           last;
           }
         chomp;
         s/^\s+//;
         $hdefline.=' '.$_ if $_;
         }
       $h3=0;$score=0;$strand='+';
       $_=<STDIN>;$bytes_read+=length($_);
       $savepos=$.+80;
       my $withseqs=0;
       while ($_) { 
            #search of all HSPs until Subject match or EOF is encountered
            #print "&&&&&& $query: found hit $hit, score=$score\n";
            if (/^>(\S+)\s*(.*)/) {
                #print "prevscore= $score, prevhit=$hit, prevcoords=$h3\n";
                #&err_parse("-- coordinates not found for hit $hit. Please be sure blastn is given -noseqs option!") 
                #   if ($hit && $score && !$h3);
                &write_group() if $curgrp;
                chomp;
                $hit=$1;$hdefline=$2;
                goto NEXT_SUBJ;
                }
            goto SKIP_EMPTYLN if ($_ eq "\n");
            if (($was_param=m/^Parameters:/) || m/^EXIT CODE/) {
              &write_group() if $curgrp;
              goto NEXT_QUERY;
              }
            if (/^\s*Score[\s=]+(\d+)/) {
               $score=$1;
               $fpos=$bytes_read-length($_);
               ($evalue)=(m/\,\s*Expect\s*=\s*([\de\.\-]+)/);
               #print("got e-value $evalue for hit $hit \n");
               my ($group)=(m/Group\s+=\s+(\d+)/);
               &write_group() if ($curgrp && $curgrp!=$group);
               $curgrp=$group;
               $savepos=$.+80;
               }
             elsif (/^\s*Identities\s+=\s+(\d+)\/(\d+)\s+\(([\d\.]+)\%\)/) {
              ($ids, $bcount, $pid)=($1,$2,$3);
              $psim=$pid;
              if (/Positives\s+=\s+(\d+)\/\d+\s+\(([\d\.]+)\%\)/) {
               ($sims, $psim)=($1, $2)
               }
              $strand=/Minus/?'Minus':'Plus';
              if (/Frame\s+\=\s+([\+\-\d]+)/) {
                $frame=$1;
                $strand=(substr($frame,0,1) eq '-') ? 'Minus':'Plus';
                }
              }
             elsif (m/^Query:\s+/) { #parse query coordinates
               if (m/^Query:\s*(\d+)[\s]+\-\s+(\d+)/) { #noseqs case
                        ($q5,$q3)=($1,$2);
                  $_=<STDIN>;$bytes_read+=length($_);
                  if (m/^Sbjct:\s*(\d+)\s+\-\s+(\d+)/) { #noseqs case
                     ($h5,$h3)=($1,$2);
                     $fend=$bytes_read;
                     if ($curgrp) {
                          push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]);
                          #$grphspcount++;
                          #$grpids+=$ids;
                          #$grpsims+=$sims;
                          #$grplen+=$bcount;
                        }
                       else {
                        &$writeline;
                        }
                     }
                   else {
                     &err_parse("Cannot find Sbjct: line for [noseqs] output (query=$query, subj=$hit)");
                     }
                  }
               else { #read/parse the whole alignment to get the start-end coordinates
                      #for both query and sequence
                 if  (m/^Query:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
                    ($q5,$q3)=($1,$2);
                    }
                  else {
                    &err_parse("Invalid start of Query: alignment line (query=$query, subj=$hit)");
                    }
                 $withseqs=1;
                 $_=<STDIN>;$bytes_read+=length($_);
                 $_=<STDIN>;$bytes_read+=length($_);
                 if (m/^Sbjct:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
                    ($h5, $h3) = ($1, $2);
                    }
                   else {
                    &err_parse("Cannot find start Sbjct: line for alignment (query=$query, subj=$hit)");
                    }
                 $_=<STDIN>; $bytes_read+=length($_);#skip empty line
                   #$_=<STDIN>;
                   #print STDERR "Alignment continuation is:\n$_\n\n";
                   #exit(0);
                 while (defined($_=<STDIN>) && m/^Query:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
                   $bytes_read+=length($_);
                   #print STDERR "found alignment continuation\n";
                   $q3=$1;
                   $_=<STDIN>;$bytes_read+=length($_);
                   $_=<STDIN>;$bytes_read+=length($_);
                   if (m/^Sbjct:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
                     $h3=$1;
                     }
                    else {
                      &err_parse("Cannot find end of Sbjct: line for alignment (query=$query, subj=$hit)");
                      }
                   $fend = $bytes_read;
                   $_=<STDIN>;$bytes_read+=length($_);
                   }
                  $fend=$bytes_read;
                  if ($curgrp) {
                      push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]);
                      #$grphspcount++;
                      #$grpids+=$ids;
                      #$grpsims+=$sims;
                      #$grplen+=$bcount;
                     }
                    else {
                     &$writeline;
                     } 
                  next; #skip to next HSP, as we might have read it
                 } # alignment parsing 
              }
         SKIP_EMPTYLN:
            $_=<STDIN>;$bytes_read+=length($_);
            &err_parse("--Timeout trying to find all HSPs for query $query and subj $hit")
              if ($.>$savepos+5000);
            }
       &err_parse("--Cannot get hit data for $hit")
                      if ($score==0 || $h3==0);
             
       } #--Subject entry
      $_=<STDIN>;$bytes_read+=length($_);
      }
  NEXT_QUERY:  
    do {
      $_=<STDIN>;$bytes_read+=length($_);
      last if eof(STDIN);
      #} until /^\x0C/; # record separator for multi-fasta wu-blast!
      } until (/^Query=/);
    goto NEXT_QRYBLOCK;  
  }

exit(0);

#==========================================
#to speed it up, use the variables directly from main program 
#(not very elegant, but more efficient)

sub err_parse {
 die "Parsing Error at line $.:\n$_\n$_[0]\n";
}

sub write_noflt {
  $hdefline =~ s/^>/\x01/;
  $hdefline =~ s/ >([\w\|]{6,})/\x01$1/g;
  my $xtra=$_[0]?"\t$_[0]":"";
  print join("\t",
      # 0       1      2       3         4        5     6    7   8     9    10    11
     ($query, $date, $qlen, $method, $searchdb, $hit, $q5, $q3, $h5, $h3, $pid, $psim, 
          #  12     13      14     15        16      17        18      19
          $score, $fpos, $fend, $hdefline, $frame, $strand, $hitlen, $evalue))."$xtra\n";
  }; #clean write
  
sub write_flt { 
  #my ($minhlen, $maxoh, $minpid, $minsc, $maxE)
  my ($minmatch, $maxovhang)=($is_lenperc?(($minhlen*$qlen)/100):$minhlen,
                        $is_ovperc?(($maxoh*$qlen)/100):$maxoh);
                        
  my ($q_match,    $q_beg_over, $q_end_over)=($q5 < $q3)?
     (($q3 - $q5) + 1, $q5 - 1, $qlen - $q3) :
     (($q5 - $q3) + 1, $qlen - $q5, $q3 - 1);
  my ($h_match, $h_beg_over, $h_end_over)= ($h5 < $h3)?
     ( ($h3 - $h5) + 1, $h5 - 1, $hitlen - $h3) :
     ( ($h5 - $h3) + 1, $hitlen - $h5, $h3 - 1);
  &write_noflt($_[0])
       if (($query ne $hit) &&
           (($h_match >=$minmatch) || ($q_match >= $minmatch))
           && (($q_beg_over <= $maxovhang) || ($h_beg_over <= $maxovhang))
           && (($q_end_over <= $maxovhang) || ($h_end_over <= $maxovhang))
           && ($pid>=$minpid)
           && ($psim>=$minpsim)
           && ($score>=$minscore)
           && ($evalue<=$maxE))
  }

sub write_group {
 die "Error: write_group called with empty \@grpdata\n"
   if (@grpdata==0);
 my (@q5, @q3, @h5, @h3);
 @grpdata = sort { $main::a->[0] <=> $main::b->[0] } @grpdata;
 my @details;
 $score=0;
 my $hitcov; #non redundant hit coverage --
 my @hitsegs; # non-overlapping hit segments
 my $frm;
 my ($grplen, $grpids, $grpsims)=(0,0,0);
 # also recompute $psim, $pid, $q5, $q3, $h5, $h3 
 # because psim/pid filter may have discarded some HSPs
 #my ($sumpid, $sumpsim, $sumlen, $minq, $maxq, $minh, $maxh);
 foreach my $d (@grpdata) {
  #$d=[$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $numids, $numsims, $numbases]
  #     0     1   2    3     4     5        6      7        8       9         10
  #print STDERR "seg $$d[0]-$$d[1] : $$d[2]-$$d[3]\n";
  next unless ($$d[4]>=$minpid && $$d[5]>=$minpsim);
  $grpids += $$d[8];
  $grpsims += $$d[9];
  $grplen += $$d[10];
  my ($seg_start, $seg_end)=($$d[1]<$$d[0])?($$d[1],$$d[0]) :($$d[0],$$d[1]);
  my $seglen=$seg_end-$seg_start+1;
  #$sumpid+=$seglen*$$d[4];
  #$sumpsim+=$seglen*$$d[5];
  #$sumlen+=$seglen;
  #$minq=$seg_start if ($minq==0 || $minq>$seg_start);
  #$maxq=$seg_end if ($maxq==0 || $maxq<$seg_end);

  push(@q5,$$d[0]);push(@q3,$$d[1]);
  push(@h5,$$d[2]);push(@h3,$$d[3]); 
  if ($$d[2]>$$d[3]) {
     #$minh=$$d[3] if ($minh==0 || $minh>$$d[3]);
     #$maxh=$$d[2] if ($maxh==0 || $maxh<$$d[2]);
     push(@hitsegs,[$$d[3],$$d[2]] )
     }
    else {
     #$minh=$$d[3] if ($minh==0 || $minh>$$d[3]);
     #$maxh=$$d[2] if ($maxh==0 || $maxh<$$d[2]);
     push(@hitsegs,[$$d[2],$$d[3]]);
     }
  push(@details, "$$d[0]-$$d[1]\:$$d[2]-$$d[3]|$$d[5]");
  $score+=$$d[6];
  $frm=$$d[7] unless $frm;
  $frame='*' if ($$d[7] ne $frm);
  } #for each HSP
 #if ($sumlen>0) {
 # $psim=$sumpsim/$sumlen;
 # $pid=$sumpid/$sumlen;
 # }
 goto GO_OUT if ($grplen==0);
 $pid=sprintf('%d',$grpids*100/$grplen);
 $psim=sprintf('%d',$grpsims*100/$grplen);

 goto GO_OUT if ($psim<$minpsim || $pid <$minpid);
 
 $hitcov=&mergeSegs(\@hitsegs);
 
 #@q5 = sort { $main::a <=> $main::b } @q5;
 #@q3 = sort { $main::a <=> $main::b } @q3;
 #@h5 = sort { $main::a <=> $main::b } @h5;
 #@h3 = sort { $main::a <=> $main::b } @h3;
 #determine actual (non-overlapping) coverage of the hit
 ($q5, $q3, $h5, $h3)=($q5[0], $q3[-1], $h5[0], $h3[-1]);
 if ($strand eq 'Minus' && ($q5[0]>$q3[0])) { # q5>q3 unless protein search
   ($q5, $q3, $h5, $h3)=($q5[-1], $q3[0], $h5[0], $h3[-1]);
   @details=reverse(@details);
   }
   
 &$writeline("$hitcov\t".join('~',@details));
 #-- clear
GO_OUT:
 @grpdata=();
 undef($curgrp);
 #$grplen=0;$grpids=0;$grpsims=0;$grphspcount=0;
}

sub mergeSegs {
 my $a=shift; # ref to list of [h5, h3]
 my $hcov=0;
 OVLCHECK:
 for (my $i=0;$i<(@$a-1); $i++) {
  for (my $j=$i+1;$j<@$a; $j++) {
    my ($li, $ri, $lj, $rj)=
        ($$a[$i]->[0], $$a[$i]->[1], $$a[$j]->[0], $$a[$j]->[1]);
    if ($lj<=$ri && $lj>=$li) {
      $$a[$i]->[0]=$li;
      $$a[$i]->[1]=($rj>$ri) ? $rj : $ri;
      splice(@$a, $j,1);
      goto OVLCHECK;
      }
    elsif ($li<=$rj && $li>=$lj) {
      $$a[$i]->[0]=$lj;
      $$a[$i]->[1]=($rj>$ri) ? $rj : $ri;
      splice(@$a, $j,1);
      goto OVLCHECK;
      }
    } # for j
  } # for i
 foreach my $d (@$a) {
  $hcov+=($$d[1]-$$d[0]+1);
  }
 return $hcov;
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 6 months ago) by gpertea
File size:	16146 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2
3	#some filtering can be performed on the output
4	#
5	# this should be rewritten in C/C++ to be faster
6
7	use strict;
8
9	my $usage = q{
10	blast2btab [HITLEN=nn[%]] [OVHANG=nn[%]] [PID=nn] [PSIM=nn]
11	[SCORE=nnnn \| E=nnnnn]
12
13	Format and filter wu-blast2 output coming at stdin. The output consists of
14	the following tab delimited fields:
15	0 1 2 3 4 5
16	q_name, date, q_len, method, db_file, hit_name,
17	6 7 8 9 10 11 12
18	q_n5, q_n3, hit_n5, hit_n3, %id, %sim, score,
19	13 14 15 16
20	file_pos1, file_pos2, hit_description, Frame,
21	17 18 19 [ 20 21 ]
22	strand, hit_len, E-value [ hit-coverage HSPs ]
23
24	Filtering:
25	PID for minimum percent identity
26	PSIM for minimum percent similarity
27	SCORE for minimum alignment score
28	E for maximum E value (Blast E parameter)
29	HITLEN for minimum overlap length
30	OVHANG for maximum overhang length (mismatched overhangs)
31
32	If '%' is specified for HITLEN or OVHANG values, the actual cutoff
33	lengths will be computed as percentages of the query sequence length;
34
35	If -topcomboX option is used, each line will represent a group, with the individual
36	segment hits in the 21th field, and the actual hit coverage length as the 20th field
37	};
38
39	die($usage) if ($ARGV[0]=~/\-h\|\-?\-help/i);
40	my $_bignum=9999999;
41	my %a=('HITLEN'=>0,
42	'OVHANG'=>$_bignum,
43	'PID'=>0,
44	'PSIM'=>0,
45	'SCORE'=>0,
46	'E'=>$_bignum,
47	'DEFLINE'=>'none',
48	'PARSE'=>0
49	);
50	# try to parse filtering options, if any;
51	my ($is_lenperc, $is_ovperc)=(0,0);
52	my $is_filtered=0;
53	my $bytes_read=0;
54	foreach (@ARGV) {
55	my @param=split(/=/);
56	$is_filtered=1 if ($param[0] ne 'DEFLINE');
57	die $usage if (@param<2 \|\| not exists($a{$param[0]}));
58	if ($param[0] eq 'HITLEN') {
59	my $p=rindex($param[1],'%');
60	if ($p>0) {
61	$is_lenperc=1;
62	$param[1]=substr($param[1],0,$p);
63	}
64	}
65	elsif ($param[0] eq 'OVHANG') {
66	my $p=rindex($param[1],'%');
67	if ($p>=0) {
68	$is_ovperc=1;
69	$param[1]=substr($param[1],0,$p);
70	}
71	}
72	$a{$param[0]}=$param[1];
73	}
74	my $defln=lc($a{'DEFLINE'});
75	if ($defln eq 'none') {
76	$defln=0;
77	}
78	elsif ($defln eq 'q') {
79	$defln=1;
80	}
81	elsif ($defln eq 'h') {
82	$defln=2;
83	}
84	elsif ($defln eq 'b') {
85	$defln=3;
86	}
87	else {
88	die "$usage. Error at DEFLINE option.";
89	}
90
91	$/ = "\n";
92	#$\ = "\n";
93	#$, = "\t";
94	my ($minhlen, $maxoh, $doparse, $minpid, $minpsim, $minscore, $maxE)=(
95	$a{'HITLEN'},
96	$a{'OVHANG'},
97	$a{'PARSE'},
98	$a{'PID'},
99	$a{'PSIM'},
100	$a{'SCORE'},
101	$a{'E'});
102	$maxE='1'.$maxE if lc(substr($maxE,0,1)) eq 'e';
103
104	my $writeline= ($is_filtered) ? \&write_flt : \&write_noflt;
105	my $curgrp; #current group #
106	my @grpdata; #when groups are defined, this accumulates HSP data
107	# list of [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame]
108	#my ($grphspcount, $grplen, $grpids, $grpsims);
109	#segments expected, segments collected already,
110	# total bases, total identities, total positives
111
112	#only one file is accepted here...
113	my ($query, $qlen, $q5, $q3, $hit, $hitlen, $h5, $h3, $ids, $sims, $bcount, $pid, $psim,
114	$score, $evalue, $strand, $frame, $hdefline, $searchdb, $fpos, $fend, $qdefline);
115	my $flush=0;
116	$fpos=-1;
117	$fend=-1;
118	my $method='prog';
119	$searchdb='n/a';
120	my @lcdate=localtime();
121	my $date=($lcdate[4]+1).'/'.$lcdate[3].'/'.(1900+$lcdate[5]);
122	$frame='*';
123	BIGLOOP:
124	while (<STDIN>) {
125	die "Line $. doesn't appear to be a blast output:\n$_\n" unless (/^(T?BLAST[NXP]?)/);
126	NEXT_QRYBLOCK:
127	$bytes_read+=length($_);
128	next if /^\s*$/;
129
130	goto QRYFOUND if m/^Query=\s(\S+)\s(.*)/;
131
132	$method=lc($1);
133	do {
134	$_=<STDIN>; $bytes_read+=length($_);
135
136	last BIGLOOP unless $_;
137	} until m/^Query=\s(\S+)\s(.*)/;
138	QRYFOUND:
139	&write_group() if $query && $curgrp;
140	$query=$1;
141	$qdefline='';
142	if ($defln==1 \|\| $defln==3) {
143	$qdefline=$2;
144	#print STDERR "qdefline set to '$qdefline'\n";
145	$qdefline =~ tr/\n//d;
146	}
147	my $linepos=$.;
148	do {
149	$_=<STDIN>; $bytes_read+=length($_);
150	last BIGLOOP unless $_;
151	&err_parse("Cannot find 'Length' line for Query $query")
152	if ($.-$linepos) > 10000;
153	} until /\s+\(([\d\,]+) letters/;
154
155	$qlen=$1;
156	$qlen =~ tr/,//d;
157	# parse all matches for this $query:
158	$_=<STDIN>; $bytes_read+=length($_);
159	my $was_param;
160	while ($_ && !(($was_param=m/^Parameters:/) \|\| m/^EXIT CODE/)) {
161	chomp;
162	if (m/^Database:\s+(\S+)/) {
163	$searchdb=$1;
164	($searchdb)=($searchdb =~ m/([^\/]+)$/);
165	}
166	# Hit section:
167	if (/^>(\S+)\s(.)/) {
168	#HSPs here
169	$hit=$1;
170	#print STDERR "hitline is: $_\n";
171	#if ($defln>=2) {
172	$hdefline=$2;
173	#print STDERR "hdefline set to '$hdefline'\n";
174	# }
175	NEXT_SUBJ:
176	my $savepos;
177	#&err_parse("Did you use '-noseqs' option for this blast search?")
178	# if ($hit && $score && !$h3);
179	$savepos=$.;
180	while (<STDIN>) {
181	$bytes_read+=length($_);
182	&err_parse("Cannot find 'Length' line for hit $hit")
183	if ($.-$savepos) > 10000;
184	if (/^\s+Length\s+=\s+([\d\,]+)/) {
185	$hitlen=$1; $hitlen =~ tr/,//d;
186	last;
187	}
188	chomp;
189	s/^\s+//;
190	$hdefline.=' '.$_ if $_;
191	}
192	$h3=0;$score=0;$strand='+';
193	$_=<STDIN>;$bytes_read+=length($_);
194	$savepos=$.+80;
195	my $withseqs=0;
196	while ($_) {
197	#search of all HSPs until Subject match or EOF is encountered
198	#print "&&&&&& $query: found hit $hit, score=$score\n";
199	if (/^>(\S+)\s(.)/) {
200	#print "prevscore= $score, prevhit=$hit, prevcoords=$h3\n";
201	#&err_parse("-- coordinates not found for hit $hit. Please be sure blastn is given -noseqs option!")
202	# if ($hit && $score && !$h3);
203	&write_group() if $curgrp;
204	chomp;
205	$hit=$1;$hdefline=$2;
206	goto NEXT_SUBJ;
207	}
208	goto SKIP_EMPTYLN if ($_ eq "\n");
209	if (($was_param=m/^Parameters:/) \|\| m/^EXIT CODE/) {
210	&write_group() if $curgrp;
211	goto NEXT_QUERY;
212	}
213	if (/^\s*Score[\s=]+(\d+)/) {
214	$score=$1;
215	$fpos=$bytes_read-length($_);
216	($evalue)=(m/\,\sExpect\s=\s*([\de\.\-]+)/);
217	#print("got e-value $evalue for hit $hit \n");
218	my ($group)=(m/Group\s+=\s+(\d+)/);
219	&write_group() if ($curgrp && $curgrp!=$group);
220	$curgrp=$group;
221	$savepos=$.+80;
222	}
223	elsif (/^\s*Identities\s+=\s+(\d+)\/(\d+)\s+\(([\d\.]+)\%\)/) {
224	($ids, $bcount, $pid)=($1,$2,$3);
225	$psim=$pid;
226	if (/Positives\s+=\s+(\d+)\/\d+\s+\(([\d\.]+)\%\)/) {
227	($sims, $psim)=($1, $2)
228	}
229	$strand=/Minus/?'Minus':'Plus';
230	if (/Frame\s+\=\s+([\+\-\d]+)/) {
231	$frame=$1;
232	$strand=(substr($frame,0,1) eq '-') ? 'Minus':'Plus';
233	}
234	}
235	elsif (m/^Query:\s+/) { #parse query coordinates
236	if (m/^Query:\s*(\d+)[\s]+\-\s+(\d+)/) { #noseqs case
237	($q5,$q3)=($1,$2);
238	$_=<STDIN>;$bytes_read+=length($_);
239	if (m/^Sbjct:\s*(\d+)\s+\-\s+(\d+)/) { #noseqs case
240	($h5,$h3)=($1,$2);
241	$fend=$bytes_read;
242	if ($curgrp) {
243	push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]);
244	#$grphspcount++;
245	#$grpids+=$ids;
246	#$grpsims+=$sims;
247	#$grplen+=$bcount;
248	}
249	else {
250	&$writeline;
251	}
252	}
253	else {
254	&err_parse("Cannot find Sbjct: line for [noseqs] output (query=$query, subj=$hit)");
255	}
256	}
257	else { #read/parse the whole alignment to get the start-end coordinates
258	#for both query and sequence
259	if (m/^Query:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
260	($q5,$q3)=($1,$2);
261	}
262	else {
263	&err_parse("Invalid start of Query: alignment line (query=$query, subj=$hit)");
264	}
265	$withseqs=1;
266	$_=<STDIN>;$bytes_read+=length($_);
267	$_=<STDIN>;$bytes_read+=length($_);
268	if (m/^Sbjct:\s+(\d+)\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
269	($h5, $h3) = ($1, $2);
270	}
271	else {
272	&err_parse("Cannot find start Sbjct: line for alignment (query=$query, subj=$hit)");
273	}
274	$_=<STDIN>; $bytes_read+=length($_);#skip empty line
275	#$_=<STDIN>;
276	#print STDERR "Alignment continuation is:\n$_\n\n";
277	#exit(0);
278	while (defined($_=<STDIN>) && m/^Query:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
279	$bytes_read+=length($_);
280	#print STDERR "found alignment continuation\n";
281	$q3=$1;
282	$_=<STDIN>;$bytes_read+=length($_);
283	$_=<STDIN>;$bytes_read+=length($_);
284	if (m/^Sbjct:\s+\d+\s+[A-Z,\-,\*]+\s+(\d+)$/i) {
285	$h3=$1;
286	}
287	else {
288	&err_parse("Cannot find end of Sbjct: line for alignment (query=$query, subj=$hit)");
289	}
290	$fend = $bytes_read;
291	$_=<STDIN>;$bytes_read+=length($_);
292	}
293	$fend=$bytes_read;
294	if ($curgrp) {
295	push(@grpdata, [$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $ids, $sims, $bcount]);
296	#$grphspcount++;
297	#$grpids+=$ids;
298	#$grpsims+=$sims;
299	#$grplen+=$bcount;
300	}
301	else {
302	&$writeline;
303	}
304	next; #skip to next HSP, as we might have read it
305	} # alignment parsing
306	}
307	SKIP_EMPTYLN:
308	$_=<STDIN>;$bytes_read+=length($_);
309	&err_parse("--Timeout trying to find all HSPs for query $query and subj $hit")
310	if ($.>$savepos+5000);
311	}
312	&err_parse("--Cannot get hit data for $hit")
313	if ($score==0 \|\| $h3==0);
314
315	} #--Subject entry
316	$_=<STDIN>;$bytes_read+=length($_);
317	}
318	NEXT_QUERY:
319	do {
320	$_=<STDIN>;$bytes_read+=length($_);
321	last if eof(STDIN);
322	#} until /^\x0C/; # record separator for multi-fasta wu-blast!
323	} until (/^Query=/);
324	goto NEXT_QRYBLOCK;
325	}
326
327	exit(0);
328
329	#==========================================
330	#to speed it up, use the variables directly from main program
331	#(not very elegant, but more efficient)
332
333	sub err_parse {
334	die "Parsing Error at line $.:\n$_\n$_[0]\n";
335	}
336
337	sub write_noflt {
338	$hdefline =~ s/^>/\x01/;
339	$hdefline =~ s/ >([\w\\|]{6,})/\x01$1/g;
340	my $xtra=$_[0]?"\t$_[0]":"";
341	print join("\t",
342	# 0 1 2 3 4 5 6 7 8 9 10 11
343	($query, $date, $qlen, $method, $searchdb, $hit, $q5, $q3, $h5, $h3, $pid, $psim,
344	# 12 13 14 15 16 17 18 19
345	$score, $fpos, $fend, $hdefline, $frame, $strand, $hitlen, $evalue))."$xtra\n";
346	}; #clean write
347
348	sub write_flt {
349	#my ($minhlen, $maxoh, $minpid, $minsc, $maxE)
350	my ($minmatch, $maxovhang)=($is_lenperc?(($minhlen*$qlen)/100):$minhlen,
351	$is_ovperc?(($maxoh*$qlen)/100):$maxoh);
352
353	my ($q_match, $q_beg_over, $q_end_over)=($q5 < $q3)?
354	(($q3 - $q5) + 1, $q5 - 1, $qlen - $q3) :
355	(($q5 - $q3) + 1, $qlen - $q5, $q3 - 1);
356	my ($h_match, $h_beg_over, $h_end_over)= ($h5 < $h3)?
357	( ($h3 - $h5) + 1, $h5 - 1, $hitlen - $h3) :
358	( ($h5 - $h3) + 1, $hitlen - $h5, $h3 - 1);
359	&write_noflt($_[0])
360	if (($query ne $hit) &&
361	(($h_match >=$minmatch) \|\| ($q_match >= $minmatch))
362	&& (($q_beg_over <= $maxovhang) \|\| ($h_beg_over <= $maxovhang))
363	&& (($q_end_over <= $maxovhang) \|\| ($h_end_over <= $maxovhang))
364	&& ($pid>=$minpid)
365	&& ($psim>=$minpsim)
366	&& ($score>=$minscore)
367	&& ($evalue<=$maxE))
368	}
369
370	sub write_group {
371	die "Error: write_group called with empty \@grpdata\n"
372	if (@grpdata==0);
373	my (@q5, @q3, @h5, @h3);
374	@grpdata = sort { $main::a->[0] <=> $main::b->[0] } @grpdata;
375	my @details;
376	$score=0;
377	my $hitcov; #non redundant hit coverage --
378	my @hitsegs; # non-overlapping hit segments
379	my $frm;
380	my ($grplen, $grpids, $grpsims)=(0,0,0);
381	# also recompute $psim, $pid, $q5, $q3, $h5, $h3
382	# because psim/pid filter may have discarded some HSPs
383	#my ($sumpid, $sumpsim, $sumlen, $minq, $maxq, $minh, $maxh);
384	foreach my $d (@grpdata) {
385	#$d=[$q5, $q3, $h5, $h3, $pid, $psim, $score, $frame, $numids, $numsims, $numbases]
386	# 0 1 2 3 4 5 6 7 8 9 10
387	#print STDERR "seg $$d[0]-$$d[1] : $$d[2]-$$d[3]\n";
388	next unless ($$d[4]>=$minpid && $$d[5]>=$minpsim);
389	$grpids += $$d[8];
390	$grpsims += $$d[9];
391	$grplen += $$d[10];
392	my ($seg_start, $seg_end)=($$d[1]<$$d[0])?($$d[1],$$d[0]) :($$d[0],$$d[1]);
393	my $seglen=$seg_end-$seg_start+1;
394	#$sumpid+=$seglen*$$d[4];
395	#$sumpsim+=$seglen*$$d[5];
396	#$sumlen+=$seglen;
397	#$minq=$seg_start if ($minq==0 \|\| $minq>$seg_start);
398	#$maxq=$seg_end if ($maxq==0 \|\| $maxq<$seg_end);
399
400	push(@q5,$$d[0]);push(@q3,$$d[1]);
401	push(@h5,$$d[2]);push(@h3,$$d[3]);
402	if ($$d[2]>$$d[3]) {
403	#$minh=$$d[3] if ($minh==0 \|\| $minh>$$d[3]);
404	#$maxh=$$d[2] if ($maxh==0 \|\| $maxh<$$d[2]);
405	push(@hitsegs,[$$d[3],$$d[2]] )
406	}
407	else {
408	#$minh=$$d[3] if ($minh==0 \|\| $minh>$$d[3]);
409	#$maxh=$$d[2] if ($maxh==0 \|\| $maxh<$$d[2]);
410	push(@hitsegs,[$$d[2],$$d[3]]);
411	}
412	push(@details, "$$d[0]-$$d[1]\:$$d[2]-$$d[3]\|$$d[5]");
413	$score+=$$d[6];
414	$frm=$$d[7] unless $frm;
415	$frame='*' if ($$d[7] ne $frm);
416	} #for each HSP
417	#if ($sumlen>0) {
418	# $psim=$sumpsim/$sumlen;
419	# $pid=$sumpid/$sumlen;
420	# }
421	goto GO_OUT if ($grplen==0);
422	$pid=sprintf('%d',$grpids*100/$grplen);
423	$psim=sprintf('%d',$grpsims*100/$grplen);
424
425	goto GO_OUT if ($psim<$minpsim \|\| $pid <$minpid);
426
427	$hitcov=&mergeSegs(\@hitsegs);
428
429	#@q5 = sort { $main::a <=> $main::b } @q5;
430	#@q3 = sort { $main::a <=> $main::b } @q3;
431	#@h5 = sort { $main::a <=> $main::b } @h5;
432	#@h3 = sort { $main::a <=> $main::b } @h3;
433	#determine actual (non-overlapping) coverage of the hit
434	($q5, $q3, $h5, $h3)=($q5[0], $q3[-1], $h5[0], $h3[-1]);
435	if ($strand eq 'Minus' && ($q5[0]>$q3[0])) { # q5>q3 unless protein search
436	($q5, $q3, $h5, $h3)=($q5[-1], $q3[0], $h5[0], $h3[-1]);
437	@details=reverse(@details);
438	}
439
440	&$writeline("$hitcov\t".join('~',@details));
441	#-- clear
442	GO_OUT:
443	@grpdata=();
444	undef($curgrp);
445	#$grplen=0;$grpids=0;$grpsims=0;$grphspcount=0;
446	}
447
448	sub mergeSegs {
449	my $a=shift; # ref to list of [h5, h3]
450	my $hcov=0;
451	OVLCHECK:
452	for (my $i=0;$i<(@$a-1); $i++) {
453	for (my $j=$i+1;$j<@$a; $j++) {
454	my ($li, $ri, $lj, $rj)=
455	($$a[$i]->[0], $$a[$i]->[1], $$a[$j]->[0], $$a[$j]->[1]);
456	if ($lj<=$ri && $lj>=$li) {
457	$$a[$i]->[0]=$li;
458	$$a[$i]->[1]=($rj>$ri) ? $rj : $ri;
459	splice(@$a, $j,1);
460	goto OVLCHECK;
461	}
462	elsif ($li<=$rj && $li>=$lj) {
463	$$a[$i]->[0]=$lj;
464	$$a[$i]->[1]=($rj>$ri) ? $rj : $ri;
465	splice(@$a, $j,1);
466	goto OVLCHECK;
467	}
468	} # for j
469	} # for i
470	foreach my $d (@$a) {
471	$hcov+=($$d[1]-$$d[0]+1);
472	}
473	return $hcov;
474	}