1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use FindBin; |
4 |
|
5 |
umask 0002; |
6 |
#the line below is needed if pvmsx is used |
7 |
# also, the error condition is set only by the presence of $file |
8 |
chdir($ENV{PVMSX_WD}) if ($ENV{PVMSX_WD}); |
9 |
|
10 |
|
11 |
$ENV{'PATH'}=$FindBin::Bin.':'.$ENV{'PATH'}; |
12 |
|
13 |
#so for pvmsx to consider the task was successful, $file must be deleted! |
14 |
#============== |
15 |
# 1 is the name of the fasta sequence input file |
16 |
# 2 is the # of sequences in ${1} should = 1 for this script |
17 |
# 3 is the slice no. being processed by sx |
18 |
# 4 is 0 if not the last file, 1 if the last file |
19 |
# 5 is the # of sequences skipped initially |
20 |
# 6 is the # of sequences to be processed (-1 = ALL) |
21 |
# 7 user parameter |
22 |
# 1 2 3 4 5 6 |
23 |
my ($file, $numpass, $slice_num, $last, $skipped, $total, $userparams)=@ARGV; |
24 |
#user parameters: |
25 |
# 1) database (full path) |
26 |
# 2) PID cutoff (using 94 if not specified) |
27 |
# 3) maximum overhang |
28 |
# 4) minimum overlap |
29 |
# 5) extra flags (optional) |
30 |
# flags: M = no masking, D= separate database (not all-vs-all) |
31 |
# G = show gaps |
32 |
my ($searchdb,$pid,$maxovh,$minovl, $xflags)=split(/:/,$userparams); |
33 |
my $mgblast_res=$file.'.tab'; |
34 |
my $nomasking = $xflags=~/M/; |
35 |
my $gapinfo = $xflags=~/G/; |
36 |
my $otherDb = $xflags=~/D/; |
37 |
|
38 |
$pid=94 unless $pid>50; |
39 |
$minovl=40 unless $minovl; |
40 |
my $log_file='log_std'; |
41 |
my $err_file='err_log'; |
42 |
open(STDERR, '>>'.$err_file); |
43 |
open(STDOUT, '>>'.$log_file); |
44 |
|
45 |
my $toskip=($file =~ m/_\@(\d+)_v\d+\.\d+/) ? $1 : $skipped+$numpass*($slice_num-1); |
46 |
|
47 |
#= mgblast weakness - when some query seqs are entirely masked |
48 |
#= or what's left is just low complexity, it will abort the search |
49 |
#= so let's filter the input in advance |
50 |
unless ($nomasking) { |
51 |
my $renfile=$file.'_cpy'; |
52 |
rename($file, $renfile) || die "Cannot rename '$file' to '$renfile'!\n"; |
53 |
local $/="\n>"; |
54 |
open (INFILE, "mdust -m L < $renfile |") || die "Cannot open renamed file '$renfile'\n"; |
55 |
open (OUTFILE, '>'.$file) || die "Cannot create flt file '$file'\n"; |
56 |
open (MASKED, '>>masked.lst') || die "Cannot open masked.lst for append\n"; |
57 |
open (TOOSHORT, '>>tooshort.lst') || die "Cannot open tooshort.lst for append\n"; |
58 |
|
59 |
my $counter; |
60 |
while(<INFILE>) { |
61 |
s/^>//; |
62 |
chomp; |
63 |
$counter++; |
64 |
my ($defline, $seq)=(m/^(.+?)\n(.+)/s); |
65 |
my $s=$seq; |
66 |
$s =~ tr/\n \t//d; |
67 |
my $seqlen=length($s); |
68 |
if ($seqlen<$minovl) { |
69 |
print TOOSHORT $defline."\n"; |
70 |
next; |
71 |
} |
72 |
my @uc=($s =~ m/([A-M,O-W,Y,Z]+)/g); |
73 |
my $valid; |
74 |
foreach my $l (@uc) { |
75 |
if (length($l)>12) { # a valid stretch of at least 12nt would suffice |
76 |
$valid=1; |
77 |
last; |
78 |
} |
79 |
} |
80 |
if ($valid) { |
81 |
print OUTFILE '>'.$defline."\n".$seq."\n"; |
82 |
} |
83 |
else { |
84 |
print MASKED $defline."\n"; |
85 |
} |
86 |
} #while |
87 |
close(INFILE);close(OUTFILE);close(MASKED);close(TOOSHORT); |
88 |
die "Error: no sequence in this slice!\n" unless $counter; |
89 |
unlink($renfile); |
90 |
} |
91 |
|
92 |
my $cmd="mgblast -i $file -d $searchdb -p $pid -W12 -X12 ". |
93 |
" -JF -v100 -b100 -C$minovl -H$maxovh "; |
94 |
#$cmd.=$gapinfo ? ' -D5 ':' -D4 '; |
95 |
$cmd.= ' -D4 -UF -F"m D" '; #use dust filter |
96 |
#$cmd.= ($nomasking) ? ' -FF -UF ' : ' -UT -F "m D" '; |
97 |
unless ($otherDb) { |
98 |
$cmd.=' -KT '; |
99 |
$cmd.= " -k $toskip " if $toskip>0; |
100 |
} |
101 |
$cmd.=" > $mgblast_res"; |
102 |
my $slno=sprintf("slice:%09d",$slice_num); |
103 |
print STDERR ">>$slno: $cmd\n"; |
104 |
|
105 |
my $errmsg = `($cmd) 2>&1`; |
106 |
print STDERR "<<$slno: done.\n"; |
107 |
|
108 |
if ($? || ($errmsg=~/ERROR/i) || ($errmsg=~/Segmentation/i)) { |
109 |
print STDERR "!Error at:\n$cmd\n"; |
110 |
print STDERR "$errmsg\n"; |
111 |
exit(1); |
112 |
} |
113 |
$cmd="sort -k11,11g -k10,10nr -k9,9nr $mgblast_res | bzip2 -cf > $mgblast_res.bz2"; |
114 |
my $r=system($cmd); |
115 |
if ($r && ($r>>8 != 2) ) { |
116 |
die "Error sorting/compressing the results file '$mgblast_res'\n$cmd\n"; |
117 |
} |
118 |
|
119 |
unlink($file, $mgblast_res); |
120 |
exit 0; |