1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use FindBin; |
4 |
umask 0002; |
5 |
|
6 |
chdir($ENV{PVMSX_WD}) if ($ENV{PVMSX_WD}); |
7 |
|
8 |
my ($file, $numpass, $sliceno, $last, $skipped, $total, $userparams)=@ARGV; |
9 |
|
10 |
#$ENV{'PATH'}=$FindBin::Bin.':'.$ENV{'PATH'}; |
11 |
$userparams =~ s/\~\+/ /g; |
12 |
my ($dbfile, $flags, $vec_db, $screen_db, $minvlen, $minpid)=split(/:/, $userparams); |
13 |
my ($dbname)=($dbfile =~ m/([^\/]+)$/); |
14 |
$dbname=$dbfile unless $dbname; |
15 |
|
16 |
#minimum theoretical length of a terminal vector/linker residue |
17 |
$minvlen=11 unless $minvlen; |
18 |
|
19 |
my $cmd; |
20 |
my $wrkfile = $file.'.wrk'; |
21 |
my $repfile=sprintf('%06d_%s.cln', $sliceno, $dbname); |
22 |
|
23 |
#my $outfile= sprintf('%06d_%s.outfasta', $sliceno, $dbname) if ($flags =~ m/O/); |
24 |
my $log_file='log_std'; |
25 |
my $err_file='err_log'; |
26 |
my %seqs; # this is the hash with all sequence data |
27 |
my @seqlist; #this is the list with all the sequences in this file |
28 |
my ($tooshort) = ($flags =~ m/S(\d+)/); |
29 |
my $lowqualcheck = ($flags =~ m/M/); |
30 |
my $minseqlen = $tooshort || 50; |
31 |
open(STDERR, '>>'.$err_file); |
32 |
open(STDOUT, '>>'.$log_file); |
33 |
&writeCLRfile($wrkfile, 1); #create sequence hash and work file (length filtered) |
34 |
my $polyA = ($flags =~ m/A/); |
35 |
my $endN = ($flags =~ m/N/); |
36 |
my $dust = ($flags =~ m/L/); |
37 |
#run trimpoly unless disabled |
38 |
if ($polyA || $endN) { |
39 |
my $trimcmd = ($polyA)? 'trimpoly -R -s10 -r8 -ex -l20 ':'trimpoly -R '; |
40 |
if ($endN) { |
41 |
$trimcmd.=' -n1 -m15 '; |
42 |
$trimcmd.=' -N ' unless $polyA; |
43 |
} |
44 |
open (TRIMPOLY, $trimcmd." < $wrkfile |") || die "Error opening '$trimcmd pipe\n"; |
45 |
while (<TRIMPOLY>) { |
46 |
chomp; |
47 |
my ($seqname, $percN, $sh5, $sh3, $ilen)=split(/\t/); |
48 |
my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n"; |
49 |
$$d[1]+=$sh5; |
50 |
$$d[2]-=$sh3; |
51 |
$$d[4]=$percN; |
52 |
$$d[5]='low_qual' if ($lowqualcheck && $percN>3.00 && $endN); #trash |
53 |
$$d[5]='shortq' if ($tooshort && $$d[2]-$$d[1]+1<$tooshort); |
54 |
$$d[6].="trimpoly[+$sh5, -$sh3]; " if $sh5 || $sh3; |
55 |
} |
56 |
close(TRIMPOLY); |
57 |
writeCLRfile($wrkfile); |
58 |
} #trimpoly part |
59 |
|
60 |
#== trash low complexity (unless disabled) |
61 |
if ($dust) { |
62 |
$cmd="mdust $wrkfile -c |"; |
63 |
open (INCOMING, $cmd) || &MErrExit("Error running $cmd !\n"); |
64 |
my $lastseq; |
65 |
my $lastlen; |
66 |
my $trashcount=0; |
67 |
while (<INCOMING>) { |
68 |
my ($seq_name, $len, $c5,$c3)=split(/\s+/); |
69 |
my $d=$seqs{$seq_name} || die "Error at dust: $seq_name not in hash!\n"; |
70 |
my $rlen = ($seq_name eq $lastseq) ? $lastlen-($c3-$c5) : $len-($c3-$c5); #remaining non-lc |
71 |
if ($rlen<40) { |
72 |
$$d[5]='dust' unless ($$d[5]); |
73 |
$$d[6].='low complexity; '; |
74 |
$trashcount++; |
75 |
} |
76 |
$lastseq=$seq_name; |
77 |
$lastlen=$rlen; |
78 |
} |
79 |
close(INCOMING); |
80 |
writeCLRfile($wrkfile) if $trashcount; |
81 |
} |
82 |
|
83 |
#=== the vector search; assumes the vector database is already |
84 |
# formatted with formatdb |
85 |
|
86 |
my $minperc = ($minpid>0) ? $minpid : 93; |
87 |
|
88 |
foreach my $vdb (split(/,/,$vec_db)) { |
89 |
my $changed; |
90 |
my $noflt; |
91 |
{ local $/='^'; $noflt=chomp($vdb); } |
92 |
die "Error locating contaminant database $vdb\n" unless -e $vdb; |
93 |
my ($fvdb)=($vdb =~ m/([^\/]+)$/); |
94 |
$fvdb=$vdb unless $fvdb; |
95 |
my $tabout=$wrkfile.'.vtab'; |
96 |
$cmd="blastall -p blastn -d $vdb -i $wrkfile "; |
97 |
$cmd.=$noflt?' -F F ':' -F "m D" '; |
98 |
my $blout=$wrkfile.'.blout'; |
99 |
$cmd.=' -q -4 -G 3 -E 3 -v 10000 -b 10000 -e 700 -Y 1.75e10 -m 8 > '.$blout; |
100 |
sysrun($cmd); |
101 |
$cmd= 'grep -v "^#" '.$blout.' | sort -k1,1 -k12,12 -nr '." > $tabout"; |
102 |
sysrun($cmd); |
103 |
|
104 |
if (-s $tabout) { |
105 |
local $_; |
106 |
open (TABOUT, $tabout) || die "Error opening '$tabout' at $vdb search\n"; |
107 |
while (<TABOUT>) { |
108 |
next if m/^\s*#/; |
109 |
my @t=split(/\t/); |
110 |
die "Error: invalid line format in $tabout:\n$_\n" unless @t>=12; |
111 |
my $d=$seqs{$t[0]} || |
112 |
die "Error: query '$t[0]' from $tabout not found in sequence hash!\n"; |
113 |
next if $$d[5]; |
114 |
#distance of hit from either end: |
115 |
my ($d5, $d3)=($t[6]-1, ($$d[8]-$$d[7]+1)-$t[7]); |
116 |
#hit should start within 30% distance of either (original) end |
117 |
my $maxdist=0.30*$$d[3]; |
118 |
my $hitlen=$t[7]-$t[6]; |
119 |
my $mindist=$d5>$d3?$d3:$d5; |
120 |
my $minhitlen=$minvlen+$mindist/2>35 ? 35 : $minvlen+$mindist/2; |
121 |
#validate the hit: the further it is, the longer it should be |
122 |
|
123 |
if ($t[2]>=$minperc && $mindist<$maxdist && $hitlen>=$minhitlen) { |
124 |
my $adjusted; |
125 |
if ($mindist == $d5) { |
126 |
#trim 5' end |
127 |
if ($$d[7]+$t[7]>$$d[1]) { |
128 |
$$d[1]=$$d[7]+$t[7]; #set the new end5 after the 3' end of the vector hit |
129 |
$adjusted=1; |
130 |
} |
131 |
} |
132 |
else { |
133 |
#trim 3' end |
134 |
if ($$d[7]+$t[6]-2<$$d[2]) { |
135 |
$$d[2]=$$d[7]+$t[6]-2; #set the new end3 before the 5' end of the vector hit |
136 |
$adjusted=1; |
137 |
} |
138 |
} |
139 |
if ($adjusted) { |
140 |
$changed=1; |
141 |
if ($$d[2]-$$d[1]<$minseqlen) { |
142 |
$$d[5]=$fvdb; #trash |
143 |
$$d[1]=$$d[2] if $$d[1]>$$d[2]; |
144 |
} |
145 |
$$d[6].=" $t[1]".':<'.($$d[7]+$t[6]-1).'-'.($$d[7]+$t[7]-1).'>;' if $adjusted; |
146 |
} |
147 |
} |
148 |
} |
149 |
close(TABOUT); |
150 |
} |
151 |
unlink($tabout);unlink($blout); |
152 |
writeCLRfile($wrkfile) if $changed; |
153 |
} # vector search loop |
154 |
|
155 |
# -- screening searches: large contaminant hit expected; that is, |
156 |
# at least 60% of the query should get a hit of at least 96% identity |
157 |
$minperc = ($minpid>10) ? $minpid : 96; |
158 |
foreach my $sdb (split(/,/,$screen_db)) { |
159 |
die "Error locating contaminant database $sdb\n" unless -e $sdb; |
160 |
my ($fsdb)=($sdb =~ m/([^\/]+)$/); |
161 |
$fsdb=$sdb unless $fsdb; |
162 |
my $tabout=$wrkfile.'.stab'; |
163 |
my $changed; |
164 |
# -- 2.2.11-13 megablast bug: it IGNORES -JF flag! |
165 |
# at least blastall still works as it should.. |
166 |
# so let's just use mgblast instead here: |
167 |
$cmd="mgblast -d $sdb -i $wrkfile -p$minperc". |
168 |
' -W18 -JF -F "m D" -X30 -D3 | '. |
169 |
' grep -v "^#" | sort -k1,1 -k12,12 -nr '. |
170 |
" > $tabout"; |
171 |
# $minscov=60 unless $minscov; |
172 |
sysrun($cmd); |
173 |
if (-s $tabout) { |
174 |
local $_; |
175 |
open (TABOUT, $tabout) || die "Error opening '$tabout' at $sdb search\n"; |
176 |
while (<TABOUT>) { |
177 |
next if m/^\s*#/; |
178 |
my @t=split(/\t/); |
179 |
die "Error: invalid line format in $tabout:\n$_\n" unless @t>=12; |
180 |
my $d=$seqs{$t[0]} || |
181 |
die "Error: query '$t[0]' from $tabout not found in sequence hash!\n"; |
182 |
next if $$d[5]; |
183 |
#distance of hit from either [original] end: |
184 |
my ($d5, $d3)=($t[6]-1, ($$d[8]-$$d[7]+1)-$t[7]); |
185 |
my $hitlen=$t[7]-$t[6]; |
186 |
#my $minhitlen= (0.75 * ($$d[8]-$$d[7]+1) < $minseqlen) ? $minseqlen: 0.75 * ($$d[8]-$$d[7]+1); |
187 |
my $minhitlen = 60; #at least 60 base pairs overlap! |
188 |
|
189 |
my $mindist=$d5>$d3?$d3:$d5; |
190 |
if ($hitlen>=$minhitlen) { |
191 |
my $adjusted; |
192 |
if ($mindist == $d5) { |
193 |
#trim 5' end |
194 |
if ($$d[7]+$t[7]>$$d[1]) { |
195 |
$$d[1]=$$d[7]+$t[7]; #set the new end5 after the 3' end of the vector hit |
196 |
$adjusted=1; |
197 |
} |
198 |
} |
199 |
else { |
200 |
#trim 3' end |
201 |
if ($$d[7]+$t[6]-2<$$d[2]) { |
202 |
$$d[2]=$$d[7]+$t[6]-2; #set the new end3 before the 5' end of the vector hit |
203 |
$adjusted=1; |
204 |
} |
205 |
} |
206 |
if ($adjusted) { |
207 |
$changed=1; |
208 |
if ($$d[2]-$$d[1]<$minseqlen) { |
209 |
$$d[5]=$fsdb ; #trash |
210 |
$$d[1]=$$d[2] if $$d[1]>$$d[2]; |
211 |
} |
212 |
$$d[6].=" $t[1]".':<'.($$d[7]+$t[6]-1).'-'.($$d[7]+$t[7]-1).'>;' if $adjusted; |
213 |
} |
214 |
} |
215 |
} #while |
216 |
close(TABOUT); |
217 |
} |
218 |
unlink($tabout); |
219 |
writeCLRfile($wrkfile) if $changed; |
220 |
} # larger contaminant screening |
221 |
|
222 |
# run trimpoly mostly for computing the percentage of undetermined bases |
223 |
my $trimcmd = ($polyA)? 'trimpoly -R -s10 -r8 -ex -l20 ':'trimpoly -R '; |
224 |
$trimcmd.= ($endN ? ' -n1 -m15 ' : ' -n90 -m1 '); |
225 |
$trimcmd.=' -N ' unless $polyA; |
226 |
open (TRIMPOLY, $trimcmd." < $wrkfile |") || die "Error opening final '$trimcmd' pipe\n"; |
227 |
while (<TRIMPOLY>) { |
228 |
chomp; |
229 |
my ($seqname, $percN, $sh5, $sh3, $ilen)=split(/\t/); |
230 |
my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n"; |
231 |
$$d[1]+=$sh5; |
232 |
$$d[2]-=$sh3; |
233 |
$$d[4]= $percN; |
234 |
$$d[5]='low_qual' if ($lowqualcheck && $percN>3.00 && $endN); #trash |
235 |
$$d[5]='shortq' if ($tooshort && $$d[2]-$$d[1]+1<$tooshort); |
236 |
$$d[6].="trimpoly[+$sh5, -$sh3]; " if $sh5 || $sh3; |
237 |
} |
238 |
close(TRIMPOLY); |
239 |
|
240 |
#------- write cleaning report file for this slice |
241 |
open(CLNFILE, '>'.$repfile) || die "Error creating file '$repfile'\n"; |
242 |
foreach my $seqname (@seqlist) { |
243 |
my $d=$seqs{$seqname} || die "Error writing cleaning report: sequence $seqname not found!\n"; |
244 |
printf(CLNFILE "%-16s\t%5.2f\t%5s\t%5s\t%5s\t%12s\t%s\n",$seqname, |
245 |
$$d[4],$$d[1],$$d[2],$$d[3],$$d[5], $$d[6]); |
246 |
} |
247 |
close(CLNFILE); |
248 |
|
249 |
#------ cleaning up |
250 |
$file.='*'; |
251 |
unlink(<${file}>); |
252 |
print "<< finished $sliceno\n"; |
253 |
|
254 |
#================== |
255 |
sub sysrun { |
256 |
local $_ = `($_[0]) 2>&1`; |
257 |
my $code=$?; |
258 |
print STDERR $_; |
259 |
my @cores = <core.*>; |
260 |
if ($code && @cores>0) { |
261 |
print STDERR "ERROR: command \n$cmd\n exited with code $code\n"; |
262 |
exit(1); |
263 |
} |
264 |
if ($code || m/error|failure|fatal|segmentation|core/i) { |
265 |
if (($_[0] =~ m/^\s*blastall/i)) { |
266 |
my @olines=split(/\n/); |
267 |
@olines=grep(m/error/i && !m/couldn't uncache/i && !m/No valid letters to be indexed/i, |
268 |
@olines); |
269 |
die "blastall unexpected error at\n$_[0]\n" if (@olines>0); |
270 |
#my @e = (m/(error)/igs); |
271 |
#my @r = (m/(couldn't uncache)/igs); |
272 |
#if (scalar(@e) == scalar(@r)) { |
273 |
# print STDERR "Please ignore (code $code) 'uncache' errors above.\n"; |
274 |
# return; |
275 |
# } |
276 |
#print STDERR "Warning: error code $code with not just uncaching errors at\n$_[0]\n"; |
277 |
} |
278 |
else { |
279 |
die "Error code: $code, message:\n$_) at command:\n$_[0]\n"; |
280 |
} |
281 |
} |
282 |
|
283 |
} |
284 |
#---------------------------------------------------------------------------- |
285 |
#printing fasta sequence (60 chars per line), uppercase |
286 |
# $seq is just a reference to an actual nucleotide string |
287 |
# if e5,e3 are specified and e3>e5 only CLEAR range will be returned |
288 |
#---------------------------------------------------------------------------- |
289 |
sub print_fasta { |
290 |
my ($seq_name, $seq, $e5, $e3) = @_; |
291 |
my $pos=$e5-1; |
292 |
print '>'.$seq_name."\n" if $seq_name; |
293 |
if ($e5 && $e3 && $e3>$e5) { #assumes e3 and e5 make sense |
294 |
while ($e3-$pos>=60) { |
295 |
print uc(substr($$seq,$pos,60))."\n"; |
296 |
$pos+=60; |
297 |
} |
298 |
print uc(substr($$seq,$pos,$e3-$pos))."\n" |
299 |
if ($e3-$pos>0); |
300 |
} |
301 |
else { #no valid clear range provided, print whole sequence |
302 |
my $len=length($$seq); |
303 |
my $pos=0; |
304 |
while ($pos<$len) { |
305 |
print uc(substr($$seq,$pos,60))."\n"; |
306 |
$pos+=60; |
307 |
} |
308 |
} |
309 |
} |
310 |
|
311 |
sub writeCLRfile { |
312 |
my ($outfile, $create)=@_; |
313 |
local *SLICEFILE; |
314 |
local *OUTFILE; |
315 |
open(SLICEFILE, $file) || die "Error opening slice file '$file'\n"; |
316 |
local $/="\n>"; |
317 |
open (OUTFILE, ">$outfile") || die "Error creating work file '$outfile'\n"; |
318 |
my $oldsel=select(OUTFILE); # outfile, only containing clear range, non-trashed sequences |
319 |
while (<SLICEFILE>) { |
320 |
s/^>//; |
321 |
chomp; |
322 |
my ($defline, $seq)=(m/^(.+?)\n(.+)/s); |
323 |
my ($seqname, $seqdescr) = ($defline =~ m/^(\S+)\s*(.*)/); |
324 |
$seq =~ s/\s+//g; |
325 |
my $trash; |
326 |
my ($e5, $e3); |
327 |
if ($create) { #descr, end5, end3, oldlength, percent N, trash code, cleaning comments, f5, f3 |
328 |
#already trash sequences shorter than 100 with code 'i' |
329 |
my $seqlen=length($seq); |
330 |
$trash='short' if ($tooshort && $seqlen<$tooshort); |
331 |
# 0 1 2 3 4 5 6 7 8 |
332 |
$seqs{$seqname}=[$seqdescr, 1, $seqlen, $seqlen, 0 , $trash, '', 1, $seqlen]; |
333 |
push(@seqlist, $seqname); |
334 |
} |
335 |
else { #extract clear range |
336 |
my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n"; |
337 |
($e5,$e3, $trash)=($$d[1], $$d[2], $$d[5]); |
338 |
($$d[7], $$d[8])=($e5, $e3); #coordinates of the sequence written in this working file |
339 |
} |
340 |
&print_fasta($seqname,\$seq, $e5, $e3) unless $trash; |
341 |
} |
342 |
select($oldsel); |
343 |
close OUTFILE; |
344 |
close SLICEFILE; |
345 |
} |