gclib/scripts/seqclean.psx

#!/usr/bin/perl
use strict;
use FindBin;
umask 0002;

chdir($ENV{PVMSX_WD}) if ($ENV{PVMSX_WD});

my ($file, $numpass, $sliceno, $last, $skipped, $total, $userparams)=@ARGV;

#$ENV{'PATH'}=$FindBin::Bin.':'.$ENV{'PATH'};
$userparams =~ s/\~\+/ /g;
my ($dbfile, $flags, $vec_db, $screen_db, $minvlen, $minpid)=split(/:/, $userparams);
my ($dbname)=($dbfile =~ m/([^\/]+)$/);
$dbname=$dbfile unless $dbname;

#minimum theoretical length of a terminal vector/linker residue
$minvlen=11 unless $minvlen;

my $cmd;
my $wrkfile = $file.'.wrk';
my $repfile=sprintf('%06d_%s.cln', $sliceno, $dbname);

#my $outfile= sprintf('%06d_%s.outfasta', $sliceno, $dbname) if ($flags =~ m/O/);
my $log_file='log_std';
my $err_file='err_log';
my %seqs; # this is the hash with all sequence data
my @seqlist; #this is the list with all the sequences in this file
my ($tooshort) = ($flags =~ m/S(\d+)/);
my $lowqualcheck = ($flags =~ m/M/);
my $minseqlen = $tooshort || 50;
open(STDERR, '>>'.$err_file);
open(STDOUT, '>>'.$log_file);
&writeCLRfile($wrkfile, 1); #create sequence hash and work file (length filtered)
my $polyA = ($flags =~ m/A/);
my $endN = ($flags =~ m/N/);
my $dust = ($flags =~ m/L/);
#run trimpoly unless disabled
if ($polyA || $endN) {
 my $trimcmd = ($polyA)? 'trimpoly -R -s10 -r8 -ex -l20 ':'trimpoly -R ';
 if ($endN) {
   $trimcmd.=' -n1 -m15 ';
   $trimcmd.=' -N ' unless $polyA;
   }
 open (TRIMPOLY, $trimcmd." < $wrkfile |") || die "Error opening '$trimcmd pipe\n";
 while (<TRIMPOLY>) {
   chomp;
   my ($seqname, $percN, $sh5, $sh3, $ilen)=split(/\t/);
   my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n";
   $$d[1]+=$sh5;
   $$d[2]-=$sh3;
   $$d[4]=$percN;
   $$d[5]='low_qual' if ($lowqualcheck && $percN>3.00 && $endN); #trash
   $$d[5]='shortq' if ($tooshort && $$d[2]-$$d[1]+1<$tooshort);
   $$d[6].="trimpoly[+$sh5, -$sh3]; " if $sh5 || $sh3;
   }
 close(TRIMPOLY);
 writeCLRfile($wrkfile);
 } #trimpoly part

#== trash low complexity (unless disabled)
if ($dust) {
  $cmd="mdust $wrkfile -c |";
  open (INCOMING, $cmd) || &MErrExit("Error running $cmd !\n");
  my $lastseq;
  my $lastlen;
  my $trashcount=0;
  while (<INCOMING>) {
    my ($seq_name, $len, $c5,$c3)=split(/\s+/);
    my $d=$seqs{$seq_name} || die "Error at dust: $seq_name not in hash!\n";
    my $rlen = ($seq_name eq $lastseq) ? $lastlen-($c3-$c5) : $len-($c3-$c5); #remaining non-lc
    if ($rlen<40) {
       $$d[5]='dust' unless ($$d[5]);
       $$d[6].='low complexity; ';
       $trashcount++;
      }
    $lastseq=$seq_name;
    $lastlen=$rlen;  
    }
 close(INCOMING);   
 writeCLRfile($wrkfile) if $trashcount;
 }

#===  the vector search; assumes the vector database is already 
#     formatted with formatdb

my $minperc = ($minpid>0) ? $minpid : 93;

foreach my $vdb (split(/,/,$vec_db)) {
 my $changed;
 my $noflt;
 { local $/='^';  $noflt=chomp($vdb); }
 die "Error locating contaminant database $vdb\n" unless -e $vdb;
 my ($fvdb)=($vdb =~ m/([^\/]+)$/);
 $fvdb=$vdb unless $fvdb;
 my $tabout=$wrkfile.'.vtab';
 $cmd="blastall -p blastn -d $vdb -i $wrkfile ";
 $cmd.=$noflt?' -F F ':' -F "m D" ';
 my $blout=$wrkfile.'.blout'; 
 $cmd.=' -q -4 -G 3 -E 3  -v 10000 -b 10000 -e 700 -Y 1.75e10 -m 8 > '.$blout; 
 sysrun($cmd);
 $cmd= 'grep -v "^#" '.$blout.' | sort -k1,1 -k12,12 -nr '." > $tabout"; 
 sysrun($cmd);

 if (-s $tabout) {
   local $_;
   open (TABOUT, $tabout) || die "Error opening '$tabout' at $vdb search\n";
   while (<TABOUT>) {
    next if m/^\s*#/;
    my @t=split(/\t/);
    die "Error: invalid line format in $tabout:\n$_\n" unless @t>=12;
    my $d=$seqs{$t[0]} || 
       die "Error: query '$t[0]' from $tabout not found in sequence hash!\n";
    next if $$d[5];
    #distance of hit from either end:
    my ($d5, $d3)=($t[6]-1, ($$d[8]-$$d[7]+1)-$t[7]);
    #hit should start within 30% distance of either (original) end 
    my $maxdist=0.30*$$d[3];
    my $hitlen=$t[7]-$t[6];
    my $mindist=$d5>$d3?$d3:$d5;
    my $minhitlen=$minvlen+$mindist/2>35 ? 35 : $minvlen+$mindist/2;
    #validate the hit: the further it is, the longer it should be
    
    if ($t[2]>=$minperc && $mindist<$maxdist && $hitlen>=$minhitlen) {
      my $adjusted;
      if ($mindist == $d5) { 
        #trim 5' end
        if ($$d[7]+$t[7]>$$d[1]) {
           $$d[1]=$$d[7]+$t[7]; #set the new end5 after the 3' end of the vector hit
           $adjusted=1;
           }
        }
      else { 
        #trim 3' end
         if ($$d[7]+$t[6]-2<$$d[2]) {
          $$d[2]=$$d[7]+$t[6]-2; #set the new end3 before the 5' end of the vector hit
          $adjusted=1;
          }
        }
      if ($adjusted) {
        $changed=1;
        if ($$d[2]-$$d[1]<$minseqlen) {
           $$d[5]=$fvdb; #trash
           $$d[1]=$$d[2] if $$d[1]>$$d[2];
           }
        $$d[6].=" $t[1]".':<'.($$d[7]+$t[6]-1).'-'.($$d[7]+$t[7]-1).'>;' if $adjusted;
        }
      }
    }
   close(TABOUT);
   }
 unlink($tabout);unlink($blout);
 writeCLRfile($wrkfile) if $changed;
 } # vector search loop
 
# -- screening searches: large contaminant hit expected; that is, 
#    at least 60% of the query should get a hit of at least 96% identity
$minperc = ($minpid>10) ? $minpid : 96;
foreach my $sdb (split(/,/,$screen_db)) {
 die "Error locating contaminant database $sdb\n" unless -e $sdb;
 my ($fsdb)=($sdb =~ m/([^\/]+)$/);
 $fsdb=$sdb unless $fsdb;
 my $tabout=$wrkfile.'.stab';
 my $changed;
 # -- 2.2.11-13 megablast bug: it IGNORES -JF flag!
 # at least blastall still works as it should..
 # so let's just use mgblast instead here:
 $cmd="mgblast -d $sdb -i $wrkfile -p$minperc".
   ' -W18 -JF -F "m D" -X30 -D3 | '.
    ' grep -v "^#" | sort -k1,1 -k12,12 -nr '.
    " > $tabout";
# $minscov=60 unless $minscov;
 sysrun($cmd);
 if (-s $tabout) {
   local $_;
   open (TABOUT, $tabout) || die "Error opening '$tabout' at $sdb search\n";
   while (<TABOUT>) {
    next if m/^\s*#/;
    my @t=split(/\t/);
    die "Error: invalid line format in $tabout:\n$_\n" unless @t>=12;
    my $d=$seqs{$t[0]} || 
       die "Error: query '$t[0]' from $tabout not found in sequence hash!\n";
    next if $$d[5];
    #distance of hit from either [original] end:
    my ($d5, $d3)=($t[6]-1, ($$d[8]-$$d[7]+1)-$t[7]);
    my $hitlen=$t[7]-$t[6];
    #my $minhitlen= (0.75 * ($$d[8]-$$d[7]+1) < $minseqlen) ?  $minseqlen: 0.75 * ($$d[8]-$$d[7]+1);
    my $minhitlen = 60; #at least 60 base pairs overlap!
    
    my $mindist=$d5>$d3?$d3:$d5;
    if ($hitlen>=$minhitlen) {
      my $adjusted;
      if ($mindist == $d5) { 
        #trim 5' end
        if ($$d[7]+$t[7]>$$d[1]) {
           $$d[1]=$$d[7]+$t[7]; #set the new end5 after the 3' end of the vector hit
           $adjusted=1;
           }
        }
      else {
        #trim 3' end
         if ($$d[7]+$t[6]-2<$$d[2]) {
          $$d[2]=$$d[7]+$t[6]-2; #set the new end3 before the 5' end of the vector hit
          $adjusted=1;
          }
        }
      if ($adjusted) {
        $changed=1;
        if ($$d[2]-$$d[1]<$minseqlen) {
           $$d[5]=$fsdb ; #trash
           $$d[1]=$$d[2] if $$d[1]>$$d[2];
           }
        $$d[6].=" $t[1]".':<'.($$d[7]+$t[6]-1).'-'.($$d[7]+$t[7]-1).'>;' if $adjusted;
        }
      }
    } #while
   close(TABOUT);
   }
 unlink($tabout);
 writeCLRfile($wrkfile) if $changed;
 } # larger contaminant screening 

# run trimpoly mostly for computing the percentage of undetermined bases
my $trimcmd = ($polyA)? 'trimpoly -R -s10 -r8 -ex -l20 ':'trimpoly -R ';
$trimcmd.= ($endN ? ' -n1 -m15 '  : ' -n90 -m1 ');
$trimcmd.=' -N ' unless $polyA;
open (TRIMPOLY, $trimcmd." < $wrkfile |") || die "Error opening final '$trimcmd' pipe\n";
while (<TRIMPOLY>) {
   chomp;
   my ($seqname, $percN, $sh5, $sh3, $ilen)=split(/\t/);
   my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n";
   $$d[1]+=$sh5;
   $$d[2]-=$sh3;
   $$d[4]= $percN;
   $$d[5]='low_qual' if ($lowqualcheck && $percN>3.00 && $endN); #trash
   $$d[5]='shortq' if ($tooshort && $$d[2]-$$d[1]+1<$tooshort);
   $$d[6].="trimpoly[+$sh5, -$sh3]; " if $sh5 || $sh3;
   }
close(TRIMPOLY);

#------- write cleaning report file for this slice
open(CLNFILE, '>'.$repfile) || die "Error creating file '$repfile'\n";
foreach my $seqname (@seqlist) {
  my $d=$seqs{$seqname} || die "Error writing cleaning report: sequence $seqname not found!\n";
  printf(CLNFILE "%-16s\t%5.2f\t%5s\t%5s\t%5s\t%12s\t%s\n",$seqname,
                       $$d[4],$$d[1],$$d[2],$$d[3],$$d[5], $$d[6]);
  }
close(CLNFILE);

#------ cleaning up
$file.='*';
unlink(<${file}>);
print "<< finished $sliceno\n";

#==================
sub sysrun {
 local $_ = `($_[0]) 2>&1`;
 my $code=$?; 
 print STDERR $_;
 my @cores = <core.*>;
 if ($code && @cores>0) {
   print STDERR "ERROR: command \n$cmd\n  exited with code $code\n";
   exit(1);
   }
 if ($code ||  m/error|failure|fatal|segmentation|core/i) {
   if (($_[0] =~ m/^\s*blastall/i)) {
     my @olines=split(/\n/);
     @olines=grep(m/error/i && !m/couldn't uncache/i && !m/No valid letters to be indexed/i, 
                   @olines);
     die "blastall unexpected error at\n$_[0]\n" if (@olines>0);
     #my @e = (m/(error)/igs);
     #my @r = (m/(couldn't uncache)/igs);
     #if (scalar(@e) == scalar(@r)) {
     #  print STDERR "Please ignore (code $code) 'uncache' errors above.\n";
     #  return;
     #  }
     #print STDERR "Warning: error code $code with not just uncaching errors at\n$_[0]\n";
     }
    else {
     die "Error code: $code, message:\n$_) at command:\n$_[0]\n";
     }
   }
  
}
#----------------------------------------------------------------------------
#printing fasta sequence (60 chars per line), uppercase
# $seq is just a reference to an actual nucleotide string
# if e5,e3 are specified and e3>e5 only CLEAR range will be returned
#----------------------------------------------------------------------------
sub print_fasta { 
 my ($seq_name, $seq, $e5, $e3) = @_;
 my $pos=$e5-1;
 print '>'.$seq_name."\n" if $seq_name;
 if ($e5 && $e3 && $e3>$e5)  { #assumes e3 and e5 make sense  
    while ($e3-$pos>=60) {
           print uc(substr($$seq,$pos,60))."\n";
           $pos+=60;
           }
    print uc(substr($$seq,$pos,$e3-$pos))."\n" 
              if ($e3-$pos>0);
    }
 else { #no valid clear range provided, print whole sequence
    my $len=length($$seq);
    my $pos=0;
    while ($pos<$len) {
       print uc(substr($$seq,$pos,60))."\n";
       $pos+=60;
       }       
    }
}

sub writeCLRfile {
 my ($outfile, $create)=@_;
 local *SLICEFILE;
 local *OUTFILE;
 open(SLICEFILE, $file) || die "Error opening slice file '$file'\n";
 local $/="\n>";
 open (OUTFILE, ">$outfile") || die "Error creating work file '$outfile'\n";
 my $oldsel=select(OUTFILE); # outfile, only containing clear range, non-trashed sequences
 while (<SLICEFILE>) {
   s/^>//;
   chomp;
   my ($defline, $seq)=(m/^(.+?)\n(.+)/s); 
   my ($seqname, $seqdescr) = ($defline =~ m/^(\S+)\s*(.*)/);
   $seq =~ s/\s+//g;
   my $trash;
   my ($e5, $e3);
   if ($create) {     #descr,   end5, end3,   oldlength, percent N, trash code, cleaning comments, f5, f3 
     #already trash sequences shorter than 100 with code 'i'
     my $seqlen=length($seq);
     $trash='short' if ($tooshort && $seqlen<$tooshort);
                      # 0        1      2       3           4            5           6     7     8
     $seqs{$seqname}=[$seqdescr, 1, $seqlen, $seqlen,       0 ,       $trash,          '', 1, $seqlen];
     push(@seqlist, $seqname);
     }
    else { #extract clear range
     my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n";
     ($e5,$e3, $trash)=($$d[1], $$d[2], $$d[5]);
     ($$d[7], $$d[8])=($e5, $e3); #coordinates of the sequence written in this working file
     }
   &print_fasta($seqname,\$seq, $e5, $e3) unless $trash;
   }
 select($oldsel);
 close OUTFILE;
 close SLICEFILE; 
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size:	11856 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use FindBin;
4	umask 0002;
5
6	chdir($ENV{PVMSX_WD}) if ($ENV{PVMSX_WD});
7
8	my ($file, $numpass, $sliceno, $last, $skipped, $total, $userparams)=@ARGV;
9
10	#$ENV{'PATH'}=$FindBin::Bin.':'.$ENV{'PATH'};
11	$userparams =~ s/\~\+/ /g;
12	my ($dbfile, $flags, $vec_db, $screen_db, $minvlen, $minpid)=split(/:/, $userparams);
13	my ($dbname)=($dbfile =~ m/([^\/]+)$/);
14	$dbname=$dbfile unless $dbname;
15
16	#minimum theoretical length of a terminal vector/linker residue
17	$minvlen=11 unless $minvlen;
18
19	my $cmd;
20	my $wrkfile = $file.'.wrk';
21	my $repfile=sprintf('%06d_%s.cln', $sliceno, $dbname);
22
23	#my $outfile= sprintf('%06d_%s.outfasta', $sliceno, $dbname) if ($flags =~ m/O/);
24	my $log_file='log_std';
25	my $err_file='err_log';
26	my %seqs; # this is the hash with all sequence data
27	my @seqlist; #this is the list with all the sequences in this file
28	my ($tooshort) = ($flags =~ m/S(\d+)/);
29	my $lowqualcheck = ($flags =~ m/M/);
30	my $minseqlen = $tooshort \|\| 50;
31	open(STDERR, '>>'.$err_file);
32	open(STDOUT, '>>'.$log_file);
33	&writeCLRfile($wrkfile, 1); #create sequence hash and work file (length filtered)
34	my $polyA = ($flags =~ m/A/);
35	my $endN = ($flags =~ m/N/);
36	my $dust = ($flags =~ m/L/);
37	#run trimpoly unless disabled
38	if ($polyA \|\| $endN) {
39	my $trimcmd = ($polyA)? 'trimpoly -R -s10 -r8 -ex -l20 ':'trimpoly -R ';
40	if ($endN) {
41	$trimcmd.=' -n1 -m15 ';
42	$trimcmd.=' -N ' unless $polyA;
43	}
44	open (TRIMPOLY, $trimcmd." < $wrkfile \|") \|\| die "Error opening '$trimcmd pipe\n";
45	while (<TRIMPOLY>) {
46	chomp;
47	my ($seqname, $percN, $sh5, $sh3, $ilen)=split(/\t/);
48	my $d=$seqs{$seqname} \|\| die "Error: seq '$seqname' not found in hash!\n";
49	$$d[1]+=$sh5;
50	$$d[2]-=$sh3;
51	$$d[4]=$percN;
52	$$d[5]='low_qual' if ($lowqualcheck && $percN>3.00 && $endN); #trash
53	$$d[5]='shortq' if ($tooshort && $$d[2]-$$d[1]+1<$tooshort);
54	$$d[6].="trimpoly[+$sh5, -$sh3]; " if $sh5 \|\| $sh3;
55	}
56	close(TRIMPOLY);
57	writeCLRfile($wrkfile);
58	} #trimpoly part
59
60	#== trash low complexity (unless disabled)
61	if ($dust) {
62	$cmd="mdust $wrkfile -c \|";
63	open (INCOMING, $cmd) \|\| &MErrExit("Error running $cmd !\n");
64	my $lastseq;
65	my $lastlen;
66	my $trashcount=0;
67	while (<INCOMING>) {
68	my ($seq_name, $len, $c5,$c3)=split(/\s+/);
69	my $d=$seqs{$seq_name} \|\| die "Error at dust: $seq_name not in hash!\n";
70	my $rlen = ($seq_name eq $lastseq) ? $lastlen-($c3-$c5) : $len-($c3-$c5); #remaining non-lc
71	if ($rlen<40) {
72	$$d[5]='dust' unless ($$d[5]);
73	$$d[6].='low complexity; ';
74	$trashcount++;
75	}
76	$lastseq=$seq_name;
77	$lastlen=$rlen;
78	}
79	close(INCOMING);
80	writeCLRfile($wrkfile) if $trashcount;
81	}
82
83	#=== the vector search; assumes the vector database is already
84	# formatted with formatdb
85
86	my $minperc = ($minpid>0) ? $minpid : 93;
87
88	foreach my $vdb (split(/,/,$vec_db)) {
89	my $changed;
90	my $noflt;
91	{ local $/='^'; $noflt=chomp($vdb); }
92	die "Error locating contaminant database $vdb\n" unless -e $vdb;
93	my ($fvdb)=($vdb =~ m/([^\/]+)$/);
94	$fvdb=$vdb unless $fvdb;
95	my $tabout=$wrkfile.'.vtab';
96	$cmd="blastall -p blastn -d $vdb -i $wrkfile ";
97	$cmd.=$noflt?' -F F ':' -F "m D" ';
98	my $blout=$wrkfile.'.blout';
99	$cmd.=' -q -4 -G 3 -E 3 -v 10000 -b 10000 -e 700 -Y 1.75e10 -m 8 > '.$blout;
100	sysrun($cmd);
101	$cmd= 'grep -v "^#" '.$blout.' \| sort -k1,1 -k12,12 -nr '." > $tabout";
102	sysrun($cmd);
103
104	if (-s $tabout) {
105	local $_;
106	open (TABOUT, $tabout) \|\| die "Error opening '$tabout' at $vdb search\n";
107	while (<TABOUT>) {
108	next if m/^\s*#/;
109	my @t=split(/\t/);
110	die "Error: invalid line format in $tabout:\n$_\n" unless @t>=12;
111	my $d=$seqs{$t[0]} \|\|
112	die "Error: query '$t[0]' from $tabout not found in sequence hash!\n";
113	next if $$d[5];
114	#distance of hit from either end:
115	my ($d5, $d3)=($t[6]-1, ($$d[8]-$$d[7]+1)-$t[7]);
116	#hit should start within 30% distance of either (original) end
117	my $maxdist=0.30*$$d[3];
118	my $hitlen=$t[7]-$t[6];
119	my $mindist=$d5>$d3?$d3:$d5;
120	my $minhitlen=$minvlen+$mindist/2>35 ? 35 : $minvlen+$mindist/2;
121	#validate the hit: the further it is, the longer it should be
122
123	if ($t[2]>=$minperc && $mindist<$maxdist && $hitlen>=$minhitlen) {
124	my $adjusted;
125	if ($mindist == $d5) {
126	#trim 5' end
127	if ($$d[7]+$t[7]>$$d[1]) {
128	$$d[1]=$$d[7]+$t[7]; #set the new end5 after the 3' end of the vector hit
129	$adjusted=1;
130	}
131	}
132	else {
133	#trim 3' end
134	if ($$d[7]+$t[6]-2<$$d[2]) {
135	$$d[2]=$$d[7]+$t[6]-2; #set the new end3 before the 5' end of the vector hit
136	$adjusted=1;
137	}
138	}
139	if ($adjusted) {
140	$changed=1;
141	if ($$d[2]-$$d[1]<$minseqlen) {
142	$$d[5]=$fvdb; #trash
143	$$d[1]=$$d[2] if $$d[1]>$$d[2];
144	}
145	$$d[6].=" $t[1]".':<'.($$d[7]+$t[6]-1).'-'.($$d[7]+$t[7]-1).'>;' if $adjusted;
146	}
147	}
148	}
149	close(TABOUT);
150	}
151	unlink($tabout);unlink($blout);
152	writeCLRfile($wrkfile) if $changed;
153	} # vector search loop
154
155	# -- screening searches: large contaminant hit expected; that is,
156	# at least 60% of the query should get a hit of at least 96% identity
157	$minperc = ($minpid>10) ? $minpid : 96;
158	foreach my $sdb (split(/,/,$screen_db)) {
159	die "Error locating contaminant database $sdb\n" unless -e $sdb;
160	my ($fsdb)=($sdb =~ m/([^\/]+)$/);
161	$fsdb=$sdb unless $fsdb;
162	my $tabout=$wrkfile.'.stab';
163	my $changed;
164	# -- 2.2.11-13 megablast bug: it IGNORES -JF flag!
165	# at least blastall still works as it should..
166	# so let's just use mgblast instead here:
167	$cmd="mgblast -d $sdb -i $wrkfile -p$minperc".
168	' -W18 -JF -F "m D" -X30 -D3 \| '.
169	' grep -v "^#" \| sort -k1,1 -k12,12 -nr '.
170	" > $tabout";
171	# $minscov=60 unless $minscov;
172	sysrun($cmd);
173	if (-s $tabout) {
174	local $_;
175	open (TABOUT, $tabout) \|\| die "Error opening '$tabout' at $sdb search\n";
176	while (<TABOUT>) {
177	next if m/^\s*#/;
178	my @t=split(/\t/);
179	die "Error: invalid line format in $tabout:\n$_\n" unless @t>=12;
180	my $d=$seqs{$t[0]} \|\|
181	die "Error: query '$t[0]' from $tabout not found in sequence hash!\n";
182	next if $$d[5];
183	#distance of hit from either [original] end:
184	my ($d5, $d3)=($t[6]-1, ($$d[8]-$$d[7]+1)-$t[7]);
185	my $hitlen=$t[7]-$t[6];
186	#my $minhitlen= (0.75 * ($$d[8]-$$d[7]+1) < $minseqlen) ? $minseqlen: 0.75 * ($$d[8]-$$d[7]+1);
187	my $minhitlen = 60; #at least 60 base pairs overlap!
188
189	my $mindist=$d5>$d3?$d3:$d5;
190	if ($hitlen>=$minhitlen) {
191	my $adjusted;
192	if ($mindist == $d5) {
193	#trim 5' end
194	if ($$d[7]+$t[7]>$$d[1]) {
195	$$d[1]=$$d[7]+$t[7]; #set the new end5 after the 3' end of the vector hit
196	$adjusted=1;
197	}
198	}
199	else {
200	#trim 3' end
201	if ($$d[7]+$t[6]-2<$$d[2]) {
202	$$d[2]=$$d[7]+$t[6]-2; #set the new end3 before the 5' end of the vector hit
203	$adjusted=1;
204	}
205	}
206	if ($adjusted) {
207	$changed=1;
208	if ($$d[2]-$$d[1]<$minseqlen) {
209	$$d[5]=$fsdb ; #trash
210	$$d[1]=$$d[2] if $$d[1]>$$d[2];
211	}
212	$$d[6].=" $t[1]".':<'.($$d[7]+$t[6]-1).'-'.($$d[7]+$t[7]-1).'>;' if $adjusted;
213	}
214	}
215	} #while
216	close(TABOUT);
217	}
218	unlink($tabout);
219	writeCLRfile($wrkfile) if $changed;
220	} # larger contaminant screening
221
222	# run trimpoly mostly for computing the percentage of undetermined bases
223	my $trimcmd = ($polyA)? 'trimpoly -R -s10 -r8 -ex -l20 ':'trimpoly -R ';
224	$trimcmd.= ($endN ? ' -n1 -m15 ' : ' -n90 -m1 ');
225	$trimcmd.=' -N ' unless $polyA;
226	open (TRIMPOLY, $trimcmd." < $wrkfile \|") \|\| die "Error opening final '$trimcmd' pipe\n";
227	while (<TRIMPOLY>) {
228	chomp;
229	my ($seqname, $percN, $sh5, $sh3, $ilen)=split(/\t/);
230	my $d=$seqs{$seqname} \|\| die "Error: seq '$seqname' not found in hash!\n";
231	$$d[1]+=$sh5;
232	$$d[2]-=$sh3;
233	$$d[4]= $percN;
234	$$d[5]='low_qual' if ($lowqualcheck && $percN>3.00 && $endN); #trash
235	$$d[5]='shortq' if ($tooshort && $$d[2]-$$d[1]+1<$tooshort);
236	$$d[6].="trimpoly[+$sh5, -$sh3]; " if $sh5 \|\| $sh3;
237	}
238	close(TRIMPOLY);
239
240	#------- write cleaning report file for this slice
241	open(CLNFILE, '>'.$repfile) \|\| die "Error creating file '$repfile'\n";
242	foreach my $seqname (@seqlist) {
243	my $d=$seqs{$seqname} \|\| die "Error writing cleaning report: sequence $seqname not found!\n";
244	printf(CLNFILE "%-16s\t%5.2f\t%5s\t%5s\t%5s\t%12s\t%s\n",$seqname,
245	$$d[4],$$d[1],$$d[2],$$d[3],$$d[5], $$d[6]);
246	}
247	close(CLNFILE);
248
249	#------ cleaning up
250	$file.='*';
251	unlink(<${file}>);
252	print "<< finished $sliceno\n";
253
254	#==================
255	sub sysrun {
256	local $_ = `($_[0]) 2>&1`;
257	my $code=$?;
258	print STDERR $_;
259	my @cores = <core.*>;
260	if ($code && @cores>0) {
261	print STDERR "ERROR: command \n$cmd\n exited with code $code\n";
262	exit(1);
263	}
264	if ($code \|\| m/error\|failure\|fatal\|segmentation\|core/i) {
265	if (($_[0] =~ m/^\s*blastall/i)) {
266	my @olines=split(/\n/);
267	@olines=grep(m/error/i && !m/couldn't uncache/i && !m/No valid letters to be indexed/i,
268	@olines);
269	die "blastall unexpected error at\n$_[0]\n" if (@olines>0);
270	#my @e = (m/(error)/igs);
271	#my @r = (m/(couldn't uncache)/igs);
272	#if (scalar(@e) == scalar(@r)) {
273	# print STDERR "Please ignore (code $code) 'uncache' errors above.\n";
274	# return;
275	# }
276	#print STDERR "Warning: error code $code with not just uncaching errors at\n$_[0]\n";
277	}
278	else {
279	die "Error code: $code, message:\n$_) at command:\n$_[0]\n";
280	}
281	}
282
283	}
284	#----------------------------------------------------------------------------
285	#printing fasta sequence (60 chars per line), uppercase
286	# $seq is just a reference to an actual nucleotide string
287	# if e5,e3 are specified and e3>e5 only CLEAR range will be returned
288	#----------------------------------------------------------------------------
289	sub print_fasta {
290	my ($seq_name, $seq, $e5, $e3) = @_;
291	my $pos=$e5-1;
292	print '>'.$seq_name."\n" if $seq_name;
293	if ($e5 && $e3 && $e3>$e5) { #assumes e3 and e5 make sense
294	while ($e3-$pos>=60) {
295	print uc(substr($$seq,$pos,60))."\n";
296	$pos+=60;
297	}
298	print uc(substr($$seq,$pos,$e3-$pos))."\n"
299	if ($e3-$pos>0);
300	}
301	else { #no valid clear range provided, print whole sequence
302	my $len=length($$seq);
303	my $pos=0;
304	while ($pos<$len) {
305	print uc(substr($$seq,$pos,60))."\n";
306	$pos+=60;
307	}
308	}
309	}
310
311	sub writeCLRfile {
312	my ($outfile, $create)=@_;
313	local *SLICEFILE;
314	local *OUTFILE;
315	open(SLICEFILE, $file) \|\| die "Error opening slice file '$file'\n";
316	local $/="\n>";
317	open (OUTFILE, ">$outfile") \|\| die "Error creating work file '$outfile'\n";
318	my $oldsel=select(OUTFILE); # outfile, only containing clear range, non-trashed sequences
319	while (<SLICEFILE>) {
320	s/^>//;
321	chomp;
322	my ($defline, $seq)=(m/^(.+?)\n(.+)/s);
323	my ($seqname, $seqdescr) = ($defline =~ m/^(\S+)\s(.)/);
324	$seq =~ s/\s+//g;
325	my $trash;
326	my ($e5, $e3);
327	if ($create) { #descr, end5, end3, oldlength, percent N, trash code, cleaning comments, f5, f3
328	#already trash sequences shorter than 100 with code 'i'
329	my $seqlen=length($seq);
330	$trash='short' if ($tooshort && $seqlen<$tooshort);
331	# 0 1 2 3 4 5 6 7 8
332	$seqs{$seqname}=[$seqdescr, 1, $seqlen, $seqlen, 0 , $trash, '', 1, $seqlen];
333	push(@seqlist, $seqname);
334	}
335	else { #extract clear range
336	my $d=$seqs{$seqname} \|\| die "Error: seq '$seqname' not found in hash!\n";
337	($e5,$e3, $trash)=($$d[1], $$d[2], $$d[5]);
338	($$d[7], $$d[8])=($e5, $e3); #coordinates of the sequence written in this working file
339	}
340	&print_fasta($seqname,\$seq, $e5, $e3) unless $trash;
341	}
342	select($oldsel);
343	close OUTFILE;
344	close SLICEFILE;
345	}