#!/usr/bin/perl
# dust.pl - mask simple tandem repeats in FASTA records with 'N'.
#
# Reads FASTA from the files named on the command line (or standard input),
# replaces repeated stretches of each sequence with runs of 'N' of the same
# length, and writes the records back out wrapped at 70 columns.
use strict;
use warnings;
use Getopt::Std;
use FindBin;
use lib $FindBin::Bin;

my $usage = q/Usage:
 dust.pl [-c <repeat#>] [-o <out_file>] <fasta_file>

 Masks (with N) all repeats of unit length 2 or greater whose unit is
 followed by at least <repeat#> further copies, and all runs of a single
 character of length 6 or greater.
 Options:
 -c sets the repeat count to <repeat#> (default 4)
 -o writes the masked records to <out_file> instead of standard output
/;

# split_record($record) -> ($header, $sequence)
#
# Splits one raw FASTA record, as read with "\n>" as the input record
# separator, into its header line and its sequence. A leading '>' and a
# trailing "\n>" separator are removed, and tabs, spaces, newlines and
# carriage returns are deleted from the sequence. Either element is the
# empty string (never undef) when the record lacks it, so a header with no
# sequence is still returned intact.
sub split_record {
    my ($record) = @_;

    $record =~ s/^>//;        # first record of a file still has its '>'
    $record =~ s/\n>\z//;     # separator left at the end of the record

    my ($header, $seq) = split /\n/, $record, 2;
    $header = '' unless defined $header;
    $seq    = '' unless defined $seq;

    $header =~ s/\r\z//;      # CRLF input: keep the header free of '\r'
    $seq    =~ tr/\t \n\r//d;

    return ($header, $seq);
}

# mask_repeats($seq, $count) -> $masked
#
# Returns a copy of $seq in which repeats are overwritten with 'N':
#   * a unit of two or more word characters followed by at least $count
#     further copies of itself (i.e. $count + 1 copies in total);
#   * a single word character followed by at least five more of the same
#     character (this threshold does not depend on $count).
# $count defaults to 4. The length of the sequence is preserved.
sub mask_repeats {
    my ($seq, $count) = @_;
    $count = 4 unless defined $count;

    # No /o here: $count is a parameter and may differ between calls.
    $seq =~ s/((\w{2,}?)\2{$count,})/'N' x length $1/eg;

    # for poly-nucleotide
    $seq =~ s/((\w)\2{5,})/'N' x length $1/eg;

    return $seq;
}

# main()
#
# Command-line entry point: parses -c / -o, then masks every FASTA record
# read from the files in @ARGV (or standard input). Dies with the usage
# text on bad options and with an error message when the output file
# cannot be opened or closed.
sub main {
    umask 0002;

    my %opt;
    getopts('c:o:', \%opt) || die($usage."\n");

    my $count = $opt{c} || 4;
    # The count is interpolated into a regex quantifier, so it has to be a
    # plain positive integer.
    die("Invalid repeat count '$count' for -c (expected a positive integer)\n\n$usage\n")
        unless $count =~ /\A[1-9][0-9]*\z/;

    my $outfile = $opt{o};
    my $to_file = defined $outfile && length $outfile;
    my $out;
    if ($to_file) {
        open($out, '>', $outfile)
            or die("Cannot open '$outfile' for writing: $!\n");
    }
    else {
        $out = \*STDOUT;
    }

    {
        local $/ = "\n>";     # read one FASTA record at a time
        while (my $record = <>) {
            my ($header, $seq) = split_record($record);

            # Nothing but separators or blank lines (e.g. text before the
            # first '>'): there is no record to emit.
            next unless length($header) || length($seq);

            print {$out} ">$header\n";
            next unless length $seq;

            print {$out} join("\n", unpack('(A70)*', mask_repeats($seq, $count)))."\n";
        }
    }

    if ($to_file) {
        close($out) or die("Error closing '$outfile': $!\n");
    }
    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# and exercised without consuming standard input.
main() unless caller;

1;