1 |
#!/usr/bin/perl |
2 |
|
3 |
use strict; |
4 |
my $usage=q{ |
5 |
splitbyNs.pl <input_fasta> <output_name> [<multiple_files>] |
6 |
|
7 |
Looks for regions of at least 300Ns and splits the input sequence there. |
8 |
|
9 |
}; |
10 |
|
11 |
my ($fastafile,$outputfile,$multiple)=@ARGV; |
12 |
|
13 |
my $n=1; |
14 |
$/="\n>"; |
15 |
|
16 |
open(F,$fastafile); |
17 |
|
18 |
while(<F>) { |
19 |
s/^>//; |
20 |
chomp; |
21 |
next unless $_; |
22 |
my ($name, $descr, $seq)=(m/^(\S+)[ \t\x01]*(.*?)\n(.+)/s); |
23 |
die "ERROR 21: Wrong FASTA format: .$_." unless $name && $seq; |
24 |
$seq=~tr/\t \n\r//d; |
25 |
$seq=lc($seq); |
26 |
$n=processseq($name,$seq,$outputfile,$n,$multiple); |
27 |
} |
28 |
|
29 |
close(F); |
30 |
|
31 |
|
32 |
sub processseq { |
33 |
my($name,$seq,$output,$n,$multiple)=@_; |
34 |
|
35 |
my $laststart=0; |
36 |
|
37 |
my $ns='n' x 300; |
38 |
|
39 |
my $count=1; |
40 |
my $pos; |
41 |
my $newout; |
42 |
while(($pos=index($seq,$ns,$laststart))>0) { |
43 |
my $newseq=substr($seq,$laststart,$pos-$laststart); |
44 |
if ($multiple) { |
45 |
$newout=$output.".".$n; |
46 |
$n++; |
47 |
open(O,">$newout"); |
48 |
} |
49 |
else { |
50 |
open(O,">>$output"); |
51 |
} |
52 |
print O ">$name"; |
53 |
if($multiple) { |
54 |
print O " $laststart\n"; |
55 |
} |
56 |
else { |
57 |
print O "_$count $laststart\n"; |
58 |
} |
59 |
print O "$newseq\n"; |
60 |
close(O); |
61 |
$count++; |
62 |
$laststart=skipns($seq,$pos); |
63 |
} |
64 |
if($multiple) { |
65 |
$newout=$output.".".$n; |
66 |
$n++; |
67 |
open(O,">$newout"); |
68 |
} |
69 |
else { |
70 |
open(O,">>$output"); |
71 |
} |
72 |
my $newseq=substr($seq,$laststart); |
73 |
print O ">$name"; |
74 |
if($multiple) { |
75 |
print O " $laststart\n"; |
76 |
} |
77 |
else { |
78 |
print O "_$count $laststart\n"; |
79 |
} |
80 |
print O "$newseq\n"; |
81 |
close(O); |
82 |
|
83 |
} |
84 |
|
85 |
sub skipns { |
86 |
my($seq,$pos)=@_; |
87 |
|
88 |
while(substr($seq,$pos,1) eq "n") { $pos++;} |
89 |
|
90 |
return($pos); |
91 |
} |