ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/splitbyNs.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years ago) by gpertea
File size: 1587 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2
3 use strict;
4 my $usage=q{
5 splitbyNs.pl <input_fasta> <output_name> [<multiple_files>]
6
7 Looks for regions of at least 300Ns and splits the input sequence there.
8
9 };
10
11 my ($fastafile,$outputfile,$multiple)=@ARGV;
12
13 my $n=1;
14 $/="\n>";
15
16 open(F,$fastafile);
17
18 while(<F>) {
19 s/^>//;
20 chomp;
21 next unless $_;
22 my ($name, $descr, $seq)=(m/^(\S+)[ \t\x01]*(.*?)\n(.+)/s);
23 die "ERROR 21: Wrong FASTA format: .$_." unless $name && $seq;
24 $seq=~tr/\t \n\r//d;
25 $seq=lc($seq);
26 $n=processseq($name,$seq,$outputfile,$n,$multiple);
27 }
28
29 close(F);
30
31
32 sub processseq {
33 my($name,$seq,$output,$n,$multiple)=@_;
34
35 my $laststart=0;
36
37 my $ns='n' x 300;
38
39 my $count=1;
40 my $pos;
41 my $newout;
42 while(($pos=index($seq,$ns,$laststart))>0) {
43 my $newseq=substr($seq,$laststart,$pos-$laststart);
44 if ($multiple) {
45 $newout=$output.".".$n;
46 $n++;
47 open(O,">$newout");
48 }
49 else {
50 open(O,">>$output");
51 }
52 print O ">$name";
53 if($multiple) {
54 print O " $laststart\n";
55 }
56 else {
57 print O "_$count $laststart\n";
58 }
59 print O "$newseq\n";
60 close(O);
61 $count++;
62 $laststart=skipns($seq,$pos);
63 }
64 if($multiple) {
65 $newout=$output.".".$n;
66 $n++;
67 open(O,">$newout");
68 }
69 else {
70 open(O,">>$output");
71 }
72 my $newseq=substr($seq,$laststart);
73 print O ">$name";
74 if($multiple) {
75 print O " $laststart\n";
76 }
77 else {
78 print O "_$count $laststart\n";
79 }
80 print O "$newseq\n";
81 close(O);
82
83 }
84
85 sub skipns {
86 my($seq,$pos)=@_;
87
88 while(substr($seq,$pos,1) eq "n") { $pos++;}
89
90 return($pos);
91 }

Properties

Name Value
svn:executable *