1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
# Help text; printed when option parsing fails or the options are invalid.
my $usage = q/
Split a multi-fasta file in multiple smaller parts.
Usage:
 fasplit {-n <seqs_per_part> | -s <partsize>} [-o <outname>] <multifasta.fa>

Options:
 There are two modes of operation (mutually exclusive):
 -n  split by number of sequences in a part
 -s  split by part size; <partsize> is a numeric argument representing
     the desired part size in Megabytes
 -N  write every sequence to its own part (same as -n 1)

 -o  specify the name prefix for the output files (parts); a 0-padded number
     and a .fa extension will be appended to <outname> for each part
/;
umask 0002;

# --- command line -----------------------------------------------------------
# getopts() stores each switch in $Getopt::Std::opt_X; reading them fully
# qualified keeps the script clean under "use strict".
getopts('Nn:s:o:') || die($usage."\n");

# Test for presence rather than truth so that a file literally named "0"
# is still accepted.
my $infile = $ARGV[0];
die("No multifasta input file provided!\n")
    unless defined($infile) && length($infile);

my $outprefix = $Getopt::Std::opt_o;
my $perSeq    = $Getopt::Std::opt_N;
unless (defined($outprefix) && length($outprefix)) {
    # Default prefix: the input file name without its last extension.
    $outprefix = $infile;
    $outprefix =~ s/\.\w+$//;
}

my $partsize = $Getopt::Std::opt_s;
my $numseqs  = $Getopt::Std::opt_n;
die("$usage\nOnly one of -n or -s options can be given!\n")
    if (defined($numseqs) && defined($partsize));

# Reject values that would otherwise be silently treated as 0 and turn the
# run into an unintended one-sequence-per-file split.
die("$usage\nInvalid -n value '$numseqs': a positive integer is required!\n")
    if defined($numseqs) && !($numseqs =~ /^\d+$/ && $numseqs > 0);
die("$usage\nInvalid -s value '$partsize': a positive number is required!\n")
    if defined($partsize)
    && !($partsize =~ /^(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$/ && $partsize > 0);

if ($perSeq) { $numseqs = 1; $partsize = ''; }

# Pick the roll-over check that runs on every '>' header line.
# When no mode option is given, $partsize ends up 0 and sBufProc starts a new
# part for every record; this matches the historical behaviour of the script.
my $bufProcess = $numseqs ? \&nBufProc : \&sBufProc;

# Three-argument open: the file name is never parsed for mode characters.
open(my $in_fh, '<', $infile)
    || die("Input file $infile cannot be opened: $!\n");

$partsize = int(($partsize || 0) * 1024 * 1024);    # Megabytes -> bytes

# --- state shared with nBufProc / sBufProc ------------------------------------
my $rsize   = 0;    # size accumulated in the current part since it was opened
my $wsize   = 0;    # size of the last part that was closed
my $scount  = 1;    # record counter for the current part
my $partnum = 1;    # current part number
my $pfname  = sprintf('%s%02d', $outprefix, $partnum).'.fa';
open(PFNAME, '>', $pfname) || die("Error creating file $pfname: $!\n");

# --- copy loop ----------------------------------------------------------------
while (my $line = <$in_fh>) {
    $line =~ tr/\n\r//d;            # strip LF and CR so DOS files are normalised
    next unless length($line);      # skip blank lines, but keep a line that is "0"
    # A header line starts a new record: let the active mode decide whether
    # the output must move on to the next part before the line is written.
    $bufProcess->() if $line =~ m/^>/;
    $rsize += length($line) + 1;    # +1 for the newline added on output
    print PFNAME $line."\n"
        or die("Error writing to file $pfname: $!\n");
}
close($in_fh);
# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close(PFNAME) || die("Error closing file $pfname: $!\n");
60 |
|
61 |
# nBufProc - roll-over check for the "split by number of sequences" mode.
#
# Takes no arguments and returns nothing useful; it works entirely on the
# file-level state ($rsize, $wsize, $scount, $numseqs, $partnum, $pfname,
# $outprefix) and on the PFNAME output handle.
# The main loop calls it when a '>' header line has been read and before that
# line is written. Once the current part has received $numseqs records, the
# part is closed and the next numbered part file is opened.
# Dies if the finished part cannot be closed or the new one cannot be created.
sub nBufProc {
    # Nothing has been written to the current part yet, so there is nothing
    # to roll over and this header is not counted.
    return unless $rsize > 0;
    if ($scount >= $numseqs) {
        # Check close(): buffered write errors are only reported here.
        close(PFNAME) || die("Error closing file $pfname: $!\n");
        $partnum++;
        $pfname = sprintf('%s%02d', $outprefix, $partnum).'.fa';
        # Three-argument open keeps characters in the name from being
        # interpreted as part of the open mode.
        open(PFNAME, '>', $pfname) || die("Error creating file $pfname: $!\n");
        $wsize  = $rsize;    # remember the size of the part just closed
        $rsize  = 0;
        $scount = 0;
    }
    $scount++;
}
75 |
|
76 |
# sBufProc - roll-over check for the "split by part size" mode.
#
# Takes no arguments and returns nothing useful; it works entirely on the
# file-level state ($rsize, $wsize, $scount, $partsize, $partnum, $pfname,
# $outprefix) and on the PFNAME output handle.
# The main loop calls it when a '>' header line has been read and before that
# line is written. Once the size accumulated in the current part has reached
# $partsize, the part is closed and the next numbered part file is opened.
# Parts are only switched on record boundaries, so a part can exceed $partsize.
# Dies if the finished part cannot be closed or the new one cannot be created.
sub sBufProc {
    # Nothing has been written to the current part yet: keep using it.
    return unless $rsize > 0;
    if ($rsize >= $partsize) {
        # Close the previous part and open a new one.
        # Check close(): buffered write errors are only reported here.
        close(PFNAME) || die("Error closing file $pfname: $!\n");
        $partnum++;
        $pfname = sprintf('%s%02d', $outprefix, $partnum).'.fa';
        # Three-argument open keeps characters in the name from being
        # interpreted as part of the open mode.
        open(PFNAME, '>', $pfname) || die("Error creating file $pfname: $!\n");
        $wsize  = $rsize;    # remember the size of the part just closed
        $rsize  = 0;
        $scount = 0;
    }
    $scount++;
}