1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
|
5 |
my $usage = q/Usage: |
6 |
ace2fasta.pl -o <contigs_fasta> [-c <components_file>] <acefile> |
7 |
/; |
8 |
umask 0002; |
9 |
getopts('o:c:') || die($usage."\n"); |
10 |
my $outfile=$Getopt::Std::opt_o || |
11 |
die("$usage\nMust specify output file name!\n"); |
12 |
open(CFASTA, '>'.$outfile) || die("Error creating file $outfile!\n"); |
13 |
my $lnkfile=$Getopt::Std::opt_c; |
14 |
if ($lnkfile) { |
15 |
open(COMP, '>'.$lnkfile) || die("Error creating file $lnkfile!\n"); |
16 |
} |
17 |
|
18 |
my $ctg; #current contig |
19 |
my $ctgseq; #current contig sequence |
20 |
my $ctglen; #current contig seq length |
21 |
my $numseqs; |
22 |
my @ctgcomp; #list of component sequence names |
23 |
my %seqs; # seq => [ seqlen, strand, seqL, seqR, asmL, asmR ] |
24 |
my %ctgnames; |
25 |
while (<>) { |
26 |
if (m/^CO\s+(\S+)\s+(\d+)\s+(\d+)/) { |
27 |
my ($c, $l, $n)=($1, $2, $3); |
28 |
#if ($c ne $ctg) { |
29 |
&writeCtg() if $ctg; |
30 |
undef %seqs; |
31 |
undef @ctgcomp; |
32 |
my $ctgdup=++$ctgnames{$c}; |
33 |
$c='ASM_'.$c; |
34 |
$c.='.'.($ctgdup-1) if ($ctgdup>1); |
35 |
($ctg, $ctglen, $numseqs)=($c, $l, $n); |
36 |
# } |
37 |
$ctgseq=''; |
38 |
my $seqline; |
39 |
while (<>) { |
40 |
($seqline)=(/^(\S+)/); |
41 |
last unless $seqline; |
42 |
$ctgseq.=$seqline; |
43 |
} |
44 |
} #contig start line |
45 |
elsif (m/^AF (\S+) ([UC]) ([\-\d]+)/) { |
46 |
my $strand=($2 eq 'U')?'+':'-'; |
47 |
$seqs{$1}=[0, $strand, 0, 0, $3, 0]; #the untrimmed asmL position for now |
48 |
} |
49 |
elsif (m/^RD (\S+) (\d+) (\d+) (\d+)/) { |
50 |
my ($seqname, $seqlen)=($1, $2); |
51 |
my $seqd=$seqs{$seqname}; |
52 |
push(@ctgcomp, $seqname); |
53 |
die("Error at ACE parsing: no sequence found for RD $seqname\n") |
54 |
unless $seqd; |
55 |
$seqd->[0]=$seqlen; |
56 |
my $rdseq=''; |
57 |
my $seqline; |
58 |
while ( defined($_=<>) && (($seqline)=(/(\S+)/))) { |
59 |
$rdseq.=$seqline; |
60 |
} |
61 |
do { |
62 |
$_=<>; |
63 |
} until m/^QA (\d+) (\d+)/ || !defined($_); |
64 |
die("Couldn't find the QA entry for RD $seqname!\n") unless defined($_); |
65 |
($seqd->[2], $seqd->[3]) = ($1, $2); |
66 |
my ($trimL, $trimR)=($seqd->[2]-1, $seqlen-$seqd->[3]); |
67 |
$seqd->[5]=$seqd->[4]+$seqlen-$trimR-1; |
68 |
$seqd->[4]+=$trimL; |
69 |
} |
70 |
} |
71 |
|
72 |
writeCtg() if $ctg; |
73 |
|
74 |
close(CFASTA); |
75 |
close(COMP) if $lnkfile; |
76 |
|
77 |
sub writeCtg { |
78 |
print CFASTA ">$ctg $numseqs\n"; |
79 |
$ctgseq=~tr/-*\n\r//d; |
80 |
# $ctglen=length($ctgseq); |
81 |
print CFASTA join("\n", (unpack('(A72)*',$ctgseq)))."\n"; |
82 |
if ($lnkfile) { |
83 |
print COMP ">$ctg $numseqs $ctglen\n"; |
84 |
foreach my $seqname (@ctgcomp) { |
85 |
my $sd=$seqs{$seqname} || |
86 |
die("Error: no seqdata for component $seqname!\n"); |
87 |
print COMP $seqname.' '.join(' ',@$sd)."\n"; |
88 |
} |
89 |
} |
90 |
|
91 |
} |