1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
use Getopt::Std; |
4 |
|
|
|
5 |
|
|
my $usage=q{ |
6 |
|
|
Usage: |
7 |
|
|
|
8 |
|
|
tran6frames.pl [-O] [-t <tag>] [-R] <seqfile> |
9 |
|
|
|
10 |
|
|
-R : the sequence is masked for repeats |
11 |
|
|
(with lower case letters) and the translation |
12 |
|
|
will ignore (skip) such regions |
13 |
|
|
-O : use the offset information in the defline of each fasta |
14 |
|
|
record for generating the base name of the translated |
15 |
|
|
sequences; the generated ID will have the format: |
16 |
|
|
<original_seqid>|<original_offset>|<tran_start>_<tran_end> |
17 |
|
|
-t add the provided string <tag> to the seq ID generated for each |
18 |
|
|
record; the new ID will have the format: |
19 |
|
|
<original_seqid>|<tag>|<tran_start>_<tran_end> |
20 |
|
|
}; |
21 |
|
|
|
22 |
|
|
getopts('ORt:') || die($usage."\n"); |
23 |
|
|
|
24 |
|
|
my ($useoffset, $repeat, $nametag)= |
25 |
|
|
($Getopt::Std::opt_O, $Getopt::Std::opt_R, $Getopt::Std::opt_t); |
26 |
|
|
|
27 |
|
|
if ($useoffset && $nametag) { |
28 |
|
|
die("$usage\n Error: options -t and -O are exclusive!\n"); |
29 |
|
|
} |
30 |
|
|
|
31 |
|
|
my %codons=( |
32 |
|
|
'AAA'=>'K', 'AAC'=>'N', 'AAG'=>'K', 'AAR'=>'K', 'AAT'=>'N', |
33 |
|
|
'AAY'=>'N', 'ACA'=>'T', 'ACB'=>'T', 'ACC'=>'T', 'ACD'=>'T', |
34 |
|
|
'ACG'=>'T', 'ACH'=>'T', 'ACK'=>'T', 'ACM'=>'T', 'ACN'=>'T', |
35 |
|
|
'ACR'=>'T', 'ACS'=>'T', 'ACT'=>'T', 'ACV'=>'T', 'ACW'=>'T', |
36 |
|
|
'ACY'=>'T', 'AGA'=>'R', 'AGC'=>'S', 'AGG'=>'R', 'AGR'=>'R', |
37 |
|
|
'AGT'=>'S', 'AGY'=>'S', 'ATA'=>'I', 'ATC'=>'I', 'ATG'=>'M', |
38 |
|
|
'ATH'=>'I', 'ATM'=>'I', 'ATT'=>'I', 'ATW'=>'I', 'ATY'=>'I', |
39 |
|
|
'CAA'=>'Q', 'CAC'=>'H', 'CAG'=>'Q', 'CAR'=>'Q', 'CAT'=>'H', |
40 |
|
|
'CAY'=>'H', 'CCA'=>'P', 'CCB'=>'P', 'CCC'=>'P', 'CCD'=>'P', |
41 |
|
|
'CCG'=>'P', 'CCH'=>'P', 'CCK'=>'P', 'CCM'=>'P', 'CCN'=>'P', |
42 |
|
|
'CCR'=>'P', 'CCS'=>'P', 'CCT'=>'P', 'CCV'=>'P', 'CCW'=>'P', |
43 |
|
|
'CCY'=>'P', 'CGA'=>'R', 'CGB'=>'R', 'CGC'=>'R', 'CGD'=>'R', |
44 |
|
|
'CGG'=>'R', 'CGH'=>'R', 'CGK'=>'R', 'CGM'=>'R', 'CGN'=>'R', |
45 |
|
|
'CGR'=>'R', 'CGS'=>'R', 'CGT'=>'R', 'CGV'=>'R', 'CGW'=>'R', |
46 |
|
|
'CGY'=>'R', 'CTA'=>'L', 'CTB'=>'L', 'CTC'=>'L', 'CTD'=>'L', |
47 |
|
|
'CTG'=>'L', 'CTH'=>'L', 'CTK'=>'L', 'CTM'=>'L', 'CTN'=>'L', |
48 |
|
|
'CTR'=>'L', 'CTS'=>'L', 'CTT'=>'L', 'CTV'=>'L', 'CTW'=>'L', |
49 |
|
|
'CTY'=>'L', 'GAA'=>'E', 'GAC'=>'D', 'GAG'=>'E', 'GAR'=>'E', |
50 |
|
|
'GAT'=>'D', 'GAY'=>'D', 'GCA'=>'A', 'GCB'=>'A', 'GCC'=>'A', |
51 |
|
|
'GCD'=>'A', 'GCG'=>'A', 'GCH'=>'A', 'GCK'=>'A', 'GCM'=>'A', |
52 |
|
|
'GCN'=>'A', 'GCR'=>'A', 'GCS'=>'A', 'GCT'=>'A', 'GCV'=>'A', |
53 |
|
|
'GCW'=>'A', 'GCY'=>'A', 'GGA'=>'G', 'GGB'=>'G', 'GGC'=>'G', |
54 |
|
|
'GGD'=>'G', 'GGG'=>'G', 'GGH'=>'G', 'GGK'=>'G', 'GGM'=>'G', |
55 |
|
|
'GGN'=>'G', 'GGR'=>'G', 'GGS'=>'G', 'GGT'=>'G', 'GGV'=>'G', |
56 |
|
|
'GGW'=>'G', 'GGY'=>'G', 'GTA'=>'V', 'GTB'=>'V', 'GTC'=>'V', |
57 |
|
|
'GTD'=>'V', 'GTG'=>'V', 'GTH'=>'V', 'GTK'=>'V', 'GTM'=>'V', |
58 |
|
|
'GTN'=>'V', 'GTR'=>'V', 'GTS'=>'V', 'GTT'=>'V', 'GTV'=>'V', |
59 |
|
|
'GTW'=>'V', 'GTY'=>'V', 'MGA'=>'R', 'MGG'=>'R', 'MGR'=>'R', |
60 |
|
|
'NNN'=>'X', 'RAY'=>'B', 'SAR'=>'Z', 'TAA'=>'.', 'TAC'=>'Y', |
61 |
|
|
'TAG'=>'.', 'TAR'=>'.', 'TAT'=>'Y', 'TAY'=>'Y', 'TCA'=>'S', |
62 |
|
|
'TCB'=>'S', 'TCC'=>'S', 'TCD'=>'S', 'TCG'=>'S', 'TCH'=>'S', |
63 |
|
|
'TCK'=>'S', 'TCM'=>'S', 'TCN'=>'S', 'TCR'=>'S', 'TCS'=>'S', |
64 |
|
|
'TCT'=>'S', 'TCV'=>'S', 'TCW'=>'S', 'TCY'=>'S', 'TGA'=>'.', |
65 |
|
|
'TGC'=>'C', 'TGG'=>'W', 'TGT'=>'C', 'TGY'=>'C', 'TRA'=>'.', |
66 |
|
|
'TTA'=>'L', 'TTC'=>'F', 'TTG'=>'L', 'TTR'=>'L', 'TTT'=>'F', |
67 |
|
|
'TTY'=>'F', 'XXX'=>'X', 'YTA'=>'L', 'YTG'=>'L', 'YTR'=>'L' |
68 |
|
|
); |
69 |
|
|
|
70 |
|
|
#open(F,$seqfile) || die("Error opening $seqfile!\n"); |
71 |
|
|
|
72 |
|
|
$/="\n>"; |
73 |
|
|
while(<>) { |
74 |
|
|
chomp; |
75 |
|
|
s/^>//; |
76 |
|
|
next unless length($_)>0; |
77 |
|
|
my ($name, $descr, $seq)=(m/^(\S+)[ \t\x01]*(.*?)\n(.+)/s); |
78 |
|
|
die "ERROR 21: Wrong FASTA format: .$_." unless $name && $seq; |
79 |
|
|
$seq =~ tr/ \t\n\r//d; |
80 |
|
|
my ($ofs)=($descr=~m/^(\d+)/); |
81 |
|
|
if ($nametag) { |
82 |
|
|
$name.='|'.$nametag; |
83 |
|
|
} |
84 |
|
|
elsif ($useoffset && defined($ofs)) { |
85 |
|
|
$name.='|'.$ofs; |
86 |
|
|
} |
87 |
|
|
process($name,$seq); |
88 |
|
|
} |
89 |
|
|
|
90 |
|
|
sub process { |
91 |
|
|
my ($name,$seq)=@_; |
92 |
|
|
|
93 |
|
|
my $len=length($seq); |
94 |
|
|
#my $index=0; |
95 |
|
|
#$index=translate($name,$seq,$len,$index,1); |
96 |
|
|
translate($name,$seq,$len,1); |
97 |
|
|
$seq= reverse $seq; |
98 |
|
|
$seq =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/; |
99 |
|
|
#translate($name,$seq,$len,$index,-1); |
100 |
|
|
translate($name,$seq,$len); |
101 |
|
|
} |
102 |
|
|
|
103 |
|
|
sub translate { |
104 |
|
|
my ($name,$seq,$len,$fwd)=@_; |
105 |
|
|
my $i=0; |
106 |
|
|
my @start; |
107 |
|
|
|
108 |
|
|
$start[0]=0; |
109 |
|
|
$start[1]=1; |
110 |
|
|
$start[2]=2; |
111 |
|
|
|
112 |
|
|
my @prot; |
113 |
|
|
my @aa; |
114 |
|
|
my (@numaa, @numX); |
115 |
|
|
while($i+3<$len) { |
116 |
|
|
for(my $j=0;$j<3;$j++) { |
117 |
|
|
my $cod=substr($seq,$i+$j,3); |
118 |
|
|
$aa[$j]=$repeat ? $codons{$cod} : $codons{uc($cod)}; |
119 |
|
|
$aa[$j]='X' unless $aa[$j]; |
120 |
|
|
$numX[$j]++ if $aa[$j] eq 'X'; |
121 |
|
|
if($aa[$j] eq '.') { |
122 |
|
|
if($i+$j-$start[$j]>=60 && $numaa[$j]-$numX[$j]>6) { |
123 |
|
|
print ">$name|"; |
124 |
|
|
my ($cstart, $cend)=$fwd ?($start[$j]+1, $i+$j): |
125 |
|
|
($len-$start[$j], $len-$i-$j+1); |
126 |
|
|
print join('_',$cstart, $cend)."\n"; |
127 |
|
|
#$index++; |
128 |
|
|
#print $prot[$j],"\n"; |
129 |
|
|
&printfa($prot[$j]); |
130 |
|
|
} |
131 |
|
|
$start[$j]=$i+$j+3; |
132 |
|
|
$prot[$j]=""; |
133 |
|
|
$numaa[$j]=0; |
134 |
|
|
$numX[$j]=0; |
135 |
|
|
} #stop found |
136 |
|
|
elsif($aa[$j]) { |
137 |
|
|
$prot[$j].=$aa[$j]; |
138 |
|
|
$numaa[$j]++; |
139 |
|
|
} |
140 |
|
|
} #for $j |
141 |
|
|
$i+=3; |
142 |
|
|
} #while $i |
143 |
|
|
#process the last aa translation (after the last stop found) |
144 |
|
|
for(my $j=0;$j<3;$j++) { |
145 |
|
|
my $lenp=length($prot[$j]); |
146 |
|
|
if($start[$j]<$len && $lenp>=20 && $numaa[$j]-$numX[$j]>6) { |
147 |
|
|
print ">$name|"; |
148 |
|
|
#$index++; |
149 |
|
|
my ($cstart, $cend)= $fwd ? ($start[$j]+1, $start[$j]+3*$lenp) : |
150 |
|
|
($len-$start[$j], $len-$start[$j]-3*$lenp+1); |
151 |
|
|
$cend=1 if $cend<1; |
152 |
|
|
#if ($cstart==389) { |
153 |
|
|
# print STDERR join(", ", "len=$len", "fwd=$fwd","start[j]=$start[$j] (j=$j)", "lenp=$lenp")."\n"; |
154 |
|
|
# } |
155 |
|
|
print join('_',$cstart, $cend)."\n"; |
156 |
|
|
&printfa($prot[$j]); |
157 |
|
|
$prot[$j]=""; |
158 |
|
|
} |
159 |
|
|
} |
160 |
|
|
#return $index; |
161 |
|
|
} |
162 |
|
|
|
163 |
|
|
sub printfa { |
164 |
|
|
my $seqlen=length($_[0]); |
165 |
|
|
my $pos=0; |
166 |
|
|
while ($pos<$seqlen) { |
167 |
|
|
print substr($_[0],$pos,60)."\n"; |
168 |
|
|
$pos+=60; |
169 |
|
|
} |
170 |
|
|
} |