ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/tran6frames.pl
Revision: 23
Committed: Tue Jul 26 21:44:38 2011 UTC (13 years ago) by gpertea
Original Path: ann_bin/tran6frames.pl
File size: 5880 byte(s)
Log Message:
adding misc scripts

Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4    
5     my $usage=q{
6     Usage:
7    
8     tran6frames.pl [-O] [-t <tag>] [-R] <seqfile>
9    
10     -R : the sequence is masked for repeats
11     (with lower case letters) and the translation
12     will ignore (skip) such regions
13     -O : use the offset information in the defline of each fasta
14     record for generating the base name of the translated
15     sequences; the generated ID will have the format:
16     <original_seqid>|<original_offset>|<tran_start>_<tran_end>
17     -t add the provided string <tag> to the seq ID generated for each
18     record; the new ID will have the format:
19     <original_seqid>|<tag>|<tran_start>_<tran_end>
20     };
21    
22     getopts('ORt:') || die($usage."\n");
23    
24     my ($useoffset, $repeat, $nametag)=
25     ($Getopt::Std::opt_O, $Getopt::Std::opt_R, $Getopt::Std::opt_t);
26    
27     if ($useoffset && $nametag) {
28     die("$usage\n Error: options -t and -O are exclusive!\n");
29     }
30    
31     my %codons=(
32     'AAA'=>'K', 'AAC'=>'N', 'AAG'=>'K', 'AAR'=>'K', 'AAT'=>'N',
33     'AAY'=>'N', 'ACA'=>'T', 'ACB'=>'T', 'ACC'=>'T', 'ACD'=>'T',
34     'ACG'=>'T', 'ACH'=>'T', 'ACK'=>'T', 'ACM'=>'T', 'ACN'=>'T',
35     'ACR'=>'T', 'ACS'=>'T', 'ACT'=>'T', 'ACV'=>'T', 'ACW'=>'T',
36     'ACY'=>'T', 'AGA'=>'R', 'AGC'=>'S', 'AGG'=>'R', 'AGR'=>'R',
37     'AGT'=>'S', 'AGY'=>'S', 'ATA'=>'I', 'ATC'=>'I', 'ATG'=>'M',
38     'ATH'=>'I', 'ATM'=>'I', 'ATT'=>'I', 'ATW'=>'I', 'ATY'=>'I',
39     'CAA'=>'Q', 'CAC'=>'H', 'CAG'=>'Q', 'CAR'=>'Q', 'CAT'=>'H',
40     'CAY'=>'H', 'CCA'=>'P', 'CCB'=>'P', 'CCC'=>'P', 'CCD'=>'P',
41     'CCG'=>'P', 'CCH'=>'P', 'CCK'=>'P', 'CCM'=>'P', 'CCN'=>'P',
42     'CCR'=>'P', 'CCS'=>'P', 'CCT'=>'P', 'CCV'=>'P', 'CCW'=>'P',
43     'CCY'=>'P', 'CGA'=>'R', 'CGB'=>'R', 'CGC'=>'R', 'CGD'=>'R',
44     'CGG'=>'R', 'CGH'=>'R', 'CGK'=>'R', 'CGM'=>'R', 'CGN'=>'R',
45     'CGR'=>'R', 'CGS'=>'R', 'CGT'=>'R', 'CGV'=>'R', 'CGW'=>'R',
46     'CGY'=>'R', 'CTA'=>'L', 'CTB'=>'L', 'CTC'=>'L', 'CTD'=>'L',
47     'CTG'=>'L', 'CTH'=>'L', 'CTK'=>'L', 'CTM'=>'L', 'CTN'=>'L',
48     'CTR'=>'L', 'CTS'=>'L', 'CTT'=>'L', 'CTV'=>'L', 'CTW'=>'L',
49     'CTY'=>'L', 'GAA'=>'E', 'GAC'=>'D', 'GAG'=>'E', 'GAR'=>'E',
50     'GAT'=>'D', 'GAY'=>'D', 'GCA'=>'A', 'GCB'=>'A', 'GCC'=>'A',
51     'GCD'=>'A', 'GCG'=>'A', 'GCH'=>'A', 'GCK'=>'A', 'GCM'=>'A',
52     'GCN'=>'A', 'GCR'=>'A', 'GCS'=>'A', 'GCT'=>'A', 'GCV'=>'A',
53     'GCW'=>'A', 'GCY'=>'A', 'GGA'=>'G', 'GGB'=>'G', 'GGC'=>'G',
54     'GGD'=>'G', 'GGG'=>'G', 'GGH'=>'G', 'GGK'=>'G', 'GGM'=>'G',
55     'GGN'=>'G', 'GGR'=>'G', 'GGS'=>'G', 'GGT'=>'G', 'GGV'=>'G',
56     'GGW'=>'G', 'GGY'=>'G', 'GTA'=>'V', 'GTB'=>'V', 'GTC'=>'V',
57     'GTD'=>'V', 'GTG'=>'V', 'GTH'=>'V', 'GTK'=>'V', 'GTM'=>'V',
58     'GTN'=>'V', 'GTR'=>'V', 'GTS'=>'V', 'GTT'=>'V', 'GTV'=>'V',
59     'GTW'=>'V', 'GTY'=>'V', 'MGA'=>'R', 'MGG'=>'R', 'MGR'=>'R',
60     'NNN'=>'X', 'RAY'=>'B', 'SAR'=>'Z', 'TAA'=>'.', 'TAC'=>'Y',
61     'TAG'=>'.', 'TAR'=>'.', 'TAT'=>'Y', 'TAY'=>'Y', 'TCA'=>'S',
62     'TCB'=>'S', 'TCC'=>'S', 'TCD'=>'S', 'TCG'=>'S', 'TCH'=>'S',
63     'TCK'=>'S', 'TCM'=>'S', 'TCN'=>'S', 'TCR'=>'S', 'TCS'=>'S',
64     'TCT'=>'S', 'TCV'=>'S', 'TCW'=>'S', 'TCY'=>'S', 'TGA'=>'.',
65     'TGC'=>'C', 'TGG'=>'W', 'TGT'=>'C', 'TGY'=>'C', 'TRA'=>'.',
66     'TTA'=>'L', 'TTC'=>'F', 'TTG'=>'L', 'TTR'=>'L', 'TTT'=>'F',
67     'TTY'=>'F', 'XXX'=>'X', 'YTA'=>'L', 'YTG'=>'L', 'YTR'=>'L'
68     );
69    
70     #open(F,$seqfile) || die("Error opening $seqfile!\n");
71    
72     $/="\n>";
73     while(<>) {
74     chomp;
75     s/^>//;
76     next unless length($_)>0;
77     my ($name, $descr, $seq)=(m/^(\S+)[ \t\x01]*(.*?)\n(.+)/s);
78     die "ERROR 21: Wrong FASTA format: .$_." unless $name && $seq;
79     $seq =~ tr/ \t\n\r//d;
80     my ($ofs)=($descr=~m/^(\d+)/);
81     if ($nametag) {
82     $name.='|'.$nametag;
83     }
84     elsif ($useoffset && defined($ofs)) {
85     $name.='|'.$ofs;
86     }
87     process($name,$seq);
88     }
89    
90     sub process {
91     my ($name,$seq)=@_;
92    
93     my $len=length($seq);
94     #my $index=0;
95     #$index=translate($name,$seq,$len,$index,1);
96     translate($name,$seq,$len,1);
97     $seq= reverse $seq;
98     $seq =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
99     #translate($name,$seq,$len,$index,-1);
100     translate($name,$seq,$len);
101     }
102    
103     sub translate {
104     my ($name,$seq,$len,$fwd)=@_;
105     my $i=0;
106     my @start;
107    
108     $start[0]=0;
109     $start[1]=1;
110     $start[2]=2;
111    
112     my @prot;
113     my @aa;
114     my (@numaa, @numX);
115     while($i+3<$len) {
116     for(my $j=0;$j<3;$j++) {
117     my $cod=substr($seq,$i+$j,3);
118     $aa[$j]=$repeat ? $codons{$cod} : $codons{uc($cod)};
119     $aa[$j]='X' unless $aa[$j];
120     $numX[$j]++ if $aa[$j] eq 'X';
121     if($aa[$j] eq '.') {
122     if($i+$j-$start[$j]>=60 && $numaa[$j]-$numX[$j]>6) {
123     print ">$name|";
124     my ($cstart, $cend)=$fwd ?($start[$j]+1, $i+$j):
125     ($len-$start[$j], $len-$i-$j+1);
126     print join('_',$cstart, $cend)."\n";
127     #$index++;
128     #print $prot[$j],"\n";
129     &printfa($prot[$j]);
130     }
131     $start[$j]=$i+$j+3;
132     $prot[$j]="";
133     $numaa[$j]=0;
134     $numX[$j]=0;
135     } #stop found
136     elsif($aa[$j]) {
137     $prot[$j].=$aa[$j];
138     $numaa[$j]++;
139     }
140     } #for $j
141     $i+=3;
142     } #while $i
143     #process the last aa translation (after the last stop found)
144     for(my $j=0;$j<3;$j++) {
145     my $lenp=length($prot[$j]);
146     if($start[$j]<$len && $lenp>=20 && $numaa[$j]-$numX[$j]>6) {
147     print ">$name|";
148     #$index++;
149     my ($cstart, $cend)= $fwd ? ($start[$j]+1, $start[$j]+3*$lenp) :
150     ($len-$start[$j], $len-$start[$j]-3*$lenp+1);
151     $cend=1 if $cend<1;
152     #if ($cstart==389) {
153     # print STDERR join(", ", "len=$len", "fwd=$fwd","start[j]=$start[$j] (j=$j)", "lenp=$lenp")."\n";
154     # }
155     print join('_',$cstart, $cend)."\n";
156     &printfa($prot[$j]);
157     $prot[$j]="";
158     }
159     }
160     #return $index;
161     }
162    
163     sub printfa {
164     my $seqlen=length($_[0]);
165     my $pos=0;
166     while ($pos<$seqlen) {
167     print substr($_[0],$pos,60)."\n";
168     $pos+=60;
169     }
170     }

Properties

Name Value
svn:executable *