1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
use Getopt::Std; |
4 |
|
|
use Fcntl qw(SEEK_SET SEEK_END SEEK_CUR); # for seek |
5 |
|
|
|
6 |
|
|
my $usage=q/ |
7 |
|
|
A simple fasta sequence(s) manipulation tool. |
8 |
|
|
Usage: |
9 |
|
|
seqmanip [-C] [-r<ranges>] [-L] [-D|-X] [-I] [-G] [-T|-t <frame>|-Z] |
10 |
|
|
[-M|-m <ncount>] [-f <ranges_file>] [fasta file(s)...] |
11 |
|
|
The input files should be one or more nucleotide or aminoacid sequences |
12 |
|
|
in FASTA or raw format; if no files are given, input is expected at STDIN |
13 |
|
|
|
14 |
|
|
Options: |
15 |
|
|
-C reverse complement (is -r is given, first the subsequence is extracted |
16 |
|
|
and only then reverse-complemented) |
17 |
|
|
-l the line length for the output fasta record (default 70) |
18 |
|
|
-r extract a range (subsequence) from the first input fasta sequence; |
19 |
|
|
<range> can have one of the formats: |
20 |
|
|
<start>..<end> or <start>-<end> or <start>:<len> |
21 |
|
|
Multiple such intervals can be given after the -r option (comma or space |
22 |
|
|
delimited) and they will be spliced together |
23 |
|
|
This option doesn't work with -M or -n options. |
24 |
|
|
-f same as -r but for multi-fasta\/seq input; expects a file containing |
25 |
|
|
lines of this format: |
26 |
|
|
<seqname> <range> |
27 |
|
|
The format of <range> is the same with the -r above or it may be just |
28 |
|
|
space delimited <start> <end> numbers; the output will be a multi-fasta |
29 |
|
|
file with one subsequence for each line in the <ranges_file> |
30 |
|
|
-M merge (concatenate) all input sequences into a single sequence |
31 |
|
|
-m merge all input sequences separating them by <ncount> Ns |
32 |
|
|
-d provides a custom defline for -M and -N options |
33 |
|
|
-D add basic defline if the input doesn't provide one |
34 |
|
|
-X exclude defline from the output (just print the sequence) |
35 |
|
|
-L provide one-line output for each sequence |
36 |
|
|
-T first frame translation (DNA to aminoacid) |
37 |
|
|
-t <frame>th frame translation; <frame> could be 1,2,3,-1,-2,-3 |
38 |
|
|
-E show exon\/splice-sites\/start\/stop info (requires -r) |
39 |
|
|
-G show GFF output for the given range(s) (requires -r) |
40 |
|
|
-Z raw reverse translate the output (aminoacid to DNA, rather useless) |
41 |
|
|
-V only show a sequence if it's a valid, complete CDS |
42 |
|
|
-q only show a sequence if it's longer than <minseqlen> |
43 |
|
|
-n discard sequences with a percentage of Ns (or Xs) higher than <maxpercN> |
44 |
|
|
-y remove spans of Ns longer than <maxnspan> replacing them by <maxnspan> Ns |
45 |
|
|
-Y trim Ns from either end of sequence |
46 |
|
|
-U cleanup\/remove extraneous characters(e.g. digits, dashes, stars) |
47 |
|
|
from every input sequence |
48 |
|
|
-R convert RNA to DNA (simply replacing U wih T) |
49 |
|
|
-P convert all sequence letters to uppercase |
50 |
|
|
-A use ANSI colors to highlight features in terminal (for -E option) |
51 |
|
|
-O return longest ORF sequence for each input sequence (start..stop) |
52 |
|
|
-Q same as -O but doesn't require a start codon |
53 |
|
|
/; |
54 |
|
|
|
55 |
|
|
my %codons=( |
56 |
|
|
'AAA'=>'K', 'AAC'=>'N', 'AAG'=>'K', 'AAR'=>'K', 'AAT'=>'N', |
57 |
|
|
'AAY'=>'N', 'ACA'=>'T', 'ACB'=>'T', 'ACC'=>'T', 'ACD'=>'T', |
58 |
|
|
'ACG'=>'T', 'ACH'=>'T', 'ACK'=>'T', 'ACM'=>'T', 'ACN'=>'T', |
59 |
|
|
'ACR'=>'T', 'ACS'=>'T', 'ACT'=>'T', 'ACV'=>'T', 'ACW'=>'T', |
60 |
|
|
'ACY'=>'T', 'AGA'=>'R', 'AGC'=>'S', 'AGG'=>'R', 'AGR'=>'R', |
61 |
|
|
'AGT'=>'S', 'AGY'=>'S', 'ATA'=>'I', 'ATC'=>'I', 'ATG'=>'M', |
62 |
|
|
'ATH'=>'I', 'ATM'=>'I', 'ATT'=>'I', 'ATW'=>'I', 'ATY'=>'I', |
63 |
|
|
'CAA'=>'Q', 'CAC'=>'H', 'CAG'=>'Q', 'CAR'=>'Q', 'CAT'=>'H', |
64 |
|
|
'CAY'=>'H', 'CCA'=>'P', 'CCB'=>'P', 'CCC'=>'P', 'CCD'=>'P', |
65 |
|
|
'CCG'=>'P', 'CCH'=>'P', 'CCK'=>'P', 'CCM'=>'P', 'CCN'=>'P', |
66 |
|
|
'CCR'=>'P', 'CCS'=>'P', 'CCT'=>'P', 'CCV'=>'P', 'CCW'=>'P', |
67 |
|
|
'CCY'=>'P', 'CGA'=>'R', 'CGB'=>'R', 'CGC'=>'R', 'CGD'=>'R', |
68 |
|
|
'CGG'=>'R', 'CGH'=>'R', 'CGK'=>'R', 'CGM'=>'R', 'CGN'=>'R', |
69 |
|
|
'CGR'=>'R', 'CGS'=>'R', 'CGT'=>'R', 'CGV'=>'R', 'CGW'=>'R', |
70 |
|
|
'CGY'=>'R', 'CTA'=>'L', 'CTB'=>'L', 'CTC'=>'L', 'CTD'=>'L', |
71 |
|
|
'CTG'=>'L', 'CTH'=>'L', 'CTK'=>'L', 'CTM'=>'L', 'CTN'=>'L', |
72 |
|
|
'CTR'=>'L', 'CTS'=>'L', 'CTT'=>'L', 'CTV'=>'L', 'CTW'=>'L', |
73 |
|
|
'CTY'=>'L', 'GAA'=>'E', 'GAC'=>'D', 'GAG'=>'E', 'GAR'=>'E', |
74 |
|
|
'GAT'=>'D', 'GAY'=>'D', 'GCA'=>'A', 'GCB'=>'A', 'GCC'=>'A', |
75 |
|
|
'GCD'=>'A', 'GCG'=>'A', 'GCH'=>'A', 'GCK'=>'A', 'GCM'=>'A', |
76 |
|
|
'GCN'=>'A', 'GCR'=>'A', 'GCS'=>'A', 'GCT'=>'A', 'GCV'=>'A', |
77 |
|
|
'GCW'=>'A', 'GCY'=>'A', 'GGA'=>'G', 'GGB'=>'G', 'GGC'=>'G', |
78 |
|
|
'GGD'=>'G', 'GGG'=>'G', 'GGH'=>'G', 'GGK'=>'G', 'GGM'=>'G', |
79 |
|
|
'GGN'=>'G', 'GGR'=>'G', 'GGS'=>'G', 'GGT'=>'G', 'GGV'=>'G', |
80 |
|
|
'GGW'=>'G', 'GGY'=>'G', 'GTA'=>'V', 'GTB'=>'V', 'GTC'=>'V', |
81 |
|
|
'GTD'=>'V', 'GTG'=>'V', 'GTH'=>'V', 'GTK'=>'V', 'GTM'=>'V', |
82 |
|
|
'GTN'=>'V', 'GTR'=>'V', 'GTS'=>'V', 'GTT'=>'V', 'GTV'=>'V', |
83 |
|
|
'GTW'=>'V', 'GTY'=>'V', 'MGA'=>'R', 'MGG'=>'R', 'MGR'=>'R', |
84 |
|
|
'NNN'=>'X', 'RAY'=>'B', 'SAR'=>'Z', 'TAA'=>'.', 'TAC'=>'Y', |
85 |
|
|
'TAG'=>'.', 'TAR'=>'.', 'TAT'=>'Y', 'TAY'=>'Y', 'TCA'=>'S', |
86 |
|
|
'TCB'=>'S', 'TCC'=>'S', 'TCD'=>'S', 'TCG'=>'S', 'TCH'=>'S', |
87 |
|
|
'TCK'=>'S', 'TCM'=>'S', 'TCN'=>'S', 'TCR'=>'S', 'TCS'=>'S', |
88 |
|
|
'TCT'=>'S', 'TCV'=>'S', 'TCW'=>'S', 'TCY'=>'S', 'TGA'=>'.', |
89 |
|
|
'TGC'=>'C', 'TGG'=>'W', 'TGT'=>'C', 'TGY'=>'C', 'TRA'=>'.', |
90 |
|
|
'TTA'=>'L', 'TTC'=>'F', 'TTG'=>'L', 'TTR'=>'L', 'TTT'=>'F', |
91 |
|
|
'TTY'=>'F', 'XXX'=>'X', 'YTA'=>'L', 'YTG'=>'L', 'YTR'=>'L' |
92 |
|
|
); |
93 |
|
|
|
94 |
|
|
|
95 |
|
|
getopts('AZTUPMEGNRXDVCLYOQr:d:m:q:l:t:f:y:n:') || die "$usage\n"; |
96 |
|
|
#my %ranges; # seqname => [offset, len] only populated by -r or -f option; |
97 |
|
|
my @ranges; # list of [chr, strand, defline, [[offset1,len1], [offset1,len2], ..]] |
98 |
|
|
# 0 1 2 3 |
99 |
|
|
#my @range_chrs; # just to keep track of the order of ranges requested |
100 |
|
|
my $fai_loaded=0; |
101 |
|
|
my %fai_h; # seqID => [fpos, linelen, delimlen, seqlen] |
102 |
|
|
|
103 |
|
|
my $m_linerest; #for the merge case (-M) |
104 |
|
|
my $linelen=$Getopt::Std::opt_l || 70; |
105 |
|
|
my $user_defline=$Getopt::Std::opt_d || 'dummyname'; |
106 |
|
|
my $ansicolors=$Getopt::Std::opt_A; |
107 |
|
|
#some ansi colors: |
108 |
|
|
my ($clred, $clgreen, $clreset)=("\e[1;33;41m", # bright yellow on red |
109 |
|
|
"\e[0;32;40m", # green on black |
110 |
|
|
"\e[0m") #reset colors |
111 |
|
|
if $ansicolors; |
112 |
|
|
my %toDNA=reverse(%codons); |
113 |
|
|
@toDNA{'P','F','K','G'}=('CCC','TTT','AAA','GGG'); |
114 |
|
|
my ($getOrf, $getOrfAny)=($Getopt::Std::opt_O, $Getopt::Std::opt_Q); |
115 |
|
|
$getOrf=1 if $getOrfAny; |
116 |
|
|
my $complement=$Getopt::Std::opt_C; |
117 |
|
|
my $validateCDS=$Getopt::Std::opt_V; |
118 |
|
|
my $exoninfo=$Getopt::Std::opt_E; |
119 |
|
|
my $gffout=$Getopt::Std::opt_G; |
120 |
|
|
my $rna2dna=$Getopt::Std::opt_R; |
121 |
|
|
my $mergedist=$Getopt::Std::opt_m; |
122 |
|
|
my $minseqlen=$Getopt::Std::opt_q; |
123 |
|
|
my $maxnspan=$Getopt::Std::opt_y; |
124 |
|
|
my $trimNs=$Getopt::Std::opt_Y; |
125 |
|
|
my $maxnrepl=''; |
126 |
|
|
if ($maxnspan) { |
127 |
|
|
$maxnrepl='N' x $maxnspan; |
128 |
|
|
$maxnspan++; |
129 |
|
|
} |
130 |
|
|
my $maxpercn=$Getopt::Std::opt_n; |
131 |
|
|
my $mergesep; |
132 |
|
|
if ($mergedist>0) { |
133 |
|
|
$Getopt::Std::opt_M=1; |
134 |
|
|
$mergesep='N' x $mergedist; |
135 |
|
|
} |
136 |
|
|
my $rangetarget; #if a specific seq target was provided |
137 |
|
|
# if this is not set, then the first fasta sequence given as input |
138 |
|
|
# will be used for range queries |
139 |
|
|
# parse range information if available |
140 |
|
|
if ($Getopt::Std::opt_r) { |
141 |
|
|
my $rangetext=$Getopt::Std::opt_r; |
142 |
|
|
my $rstrand='+'; |
143 |
|
|
#if ($rangetext=~s/([\-\+])$//) { $rstrand=$2;} |
144 |
|
|
if ($rangetext=~s/(\d+[\.\-\:_]+\d+)([\-\+])$/$1/) { $rstrand=$2; } |
145 |
|
|
my @ar=split(/[\,\s]+/,$rangetext); #allow multiple ranges |
146 |
|
|
my $seqtarget; |
147 |
|
|
unless ($ar[0]=~m/^\d+\:\d+$/) { |
148 |
|
|
if ($ar[0]=~s/^([\w\|\-\+\.]+)\://) { |
149 |
|
|
# target chromosome given |
150 |
|
|
$seqtarget=$1; |
151 |
|
|
if ($seqtarget=~s/([\-\+])$//) { $rstrand=$1;} |
152 |
|
|
$rangetarget=1; |
153 |
|
|
} |
154 |
|
|
} |
155 |
|
|
if ($seqtarget) { # 0 1 2 3 |
156 |
|
|
push(@ranges, [$seqtarget,$rstrand,'', []]); |
157 |
|
|
} |
158 |
|
|
else { |
159 |
|
|
push(@ranges, ['-',$rstrand,'',[]]); |
160 |
|
|
} |
161 |
|
|
my $rdata=$ranges[0]->[3]; # [ ] |
162 |
|
|
foreach my $rt (@ar) { |
163 |
|
|
my ($rStart, $rLen)=&parseRange($rt); |
164 |
|
|
push(@$rdata, [$rStart, $rLen]); |
165 |
|
|
} |
166 |
|
|
} |
167 |
|
|
elsif ($Getopt::Std::opt_f) { # file with ranges to pull |
168 |
|
|
my $frname=$Getopt::Std::opt_f; |
169 |
|
|
my $using_stdin; |
170 |
|
|
if ($frname eq '-' || $frname eq 'stdin') { |
171 |
|
|
#open(FRNG, <&STDIN); |
172 |
|
|
open(FRNG, "<&=STDIN") or die "Couldn't alias STDIN : $!"; |
173 |
|
|
$using_stdin=1; |
174 |
|
|
} |
175 |
|
|
else { |
176 |
|
|
open (FRNG, $frname) || die ("Error: cannot open seq ranges file '$frname'!\n"); |
177 |
|
|
} |
178 |
|
|
while (<FRNG>) { |
179 |
|
|
chomp; |
180 |
|
|
my $rstrand='+'; |
181 |
|
|
my ($seqname, $rangetxt)=split(/\s+/,$_,2); |
182 |
|
|
my $defline; |
183 |
|
|
if ($seqname=~m/^([\w\|\-\+\.]+)\:(\d+[\.\-_]+\d*[\-\+]?)/) { |
184 |
|
|
#chr:start-end format |
185 |
|
|
# strand may follow chr or end |
186 |
|
|
$seqname=$1; |
187 |
|
|
$defline=$rangetxt; |
188 |
|
|
$rangetxt=$2; |
189 |
|
|
if ($seqname=~s/([\-\+])$//) { $rstrand=$1;} |
190 |
|
|
elsif ($rangetxt=~s/(\d+[\.\-_]+\d+)([\-\+])$/$1/) { $rstrand=$2; } |
191 |
|
|
} |
192 |
|
|
else { #1st column is chr(strand), 2nd column is the interval |
193 |
|
|
if ($seqname=~s/([\-\+])$//) { $rstrand=$1; } |
194 |
|
|
$rangetxt=~s/(\d+)[\t ]+(\d+)/$1-$2/g; |
195 |
|
|
my @rt=split(/\s+/,$rangetxt,2); |
196 |
|
|
if ($rt[1]) { |
197 |
|
|
$defline=$rt[1]; |
198 |
|
|
$rangetxt=$rt[0]; |
199 |
|
|
} |
200 |
|
|
} |
201 |
|
|
next unless $rangetxt; |
202 |
|
|
push(@ranges, [$seqname, $rstrand, $defline, []]); |
203 |
|
|
my $rdata=$ranges[-1]->[3]; |
204 |
|
|
#$rangetxt=~s/(\d+)[\t ]+(\d+)/$1-$2/g; #could be space delimited |
205 |
|
|
my @ar=split(/\,/,$rangetxt); |
206 |
|
|
foreach my $rt (@ar) { |
207 |
|
|
my ($rStart, $rLen)=&parseRange($rt); |
208 |
|
|
push(@$rdata, [$rStart, $rLen]); |
209 |
|
|
} |
210 |
|
|
} |
211 |
|
|
$rangetarget=1 if @ranges>0; |
212 |
|
|
close(FRNG) unless $using_stdin; |
213 |
|
|
} |
214 |
|
|
my $seq; |
215 |
|
|
my $seq_len; |
216 |
|
|
my $count=0; |
217 |
|
|
my $defline; |
218 |
|
|
my $curseq; |
219 |
|
|
my $seqcount; |
220 |
|
|
print ">$user_defline\n" if ($Getopt::Std::opt_M && !$Getopt::Std::opt_X); |
221 |
|
|
die("Error: target sequence(s) given but no input file!\n") |
222 |
|
|
if ($rangetarget && (@ARGV==0 || ! -f $ARGV[0])); |
223 |
|
|
my $qrange=(@ranges>0 && @ARGV>0 && -f $ARGV[0]); |
224 |
|
|
if ($qrange) { #target |
225 |
|
|
if (@ranges==1 && $ranges[0]->[0] eq '-') { #single fasta file |
226 |
|
|
processRanges($ARGV[0], $ranges[0]); |
227 |
|
|
} # single-fasta file |
228 |
|
|
else { |
229 |
|
|
#actual sequence names provided, need cdbfasta or samtools index |
230 |
|
|
my $fasta=$ARGV[0]; |
231 |
|
|
#die("Error: no fasta file $fasta!\n") unless -f $fasta; |
232 |
|
|
my $faidx; |
233 |
|
|
if ($fasta=~s/\.fai$//) { |
234 |
|
|
$faidx=$fasta.'.fai'; |
235 |
|
|
} |
236 |
|
|
else { |
237 |
|
|
$faidx=$fasta.'.fai' if -f $fasta.'.fai'; |
238 |
|
|
} |
239 |
|
|
unless ($faidx) { |
240 |
|
|
if ($fasta=~s/\.cidx$//) { |
241 |
|
|
$faidx=$fasta.'.cidx'; |
242 |
|
|
} |
243 |
|
|
else { |
244 |
|
|
$faidx=$fasta.'.cidx' if -f $fasta.'.cidx'; |
245 |
|
|
} |
246 |
|
|
} |
247 |
|
|
#print STDERR "info: using fasta index $faidx for $fasta.\n"; |
248 |
|
|
die("Error: no index file for fasta file $fasta") unless $faidx; |
249 |
|
|
die("Error: no fasta file $fasta!\n") unless -f $fasta; |
250 |
|
|
foreach my $rd (@ranges) { |
251 |
|
|
processRanges($fasta, $rd, $faidx); |
252 |
|
|
} |
253 |
|
|
} # cidx file needed |
254 |
|
|
} # fast[er] range queries |
255 |
|
|
else { # normal (serial) stream processing, very slow range processing for large sequences |
256 |
|
|
while (<>) { |
257 |
|
|
if (m/^(>.+)/) { |
258 |
|
|
process_seq() if $curseq; #sequence is in $seq variable |
259 |
|
|
$defline=$1; |
260 |
|
|
($curseq)=($defline=~m/^>(\S+)/); |
261 |
|
|
$seq_len=0; |
262 |
|
|
next; |
263 |
|
|
} |
264 |
|
|
chomp; |
265 |
|
|
tr/ \t\r\n//d; |
266 |
|
|
$seq.=$_; |
267 |
|
|
$seq_len+=length(); |
268 |
|
|
process_seq() unless $curseq; #no defline = raw input: one sequence per line |
269 |
|
|
} |
270 |
|
|
process_seq() if $curseq; #sequence is in $seq variable |
271 |
|
|
} |
272 |
|
|
|
273 |
|
|
print "$m_linerest\n" if ($Getopt::Std::opt_M && $m_linerest); |
274 |
|
|
|
275 |
|
|
#============================ |
276 |
|
|
|
277 |
|
|
sub fai_load { |
278 |
|
|
open(FAI, $_[0]) || die ("Error opening index file $_[0]!\n"); |
279 |
|
|
while (<FAI>) { |
280 |
|
|
my @t=split(/\t/);# 0=seqID 1=seqlen 2=fpos 3=linelen 4=linelen+dlen |
281 |
|
|
next if m/^#/ && @t<5; |
282 |
|
|
$fai_h{$t[0]}=[$t[2], $t[3], $t[4]-$t[3], $t[1]]; |
283 |
|
|
# seqID => [fpos, linelen, delimlen, seqlen] |
284 |
|
|
} |
285 |
|
|
close(FAI); |
286 |
|
|
$fai_loaded=1; |
287 |
|
|
} |
288 |
|
|
|
289 |
|
|
sub processRanges { |
290 |
|
|
#my ($fasta, $chr, $chr_rdata, $faidx, $udefline)=@_; |
291 |
|
|
my ($fasta, $r_data, $faidx)=@_; |
292 |
|
|
my $chr=$$r_data[0]; |
293 |
|
|
$chr='' if $chr eq '-'; |
294 |
|
|
my $chr_strand=$$r_data[1]; |
295 |
|
|
my $udefline=$$r_data[2]; |
296 |
|
|
my $chr_rdata=$$r_data[3]; |
297 |
|
|
my @intvs = sort { $main::a->[0] <=> $main::b->[0] } @$chr_rdata; #range intervals |
298 |
|
|
my ($fpos, $seqlen, $dlen, $flinelen); |
299 |
|
|
open(FA, $fasta) || die("Error opening fasta file $fasta\n"); |
300 |
|
|
$fpos=0; |
301 |
|
|
my $iscdb=($faidx=~m/\.cidx$/); |
302 |
|
|
if ($chr) { |
303 |
|
|
die("Error: target sequence given, but no fasta index\n") unless $faidx; |
304 |
|
|
if ($iscdb) { |
305 |
|
|
my $r=`cdbyank -a '$chr' -P $faidx`; |
306 |
|
|
my $syserr=$?; |
307 |
|
|
chomp($r); |
308 |
|
|
die("Error at cdbyank -a '$chr' -P $faidx (exit code $syserr)\n") if length($r)==0 || $syserr; |
309 |
|
|
$fpos=int($r); |
310 |
|
|
seek(FA, $fpos, SEEK_SET); |
311 |
|
|
} |
312 |
|
|
else { #fasta index |
313 |
|
|
#only load the first time |
314 |
|
|
fai_load($faidx) unless ($fai_loaded); |
315 |
|
|
my $cd=$fai_h{$chr}; |
316 |
|
|
die("Error retrieving $chr data from fasta index!\n") unless $cd; |
317 |
|
|
($fpos, $flinelen, $dlen, $seqlen)=@$cd; |
318 |
|
|
$intvs[-1]->[1]=$seqlen-$intvs[-1]->[0]+1 unless $intvs[-1]->[1]; |
319 |
|
|
if ($intvs[-1]->[1]<=0) { |
320 |
|
|
#invalid range |
321 |
|
|
$seq=''; |
322 |
|
|
$defline=''; |
323 |
|
|
return; |
324 |
|
|
} |
325 |
|
|
$defline = $udefline ? '>'.$udefline : ">$chr"; |
326 |
|
|
seek(FA, $fpos, SEEK_SET); |
327 |
|
|
} #samtools fai |
328 |
|
|
} #indexed access |
329 |
|
|
if ($iscdb || !$chr) { |
330 |
|
|
#for cdb or plain fasta we have to determine line length by reading the first 1k bytes |
331 |
|
|
my $rbuf; |
332 |
|
|
read(FA, $rbuf, 1024); |
333 |
|
|
my @lines=split(/[\n\r]+/,$rbuf); |
334 |
|
|
my @ldelim=($rbuf=~m/([\n\r]+)/gs); |
335 |
|
|
if (@ldelim<2) { #we need at least 2 full lines read |
336 |
|
|
my $radd; |
337 |
|
|
read(FA, $radd, 4096); |
338 |
|
|
$rbuf.=$radd; |
339 |
|
|
@lines=split(/[\n\r]+/,$rbuf); |
340 |
|
|
@ldelim=($rbuf=~m/([\n\r]+)/gs); |
341 |
|
|
} |
342 |
|
|
$dlen=length($ldelim[0]); |
343 |
|
|
die("Error determining line ending type for $fasta!\n") if ($dlen==0); |
344 |
|
|
$defline = $udefline ? ">$udefline" : $lines[0]; |
345 |
|
|
$flinelen=length($lines[1]); #line length in fasta record |
346 |
|
|
$fpos+=length($lines[0])+$dlen; |
347 |
|
|
seek(FA, $fpos, SEEK_SET); #reposition at the beginning of the sequence |
348 |
|
|
} |
349 |
|
|
#now we are positioned at the beginning of the fasta record sequence |
350 |
|
|
#seek(FA, $fpos+length($lines[0])+$dlen, SEEK_SET); |
351 |
|
|
$seq=''; |
352 |
|
|
my $revC=1 if $chr_strand eq '-'; |
353 |
|
|
my $rstart=$intvs[0]->[0]-1; #first range start position, 0-based |
354 |
|
|
my $rstartpos= ($rstart<$flinelen) ? $rstart : ($rstart+$dlen*(int($rstart/$flinelen))); |
355 |
|
|
my $fstartpos=$fpos+$rstartpos; |
356 |
|
|
seek(FA, $fstartpos, SEEK_SET); |
357 |
|
|
if ($intvs[-1]->[1]) { #ending for last interval is known |
358 |
|
|
my $rend=$intvs[-1]->[0]+$intvs[-1]->[1]-2; |
359 |
|
|
#$seq_len=$rend-$rstart+1; |
360 |
|
|
my $rendpos=($rend<$flinelen) ? $rend : ($rend+$dlen*(int($rend/$flinelen))); |
361 |
|
|
#print STDERR "pulling from $fstartpos to $rendpos\n"; |
362 |
|
|
my $readlen=$rendpos-$rstartpos+1; |
363 |
|
|
#my $fstartpos=$fpos+length($lines[0])+$dlen+$rstartpos; |
364 |
|
|
read(FA, $seq, $readlen) || die("Error reading $readlen bytes from fasta $fasta at offset $fstartpos\n"); |
365 |
|
|
close(FA); |
366 |
|
|
} |
367 |
|
|
else { #last interval goes to the end of sequence, but seqlen is not known |
368 |
|
|
my $rend=$intvs[-1]->[0]+$intvs[-1]->[1]-2; |
369 |
|
|
#$seq_len=$rend-$rstart+1; |
370 |
|
|
local $_; |
371 |
|
|
while (<FA> && !m/^>/) { |
372 |
|
|
$seq.=$_; |
373 |
|
|
} |
374 |
|
|
close(FA); |
375 |
|
|
} |
376 |
|
|
$seq=~tr/\n\r//d; |
377 |
|
|
$seq_len=length($seq); |
378 |
|
|
#now extract the ranges: |
379 |
|
|
my @rdata; |
380 |
|
|
my $txt_intvs; |
381 |
|
|
foreach my $r (@intvs) { |
382 |
|
|
push(@rdata, [$$r[0]-$rstart, $$r[1]]); |
383 |
|
|
#$spliceseq.=substr($subseq, $cstart, $clen); |
384 |
|
|
$txt_intvs.='|'.$$r[0].'-'.($$r[0]+$$r[1]-1); |
385 |
|
|
} |
386 |
|
|
$defline.=$txt_intvs.$chr_strand unless $udefline; |
387 |
|
|
process_seq(\@rdata, $rstart, $revC); |
388 |
|
|
} |
389 |
|
|
|
390 |
|
|
sub process_seq { |
391 |
|
|
#my $range = $rangekey ? $ranges{$rangekey} : $ranges{$_[0]}; |
392 |
|
|
return unless $seq; |
393 |
|
|
my ($range, $basepos, $revC)=@_; |
394 |
|
|
if ($maxnspan) { |
395 |
|
|
$seq=~s/[NX]{$maxnspan,}/$maxnrepl/ig; |
396 |
|
|
$seq_len=length($seq); |
397 |
|
|
} |
398 |
|
|
if ($Getopt::Std::opt_U) { |
399 |
|
|
$seq=~s/[<>\d\s\-\*\.\,\;\:,\x7B-\xFF]+//sg; |
400 |
|
|
$seq_len=length($seq); |
401 |
|
|
} |
402 |
|
|
if ($rna2dna) { |
403 |
|
|
$seq =~ tr/Uu/Tt/; |
404 |
|
|
} |
405 |
|
|
$seq=uc($seq) if $Getopt::Std::opt_P; |
406 |
|
|
if ($trimNs) { |
407 |
|
|
$seq=~s/^[NX]+//; |
408 |
|
|
$seq=~s/[NX]+$//; |
409 |
|
|
$seq_len=length($seq); |
410 |
|
|
} |
411 |
|
|
if ($minseqlen && $seq_len<$minseqlen) { |
412 |
|
|
$seq=''; |
413 |
|
|
$seq_len=0; |
414 |
|
|
return; |
415 |
|
|
} |
416 |
|
|
if ($maxpercn) { |
417 |
|
|
my $nN=($seq=~tr/AaCcGgTtUu//c); |
418 |
|
|
if ((($nN*100.0)/$seq_len)>$maxpercn) { |
419 |
|
|
$seq=''; |
420 |
|
|
$seq_len=0; |
421 |
|
|
return; |
422 |
|
|
} |
423 |
|
|
} |
424 |
|
|
# basepos is a 0 based offset |
425 |
|
|
unless ($range || @ranges==0) { |
426 |
|
|
$range=$ranges[0]->[3]; |
427 |
|
|
#print STDERR "using default, first ranges!\n"; #DEBUG |
428 |
|
|
} |
429 |
|
|
#print STDERR "key=$_[0], range=$$range[0] ($$range[1])\n"; |
430 |
|
|
#print STDERR "$range\t$seq\n"; |
431 |
|
|
|
432 |
|
|
$seqcount++; |
433 |
|
|
if ($Getopt::Std::opt_M) { #merge only; |
434 |
|
|
#merge all input |
435 |
|
|
if ($mergedist>0 && $seqcount>1) { |
436 |
|
|
$seq=$m_linerest.$mergesep.$seq; |
437 |
|
|
} |
438 |
|
|
else { |
439 |
|
|
$seq=$m_linerest.$seq; |
440 |
|
|
} |
441 |
|
|
$m_linerest=''; |
442 |
|
|
$seq_len=length($seq); |
443 |
|
|
for (my $p=0;$p<$seq_len;$p+=$linelen) { |
444 |
|
|
my $sline=substr($seq, $p, $linelen); |
445 |
|
|
if (length($sline)==$linelen) { |
446 |
|
|
print $sline."\n"; |
447 |
|
|
} |
448 |
|
|
else { |
449 |
|
|
$m_linerest=$sline; |
450 |
|
|
last; |
451 |
|
|
} |
452 |
|
|
} |
453 |
|
|
$seq=''; |
454 |
|
|
$seq_len=0; |
455 |
|
|
return; |
456 |
|
|
} |
457 |
|
|
my @seqranges; |
458 |
|
|
my $intr_ends=''; # 0 1 2 3 |
459 |
|
|
my @gfdata; #array of [exon_start, exon_len, cdsphase, diagram] |
460 |
|
|
if ($range) { #extract range |
461 |
|
|
my @intvs = sort { $main::a->[0] <=> $main::b->[0] } @$range; |
462 |
|
|
#print STDERR length($seq)." (seq_len=$seq_len)\n"; |
463 |
|
|
#print STDERR "beginning: ".substr($seq,0,10)."\n"; |
464 |
|
|
#print STDERR " ending: ".substr($seq,-10)."\n"; |
465 |
|
|
my $i=0; |
466 |
|
|
foreach my $rng (@intvs) { |
467 |
|
|
$i++; |
468 |
|
|
if ($$rng[1]>0) { |
469 |
|
|
push(@seqranges, substr($seq, $$rng[0]-1, $$rng[1])); |
470 |
|
|
my $ss_before=substr($seq, $$rng[0]-3,2); |
471 |
|
|
$ss_before='NN' unless length($ss_before)==2; |
472 |
|
|
my $ss_after=substr($seq, $$rng[0]+$$rng[1]-1,2); |
473 |
|
|
$ss_after='NN' unless length($ss_after)==2; |
474 |
|
|
$intr_ends.=$ss_before.$ss_after; |
475 |
|
|
push(@gfdata, [$$rng[0], $$rng[1], 0,'']); |
476 |
|
|
#print STDERR ">>$seq<<\n"; |
477 |
|
|
} |
478 |
|
|
#elsif ($$rng[1]<0) { # DON"T use this or mix ranges it's not consistent! |
479 |
|
|
# my $rseq=substr($seq, $$rng[0]-1, -$$rng[1]); |
480 |
|
|
# push(@seqranges, reverseComplement($rseq)); |
481 |
|
|
# push(@gfdata, [$$rng[0], $$rng[1], 0, '']); |
482 |
|
|
# } |
483 |
|
|
else { #0 length means to the end of sequence |
484 |
|
|
push(@seqranges, substr($seq, $$rng[0]-1)); |
485 |
|
|
push(@gfdata, [$$rng[0], length($seqranges[-1]), 0,'']); |
486 |
|
|
} |
487 |
|
|
}#for each range |
488 |
|
|
#print STDERR "intr_ends=$intr_ends\n"; |
489 |
|
|
} |
490 |
|
|
else { #no ranges requested, create a dummy one with the whole sequence |
491 |
|
|
push(@seqranges, $seq); |
492 |
|
|
push(@gfdata, [1, length($seqranges[-1]),0, '']); |
493 |
|
|
} |
494 |
|
|
# -- first and last codons |
495 |
|
|
#there is that rare case when the start/stop codons can be |
496 |
|
|
#broken/split by an intron |
497 |
|
|
my ($fcodon, $lcodon); |
498 |
|
|
if (@seqranges>1) { |
499 |
|
|
my $l=$#seqranges; |
500 |
|
|
($fcodon, $lcodon)=( uc(substr($seqranges[0].$seqranges[1],0,3)), |
501 |
|
|
uc(substr($seqranges[$l-1].$seqranges[$l],-3)) ); |
502 |
|
|
} |
503 |
|
|
else { # single-exon |
504 |
|
|
($fcodon, $lcodon)=( uc(substr($seqranges[0],0,3)), |
505 |
|
|
uc(substr($seqranges[-1],-3)) ); |
506 |
|
|
} |
507 |
|
|
my $trframe=$Getopt::Std::opt_t || $Getopt::Std::opt_T; |
508 |
|
|
my $splseq; |
509 |
|
|
$revC=($revC || $complement); |
510 |
|
|
foreach my $sq (@seqranges) { |
511 |
|
|
# -- complement requested? |
512 |
|
|
if ($revC) { |
513 |
|
|
$sq=reverseComplement($sq); |
514 |
|
|
#complement the range coordinates too! |
515 |
|
|
#$rStart=length($seq)-($rStart+$rLen)+2 if $range; |
516 |
|
|
$splseq=$sq.$splseq; |
517 |
|
|
} |
518 |
|
|
else { |
519 |
|
|
$splseq.=$sq; |
520 |
|
|
} |
521 |
|
|
} # for each range |
522 |
|
|
|
523 |
|
|
if (length($intr_ends)>4) { # have introns |
524 |
|
|
$intr_ends=substr($intr_ends,2,-2); |
525 |
|
|
if ($revC) { |
526 |
|
|
$intr_ends=reverseComplement($intr_ends); |
527 |
|
|
} |
528 |
|
|
} |
529 |
|
|
|
530 |
|
|
if ($revC) { |
531 |
|
|
@gfdata=reverse(@gfdata); |
532 |
|
|
($fcodon, $lcodon)=(reverseComplement($lcodon), |
533 |
|
|
reverseComplement($fcodon)); |
534 |
|
|
} |
535 |
|
|
# determine CDS phases |
536 |
|
|
my $initphase=0; #could be provided |
537 |
|
|
my $acclen=3-$initphase; |
538 |
|
|
foreach my $x (@gfdata) { |
539 |
|
|
#$$x[2]=($initframe+$acclen) % 3; |
540 |
|
|
$$x[2]=(3-$acclen%3) % 3; |
541 |
|
|
$acclen+=$$x[1]; |
542 |
|
|
} |
543 |
|
|
my $printflag=1; #should it be printed or not? |
544 |
|
|
# $splseq has the actual sequence to be processed |
545 |
|
|
# ========== vvvv - any other seq processing/trimming should be applied here: |
546 |
|
|
if ($getOrf) { |
547 |
|
|
my ($orf_start, $orf_end); |
548 |
|
|
($splseq, $orf_start, $orf_end)=getLongestOrf($splseq); |
549 |
|
|
if ($orf_start<=0) { |
550 |
|
|
$defline.=' ORF:none'; |
551 |
|
|
$orf_start=0;$orf_end=0; |
552 |
|
|
$splseq=''; |
553 |
|
|
$printflag=0; |
554 |
|
|
} |
555 |
|
|
else { |
556 |
|
|
$defline.=" ORF:$orf_start-$orf_end" if $defline; |
557 |
|
|
} |
558 |
|
|
} |
559 |
|
|
my $transl; |
560 |
|
|
if ($trframe) { |
561 |
|
|
$transl=&dna2prot($splseq, $trframe); #<trframe>th frame translate request |
562 |
|
|
$splseq=$transl; |
563 |
|
|
} |
564 |
|
|
elsif ($Getopt::Std::opt_Z) { |
565 |
|
|
$splseq=&prot2dna($splseq); #back-translate request |
566 |
|
|
} |
567 |
|
|
# ========== ^^^^ |
568 |
|
|
#------ output processing here: |
569 |
|
|
if ($validateCDS) { |
570 |
|
|
$transl=&dna2prot($splseq, $trframe) unless $transl; |
571 |
|
|
$printflag= ($splseq%3 == 0 && $codons{$fcodon} eq 'M' |
572 |
|
|
&& index($transl, '.')==length($transl)-1) |
573 |
|
|
} |
574 |
|
|
unless ($printflag) { |
575 |
|
|
$seq=''; # signal that it was processed |
576 |
|
|
$seq_len=0; |
577 |
|
|
return; |
578 |
|
|
} |
579 |
|
|
unless ($gffout) { |
580 |
|
|
if ($defline) { #format back to fasta |
581 |
|
|
print $defline."\n" unless ($Getopt::Std::opt_X); |
582 |
|
|
} |
583 |
|
|
else { #no defline found in the original file |
584 |
|
|
if ($Getopt::Std::opt_D && !$Getopt::Std::opt_X) { |
585 |
|
|
#my $defl='>SEQ'.$seqcount.'_'.length($seq); |
586 |
|
|
my $defl='>SEQ'.$seqcount.'_'.$seq_len; |
587 |
|
|
print "$defl\n"; |
588 |
|
|
} |
589 |
|
|
} |
590 |
|
|
} #print defline here |
591 |
|
|
unless ($gffout) { |
592 |
|
|
if ($Getopt::Std::opt_L) { #one line printing |
593 |
|
|
print $splseq."\n"; |
594 |
|
|
} |
595 |
|
|
else { # fasta multi-line printing |
596 |
|
|
for (my $p=0;$p<length($splseq);$p+=$linelen) { |
597 |
|
|
my $pl=substr($splseq, $p, $linelen)."\n"; |
598 |
|
|
print( ($trframe && $ansicolors) ? &colorize($pl, '.') : $pl ); |
599 |
|
|
} |
600 |
|
|
} |
601 |
|
|
} |
602 |
|
|
my @wmsg; |
603 |
|
|
if ($exoninfo || $gffout) { |
604 |
|
|
$transl=&dna2prot($splseq, $trframe) unless $transl; |
605 |
|
|
push(@wmsg,'NoStartCodon') if $codons{$fcodon} ne 'M'; |
606 |
|
|
#$wmsg.=';NoStopCodon' if $codons{$lcodon} ne '.'; |
607 |
|
|
push(@wmsg,'NoStopCodon') if substr($transl,-1) ne '.'; |
608 |
|
|
} |
609 |
|
|
|
610 |
|
|
if ($exoninfo) { #show exon info: |
611 |
|
|
print STDERR '----> Exon/CDS info ('.int(length($intr_ends)/4)." exons):\n"; |
612 |
|
|
my $afcodon=$fcodon; |
613 |
|
|
$afcodon=$clgreen.$fcodon.$clreset if ($fcodon eq 'ATG'); |
614 |
|
|
my @gfd;; |
615 |
|
|
my $pstrand=' '; |
616 |
|
|
if ($revC) { |
617 |
|
|
@gfd=map { $_->[0]+$_->[1]-1 } (@gfdata); |
618 |
|
|
$pstrand='-'; |
619 |
|
|
} |
620 |
|
|
else { @gfd=map { $_->[0] } @gfdata; } |
621 |
|
|
printf STDERR '%8d%s [%s~~', $basepos+$gfd[0],$pstrand,$afcodon; |
622 |
|
|
#print STDERR ' '.($basepos+int($gfdata[0]->[2])).' ['.$fcodon.'~~'; |
623 |
|
|
if (@gfdata>1) { # multi-exon |
624 |
|
|
my @splsites=unpack('(A4)*',$intr_ends); |
625 |
|
|
my $xn=1; |
626 |
|
|
foreach my $pair (@splsites) { |
627 |
|
|
my $don=substr($pair,0,2); |
628 |
|
|
$don=$clred.$don.$clreset if ($ansicolors && $don ne 'GT'); |
629 |
|
|
my $acc=substr($pair,-2); |
630 |
|
|
$acc=$clred.$acc.$clreset if ($ansicolors && $acc ne 'AG'); |
631 |
|
|
print STDERR '~~~]'.$don."\n"; |
632 |
|
|
#print STDERR ' '.($basepos+int($gfdata[$xn]->[2])).' '. |
633 |
|
|
# $acc. '[~~~~~'; |
634 |
|
|
printf STDERR '%8d%s %s[~~~~~', $basepos+$gfd[$xn], $pstrand,$acc; |
635 |
|
|
$xn++; |
636 |
|
|
} |
637 |
|
|
} #multi-exon |
638 |
|
|
my $alcodon=$lcodon; |
639 |
|
|
$alcodon=$clgreen.$lcodon.$clreset if $codons{uc($lcodon)} eq '.'; |
640 |
|
|
print STDERR $lcodon."]\n----------------------------"; |
641 |
|
|
if (@wmsg>0) { |
642 |
|
|
print STDERR "Warning: ".$clred.join($clreset.', '.$clred,@wmsg).$clreset; |
643 |
|
|
} |
644 |
|
|
print STDERR "\n"; |
645 |
|
|
} |
646 |
|
|
if ($gffout) { |
647 |
|
|
#show GFF output |
648 |
|
|
my $strand='+'; |
649 |
|
|
if ($revC) { |
650 |
|
|
@gfdata=reverse(@gfdata); #restore in fact, because we reversed it earlier |
651 |
|
|
$strand='-'; |
652 |
|
|
} |
653 |
|
|
my ($seqid)=($defline=~m/^>(\S+)/); |
654 |
|
|
my $warns=''; |
655 |
|
|
$warns=';'.join(';',@wmsg) if (@wmsg>0); |
656 |
|
|
print join("\t", $seqid, 'jigsaw', 'mRNA', $basepos+$gfdata[0]->[0], |
657 |
|
|
$basepos+$gfdata[-1]->[0]+$gfdata[-1]->[1]-1,'.',$strand,'.', 'ID=tjsm.XX;Name=tjsm.XX'.$warns)."\n"; |
658 |
|
|
foreach my $gfl (@gfdata) { |
659 |
|
|
print join("\t", $seqid, 'jigsaw', 'CDS', $basepos+$$gfl[0], |
660 |
|
|
$basepos+$$gfl[0]+$$gfl[1]-1,'.',$strand,$$gfl[2], 'Parent=tjsm.XX')."\n"; |
661 |
|
|
} |
662 |
|
|
} |
663 |
|
|
$seq=''; #processed! |
664 |
|
|
$seq_len=0; |
665 |
|
|
} |
666 |
|
|
|
667 |
|
|
|
668 |
|
|
sub parseRange { |
669 |
|
|
my $r=$_[0]; |
670 |
|
|
my ($rStart, $sep, $rLen)= ($r =~ m/^(\d+)([ \t\.\-\:_]+)(\d*)/); |
671 |
|
|
if ($rLen) { |
672 |
|
|
if ($sep ne ':') { # start-to-end format |
673 |
|
|
($rLen, $rStart) = ($rLen<$rStart) ? ($rStart-$rLen+1, $rLen) : ($rLen-$rStart+1, $rStart); |
674 |
|
|
} |
675 |
|
|
} |
676 |
|
|
#$rLen=-$rLen if ($rLen<0 && $r=~m/\d+\s*\-\s*$/); |
677 |
|
|
$rLen=-$rLen if ($rLen<0); |
678 |
|
|
return ($rStart, $rLen); #$rLen could be empty |
679 |
|
|
} |
680 |
|
|
|
681 |
|
|
|
682 |
|
|
sub colorize { |
683 |
|
|
#colorize all $p strings in $s |
684 |
|
|
my ($s, $p)=@_; |
685 |
|
|
#$red="\e[1;33;41m"; |
686 |
|
|
#$normal="\e[0m"; |
687 |
|
|
my $r="\e[1;33;41m"; # bright yellow on red |
688 |
|
|
my $b="\e[0m"; #reset colors |
689 |
|
|
$p=~s/\./\\./g; |
690 |
|
|
$s=~s/($p)/$r$1$b/g; |
691 |
|
|
return $s; |
692 |
|
|
} |
693 |
|
|
|
694 |
|
|
sub reverseComplement { |
695 |
|
|
my $s=reverse($_[0]); |
696 |
|
|
$s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/; |
697 |
|
|
return $s; |
698 |
|
|
} |
699 |
|
|
|
700 |
|
|
sub dna2prot { |
701 |
|
|
my $frame=$_[1] || 1; |
702 |
|
|
my $s; |
703 |
|
|
if ($frame<0) { |
704 |
|
|
$s=reverseComplement($_[0]); |
705 |
|
|
$frame=-$frame; |
706 |
|
|
} |
707 |
|
|
else { $s=$_[0]; } |
708 |
|
|
if ($frame==1) { $s= uc($s); } |
709 |
|
|
else { $s = uc(substr($s, $frame-1)); } |
710 |
|
|
#my @cods = ($s =~ m/([A-Z][A-Z][A-Z])/g); |
711 |
|
|
my @cods = unpack('(A3)*', $s); |
712 |
|
|
my $r; |
713 |
|
|
foreach my $c (@cods) { |
714 |
|
|
my $aa=$codons{$c} || 'X'; |
715 |
|
|
$r.=$aa; |
716 |
|
|
} |
717 |
|
|
return $r; |
718 |
|
|
} |
719 |
|
|
|
720 |
|
|
sub prot2dna { |
721 |
|
|
my $s= uc($_[0]); |
722 |
|
|
my @aa = ($s =~ m/([A-Z])/g); |
723 |
|
|
my $r; |
724 |
|
|
foreach my $a (@aa) { |
725 |
|
|
my $codon=$toDNA{$a} || 'N'; |
726 |
|
|
$r.=$codon; |
727 |
|
|
} |
728 |
|
|
return $r; |
729 |
|
|
} |
730 |
|
|
|
731 |
|
|
sub getLongestOrf { |
732 |
|
|
my ($seq)=@_; |
733 |
|
|
my $best=0; |
734 |
|
|
my ($bests,$beste)=(-1,-1); |
735 |
|
|
#my $bestorf=""; |
736 |
|
|
my $seqlen=length($seq); |
737 |
|
|
my @starts=(); |
738 |
|
|
my @ends=(); |
739 |
|
|
if ($getOrfAny) { # include start frames if not a stop codon directly |
740 |
|
|
for (my $frame=0;$frame<3;$frame++) { |
741 |
|
|
unless ($seq=~m/^.{$frame}(taa|tga|tag)/i) { |
742 |
|
|
push @starts,$frame+1; |
743 |
|
|
} |
744 |
|
|
} |
745 |
|
|
} |
746 |
|
|
while ($seq=~m/(atg)/gi) { |
747 |
|
|
push @starts,pos($seq)-2; |
748 |
|
|
} |
749 |
|
|
|
750 |
|
|
while ($seq=~m/(taa|tga|tag)/gi) { |
751 |
|
|
push @ends,pos($seq)-2; |
752 |
|
|
} |
753 |
|
|
push @ends,($seqlen-2,$seqlen-1,$seqlen); |
754 |
|
|
for my $s (@starts) { |
755 |
|
|
for my $e (@ends) { |
756 |
|
|
if ($e%3==$s%3 and $e>$s) { |
757 |
|
|
if ($e-$s>$best) { |
758 |
|
|
$best=$e-$s; |
759 |
|
|
($bests,$beste)=($s,$e); |
760 |
|
|
#$bestorf=substr($seq,$s-1,$e-$s+1); |
761 |
|
|
} |
762 |
|
|
last |
763 |
|
|
} |
764 |
|
|
else { |
765 |
|
|
next |
766 |
|
|
} |
767 |
|
|
} #for each end |
768 |
|
|
} #for each start |
769 |
|
|
if ($bests<=0) { |
770 |
|
|
return ('',0,0); |
771 |
|
|
} |
772 |
|
|
$beste+=2 if $beste<=length($seq)-2; #make it so it includes the stop codon! |
773 |
|
|
return (substr($seq,$bests-1,$beste-$bests+1), $bests, $beste); |
774 |
|
|
} |