ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/seqmanip
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 26471 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use Fcntl qw(SEEK_SET SEEK_END SEEK_CUR); # for seek
5    
6     my $usage=q/
7     A simple fasta sequence(s) manipulation tool.
8     Usage:
9     seqmanip [-C] [-r<ranges>] [-L] [-D|-X] [-I] [-G] [-T|-t <frame>|-Z]
10     [-M|-m <ncount>] [-f <ranges_file>] [fasta file(s)...]
11     The input files should be one or more nucleotide or aminoacid sequences
12     in FASTA or raw format; if no files are given, input is expected at STDIN
13    
14     Options:
15     -C reverse complement (is -r is given, first the subsequence is extracted
16     and only then reverse-complemented)
17     -l the line length for the output fasta record (default 70)
18     -r extract a range (subsequence) from the first input fasta sequence;
19     <range> can have one of the formats:
20     <start>..<end> or <start>-<end> or <start>:<len>
21     Multiple such intervals can be given after the -r option (comma or space
22     delimited) and they will be spliced together
23     This option doesn't work with -M or -n options.
24     -f same as -r but for multi-fasta\/seq input; expects a file containing
25     lines of this format:
26     <seqname> <range>
27     The format of <range> is the same with the -r above or it may be just
28     space delimited <start> <end> numbers; the output will be a multi-fasta
29     file with one subsequence for each line in the <ranges_file>
30     -M merge (concatenate) all input sequences into a single sequence
31     -m merge all input sequences separating them by <ncount> Ns
32     -d provides a custom defline for -M and -N options
33     -D add basic defline if the input doesn't provide one
34     -X exclude defline from the output (just print the sequence)
35     -L provide one-line output for each sequence
36     -T first frame translation (DNA to aminoacid)
37     -t <frame>th frame translation; <frame> could be 1,2,3,-1,-2,-3
38     -E show exon\/splice-sites\/start\/stop info (requires -r)
39     -G show GFF output for the given range(s) (requires -r)
40     -Z raw reverse translate the output (aminoacid to DNA, rather useless)
41     -V only show a sequence if it's a valid, complete CDS
42     -q only show a sequence if it's longer than <minseqlen>
43     -n discard sequences with a percentage of Ns (or Xs) higher than <maxpercN>
44     -y remove spans of Ns longer than <maxnspan> replacing them by <maxnspan> Ns
45     -Y trim Ns from either end of sequence
46     -U cleanup\/remove extraneous characters(e.g. digits, dashes, stars)
47     from every input sequence
48     -R convert RNA to DNA (simply replacing U wih T)
49     -P convert all sequence letters to uppercase
50     -A use ANSI colors to highlight features in terminal (for -E option)
51     -O return longest ORF sequence for each input sequence (start..stop)
52     -Q same as -O but doesn't require a start codon
53     /;
54    
55     my %codons=(
56     'AAA'=>'K', 'AAC'=>'N', 'AAG'=>'K', 'AAR'=>'K', 'AAT'=>'N',
57     'AAY'=>'N', 'ACA'=>'T', 'ACB'=>'T', 'ACC'=>'T', 'ACD'=>'T',
58     'ACG'=>'T', 'ACH'=>'T', 'ACK'=>'T', 'ACM'=>'T', 'ACN'=>'T',
59     'ACR'=>'T', 'ACS'=>'T', 'ACT'=>'T', 'ACV'=>'T', 'ACW'=>'T',
60     'ACY'=>'T', 'AGA'=>'R', 'AGC'=>'S', 'AGG'=>'R', 'AGR'=>'R',
61     'AGT'=>'S', 'AGY'=>'S', 'ATA'=>'I', 'ATC'=>'I', 'ATG'=>'M',
62     'ATH'=>'I', 'ATM'=>'I', 'ATT'=>'I', 'ATW'=>'I', 'ATY'=>'I',
63     'CAA'=>'Q', 'CAC'=>'H', 'CAG'=>'Q', 'CAR'=>'Q', 'CAT'=>'H',
64     'CAY'=>'H', 'CCA'=>'P', 'CCB'=>'P', 'CCC'=>'P', 'CCD'=>'P',
65     'CCG'=>'P', 'CCH'=>'P', 'CCK'=>'P', 'CCM'=>'P', 'CCN'=>'P',
66     'CCR'=>'P', 'CCS'=>'P', 'CCT'=>'P', 'CCV'=>'P', 'CCW'=>'P',
67     'CCY'=>'P', 'CGA'=>'R', 'CGB'=>'R', 'CGC'=>'R', 'CGD'=>'R',
68     'CGG'=>'R', 'CGH'=>'R', 'CGK'=>'R', 'CGM'=>'R', 'CGN'=>'R',
69     'CGR'=>'R', 'CGS'=>'R', 'CGT'=>'R', 'CGV'=>'R', 'CGW'=>'R',
70     'CGY'=>'R', 'CTA'=>'L', 'CTB'=>'L', 'CTC'=>'L', 'CTD'=>'L',
71     'CTG'=>'L', 'CTH'=>'L', 'CTK'=>'L', 'CTM'=>'L', 'CTN'=>'L',
72     'CTR'=>'L', 'CTS'=>'L', 'CTT'=>'L', 'CTV'=>'L', 'CTW'=>'L',
73     'CTY'=>'L', 'GAA'=>'E', 'GAC'=>'D', 'GAG'=>'E', 'GAR'=>'E',
74     'GAT'=>'D', 'GAY'=>'D', 'GCA'=>'A', 'GCB'=>'A', 'GCC'=>'A',
75     'GCD'=>'A', 'GCG'=>'A', 'GCH'=>'A', 'GCK'=>'A', 'GCM'=>'A',
76     'GCN'=>'A', 'GCR'=>'A', 'GCS'=>'A', 'GCT'=>'A', 'GCV'=>'A',
77     'GCW'=>'A', 'GCY'=>'A', 'GGA'=>'G', 'GGB'=>'G', 'GGC'=>'G',
78     'GGD'=>'G', 'GGG'=>'G', 'GGH'=>'G', 'GGK'=>'G', 'GGM'=>'G',
79     'GGN'=>'G', 'GGR'=>'G', 'GGS'=>'G', 'GGT'=>'G', 'GGV'=>'G',
80     'GGW'=>'G', 'GGY'=>'G', 'GTA'=>'V', 'GTB'=>'V', 'GTC'=>'V',
81     'GTD'=>'V', 'GTG'=>'V', 'GTH'=>'V', 'GTK'=>'V', 'GTM'=>'V',
82     'GTN'=>'V', 'GTR'=>'V', 'GTS'=>'V', 'GTT'=>'V', 'GTV'=>'V',
83     'GTW'=>'V', 'GTY'=>'V', 'MGA'=>'R', 'MGG'=>'R', 'MGR'=>'R',
84     'NNN'=>'X', 'RAY'=>'B', 'SAR'=>'Z', 'TAA'=>'.', 'TAC'=>'Y',
85     'TAG'=>'.', 'TAR'=>'.', 'TAT'=>'Y', 'TAY'=>'Y', 'TCA'=>'S',
86     'TCB'=>'S', 'TCC'=>'S', 'TCD'=>'S', 'TCG'=>'S', 'TCH'=>'S',
87     'TCK'=>'S', 'TCM'=>'S', 'TCN'=>'S', 'TCR'=>'S', 'TCS'=>'S',
88     'TCT'=>'S', 'TCV'=>'S', 'TCW'=>'S', 'TCY'=>'S', 'TGA'=>'.',
89     'TGC'=>'C', 'TGG'=>'W', 'TGT'=>'C', 'TGY'=>'C', 'TRA'=>'.',
90     'TTA'=>'L', 'TTC'=>'F', 'TTG'=>'L', 'TTR'=>'L', 'TTT'=>'F',
91     'TTY'=>'F', 'XXX'=>'X', 'YTA'=>'L', 'YTG'=>'L', 'YTR'=>'L'
92     );
93    
94    
95     getopts('AZTUPMEGNRXDVCLYOQr:d:m:q:l:t:f:y:n:') || die "$usage\n";
96     #my %ranges; # seqname => [offset, len] only populated by -r or -f option;
97     my @ranges; # list of [chr, strand, defline, [[offset1,len1], [offset1,len2], ..]]
98     # 0 1 2 3
99     #my @range_chrs; # just to keep track of the order of ranges requested
100     my $fai_loaded=0;
101     my %fai_h; # seqID => [fpos, linelen, delimlen, seqlen]
102    
103     my $m_linerest; #for the merge case (-M)
104     my $linelen=$Getopt::Std::opt_l || 70;
105     my $user_defline=$Getopt::Std::opt_d || 'dummyname';
106     my $ansicolors=$Getopt::Std::opt_A;
107     #some ansi colors:
108     my ($clred, $clgreen, $clreset)=("\e[1;33;41m", # bright yellow on red
109     "\e[0;32;40m", # green on black
110     "\e[0m") #reset colors
111     if $ansicolors;
112     my %toDNA=reverse(%codons);
113     @toDNA{'P','F','K','G'}=('CCC','TTT','AAA','GGG');
114     my ($getOrf, $getOrfAny)=($Getopt::Std::opt_O, $Getopt::Std::opt_Q);
115     $getOrf=1 if $getOrfAny;
116     my $complement=$Getopt::Std::opt_C;
117     my $validateCDS=$Getopt::Std::opt_V;
118     my $exoninfo=$Getopt::Std::opt_E;
119     my $gffout=$Getopt::Std::opt_G;
120     my $rna2dna=$Getopt::Std::opt_R;
121     my $mergedist=$Getopt::Std::opt_m;
122     my $minseqlen=$Getopt::Std::opt_q;
123     my $maxnspan=$Getopt::Std::opt_y;
124     my $trimNs=$Getopt::Std::opt_Y;
125     my $maxnrepl='';
126     if ($maxnspan) {
127     $maxnrepl='N' x $maxnspan;
128     $maxnspan++;
129     }
130     my $maxpercn=$Getopt::Std::opt_n;
131     my $mergesep;
132     if ($mergedist>0) {
133     $Getopt::Std::opt_M=1;
134     $mergesep='N' x $mergedist;
135     }
136     my $rangetarget; #if a specific seq target was provided
137     # if this is not set, then the first fasta sequence given as input
138     # will be used for range queries
139     # parse range information if available
140     if ($Getopt::Std::opt_r) {
141     my $rangetext=$Getopt::Std::opt_r;
142     my $rstrand='+';
143     #if ($rangetext=~s/([\-\+])$//) { $rstrand=$2;}
144     if ($rangetext=~s/(\d+[\.\-\:_]+\d+)([\-\+])$/$1/) { $rstrand=$2; }
145     my @ar=split(/[\,\s]+/,$rangetext); #allow multiple ranges
146     my $seqtarget;
147     unless ($ar[0]=~m/^\d+\:\d+$/) {
148     if ($ar[0]=~s/^([\w\|\-\+\.]+)\://) {
149     # target chromosome given
150     $seqtarget=$1;
151     if ($seqtarget=~s/([\-\+])$//) { $rstrand=$1;}
152     $rangetarget=1;
153     }
154     }
155     if ($seqtarget) { # 0 1 2 3
156     push(@ranges, [$seqtarget,$rstrand,'', []]);
157     }
158     else {
159     push(@ranges, ['-',$rstrand,'',[]]);
160     }
161     my $rdata=$ranges[0]->[3]; # [ ]
162     foreach my $rt (@ar) {
163     my ($rStart, $rLen)=&parseRange($rt);
164     push(@$rdata, [$rStart, $rLen]);
165     }
166     }
167     elsif ($Getopt::Std::opt_f) { # file with ranges to pull
168     my $frname=$Getopt::Std::opt_f;
169     my $using_stdin;
170     if ($frname eq '-' || $frname eq 'stdin') {
171     #open(FRNG, <&STDIN);
172     open(FRNG, "<&=STDIN") or die "Couldn't alias STDIN : $!";
173     $using_stdin=1;
174     }
175     else {
176     open (FRNG, $frname) || die ("Error: cannot open seq ranges file '$frname'!\n");
177     }
178     while (<FRNG>) {
179     chomp;
180     my $rstrand='+';
181     my ($seqname, $rangetxt)=split(/\s+/,$_,2);
182     my $defline;
183     if ($seqname=~m/^([\w\|\-\+\.]+)\:(\d+[\.\-_]+\d*[\-\+]?)/) {
184     #chr:start-end format
185     # strand may follow chr or end
186     $seqname=$1;
187     $defline=$rangetxt;
188     $rangetxt=$2;
189     if ($seqname=~s/([\-\+])$//) { $rstrand=$1;}
190     elsif ($rangetxt=~s/(\d+[\.\-_]+\d+)([\-\+])$/$1/) { $rstrand=$2; }
191     }
192     else { #1st column is chr(strand), 2nd column is the interval
193     if ($seqname=~s/([\-\+])$//) { $rstrand=$1; }
194     $rangetxt=~s/(\d+)[\t ]+(\d+)/$1-$2/g;
195     my @rt=split(/\s+/,$rangetxt,2);
196     if ($rt[1]) {
197     $defline=$rt[1];
198     $rangetxt=$rt[0];
199     }
200     }
201     next unless $rangetxt;
202     push(@ranges, [$seqname, $rstrand, $defline, []]);
203     my $rdata=$ranges[-1]->[3];
204     #$rangetxt=~s/(\d+)[\t ]+(\d+)/$1-$2/g; #could be space delimited
205     my @ar=split(/\,/,$rangetxt);
206     foreach my $rt (@ar) {
207     my ($rStart, $rLen)=&parseRange($rt);
208     push(@$rdata, [$rStart, $rLen]);
209     }
210     }
211     $rangetarget=1 if @ranges>0;
212     close(FRNG) unless $using_stdin;
213     }
214     my $seq;
215     my $seq_len;
216     my $count=0;
217     my $defline;
218     my $curseq;
219     my $seqcount;
220     print ">$user_defline\n" if ($Getopt::Std::opt_M && !$Getopt::Std::opt_X);
221     die("Error: target sequence(s) given but no input file!\n")
222     if ($rangetarget && (@ARGV==0 || ! -f $ARGV[0]));
223     my $qrange=(@ranges>0 && @ARGV>0 && -f $ARGV[0]);
224     if ($qrange) { #target
225     if (@ranges==1 && $ranges[0]->[0] eq '-') { #single fasta file
226     processRanges($ARGV[0], $ranges[0]);
227     } # single-fasta file
228     else {
229     #actual sequence names provided, need cdbfasta or samtools index
230     my $fasta=$ARGV[0];
231     #die("Error: no fasta file $fasta!\n") unless -f $fasta;
232     my $faidx;
233     if ($fasta=~s/\.fai$//) {
234     $faidx=$fasta.'.fai';
235     }
236     else {
237     $faidx=$fasta.'.fai' if -f $fasta.'.fai';
238     }
239     unless ($faidx) {
240     if ($fasta=~s/\.cidx$//) {
241     $faidx=$fasta.'.cidx';
242     }
243     else {
244     $faidx=$fasta.'.cidx' if -f $fasta.'.cidx';
245     }
246     }
247     #print STDERR "info: using fasta index $faidx for $fasta.\n";
248     die("Error: no index file for fasta file $fasta") unless $faidx;
249     die("Error: no fasta file $fasta!\n") unless -f $fasta;
250     foreach my $rd (@ranges) {
251     processRanges($fasta, $rd, $faidx);
252     }
253     } # cidx file needed
254     } # fast[er] range queries
255     else { # normal (serial) stream processing, very slow range processing for large sequences
256     while (<>) {
257     if (m/^(>.+)/) {
258     process_seq() if $curseq; #sequence is in $seq variable
259     $defline=$1;
260     ($curseq)=($defline=~m/^>(\S+)/);
261     $seq_len=0;
262     next;
263     }
264     chomp;
265     tr/ \t\r\n//d;
266     $seq.=$_;
267     $seq_len+=length();
268     process_seq() unless $curseq; #no defline = raw input: one sequence per line
269     }
270     process_seq() if $curseq; #sequence is in $seq variable
271     }
272    
273     print "$m_linerest\n" if ($Getopt::Std::opt_M && $m_linerest);
274    
275     #============================
276    
277     sub fai_load {
278     open(FAI, $_[0]) || die ("Error opening index file $_[0]!\n");
279     while (<FAI>) {
280     my @t=split(/\t/);# 0=seqID 1=seqlen 2=fpos 3=linelen 4=linelen+dlen
281     next if m/^#/ && @t<5;
282     $fai_h{$t[0]}=[$t[2], $t[3], $t[4]-$t[3], $t[1]];
283     # seqID => [fpos, linelen, delimlen, seqlen]
284     }
285     close(FAI);
286     $fai_loaded=1;
287     }
288    
289     sub processRanges {
290     #my ($fasta, $chr, $chr_rdata, $faidx, $udefline)=@_;
291     my ($fasta, $r_data, $faidx)=@_;
292     my $chr=$$r_data[0];
293     $chr='' if $chr eq '-';
294     my $chr_strand=$$r_data[1];
295     my $udefline=$$r_data[2];
296     my $chr_rdata=$$r_data[3];
297     my @intvs = sort { $main::a->[0] <=> $main::b->[0] } @$chr_rdata; #range intervals
298     my ($fpos, $seqlen, $dlen, $flinelen);
299     open(FA, $fasta) || die("Error opening fasta file $fasta\n");
300     $fpos=0;
301     my $iscdb=($faidx=~m/\.cidx$/);
302     if ($chr) {
303     die("Error: target sequence given, but no fasta index\n") unless $faidx;
304     if ($iscdb) {
305     my $r=`cdbyank -a '$chr' -P $faidx`;
306     my $syserr=$?;
307     chomp($r);
308     die("Error at cdbyank -a '$chr' -P $faidx (exit code $syserr)\n") if length($r)==0 || $syserr;
309     $fpos=int($r);
310     seek(FA, $fpos, SEEK_SET);
311     }
312     else { #fasta index
313     #only load the first time
314     fai_load($faidx) unless ($fai_loaded);
315     my $cd=$fai_h{$chr};
316     die("Error retrieving $chr data from fasta index!\n") unless $cd;
317     ($fpos, $flinelen, $dlen, $seqlen)=@$cd;
318     $intvs[-1]->[1]=$seqlen-$intvs[-1]->[0]+1 unless $intvs[-1]->[1];
319     if ($intvs[-1]->[1]<=0) {
320     #invalid range
321     $seq='';
322     $defline='';
323     return;
324     }
325     $defline = $udefline ? '>'.$udefline : ">$chr";
326     seek(FA, $fpos, SEEK_SET);
327     } #samtools fai
328     } #indexed access
329     if ($iscdb || !$chr) {
330     #for cdb or plain fasta we have to determine line length by reading the first 1k bytes
331     my $rbuf;
332     read(FA, $rbuf, 1024);
333     my @lines=split(/[\n\r]+/,$rbuf);
334     my @ldelim=($rbuf=~m/([\n\r]+)/gs);
335     if (@ldelim<2) { #we need at least 2 full lines read
336     my $radd;
337     read(FA, $radd, 4096);
338     $rbuf.=$radd;
339     @lines=split(/[\n\r]+/,$rbuf);
340     @ldelim=($rbuf=~m/([\n\r]+)/gs);
341     }
342     $dlen=length($ldelim[0]);
343     die("Error determining line ending type for $fasta!\n") if ($dlen==0);
344     $defline = $udefline ? ">$udefline" : $lines[0];
345     $flinelen=length($lines[1]); #line length in fasta record
346     $fpos+=length($lines[0])+$dlen;
347     seek(FA, $fpos, SEEK_SET); #reposition at the beginning of the sequence
348     }
349     #now we are positioned at the beginning of the fasta record sequence
350     #seek(FA, $fpos+length($lines[0])+$dlen, SEEK_SET);
351     $seq='';
352     my $revC=1 if $chr_strand eq '-';
353     my $rstart=$intvs[0]->[0]-1; #first range start position, 0-based
354     my $rstartpos= ($rstart<$flinelen) ? $rstart : ($rstart+$dlen*(int($rstart/$flinelen)));
355     my $fstartpos=$fpos+$rstartpos;
356     seek(FA, $fstartpos, SEEK_SET);
357     if ($intvs[-1]->[1]) { #ending for last interval is known
358     my $rend=$intvs[-1]->[0]+$intvs[-1]->[1]-2;
359     #$seq_len=$rend-$rstart+1;
360     my $rendpos=($rend<$flinelen) ? $rend : ($rend+$dlen*(int($rend/$flinelen)));
361     #print STDERR "pulling from $fstartpos to $rendpos\n";
362     my $readlen=$rendpos-$rstartpos+1;
363     #my $fstartpos=$fpos+length($lines[0])+$dlen+$rstartpos;
364     read(FA, $seq, $readlen) || die("Error reading $readlen bytes from fasta $fasta at offset $fstartpos\n");
365     close(FA);
366     }
367     else { #last interval goes to the end of sequence, but seqlen is not known
368     my $rend=$intvs[-1]->[0]+$intvs[-1]->[1]-2;
369     #$seq_len=$rend-$rstart+1;
370     local $_;
371     while (<FA> && !m/^>/) {
372     $seq.=$_;
373     }
374     close(FA);
375     }
376     $seq=~tr/\n\r//d;
377     $seq_len=length($seq);
378     #now extract the ranges:
379     my @rdata;
380     my $txt_intvs;
381     foreach my $r (@intvs) {
382     push(@rdata, [$$r[0]-$rstart, $$r[1]]);
383     #$spliceseq.=substr($subseq, $cstart, $clen);
384     $txt_intvs.='|'.$$r[0].'-'.($$r[0]+$$r[1]-1);
385     }
386     $defline.=$txt_intvs.$chr_strand unless $udefline;
387     process_seq(\@rdata, $rstart, $revC);
388     }
389    
390     sub process_seq {
391     #my $range = $rangekey ? $ranges{$rangekey} : $ranges{$_[0]};
392     return unless $seq;
393     my ($range, $basepos, $revC)=@_;
394     if ($maxnspan) {
395     $seq=~s/[NX]{$maxnspan,}/$maxnrepl/ig;
396     $seq_len=length($seq);
397     }
398     if ($Getopt::Std::opt_U) {
399     $seq=~s/[<>\d\s\-\*\.\,\;\:,\x7B-\xFF]+//sg;
400     $seq_len=length($seq);
401     }
402     if ($rna2dna) {
403     $seq =~ tr/Uu/Tt/;
404     }
405     $seq=uc($seq) if $Getopt::Std::opt_P;
406     if ($trimNs) {
407     $seq=~s/^[NX]+//;
408     $seq=~s/[NX]+$//;
409     $seq_len=length($seq);
410     }
411     if ($minseqlen && $seq_len<$minseqlen) {
412     $seq='';
413     $seq_len=0;
414     return;
415     }
416     if ($maxpercn) {
417     my $nN=($seq=~tr/AaCcGgTtUu//c);
418     if ((($nN*100.0)/$seq_len)>$maxpercn) {
419     $seq='';
420     $seq_len=0;
421     return;
422     }
423     }
424     # basepos is a 0 based offset
425     unless ($range || @ranges==0) {
426     $range=$ranges[0]->[3];
427     #print STDERR "using default, first ranges!\n"; #DEBUG
428     }
429     #print STDERR "key=$_[0], range=$$range[0] ($$range[1])\n";
430     #print STDERR "$range\t$seq\n";
431    
432     $seqcount++;
433     if ($Getopt::Std::opt_M) { #merge only;
434     #merge all input
435     if ($mergedist>0 && $seqcount>1) {
436     $seq=$m_linerest.$mergesep.$seq;
437     }
438     else {
439     $seq=$m_linerest.$seq;
440     }
441     $m_linerest='';
442     $seq_len=length($seq);
443     for (my $p=0;$p<$seq_len;$p+=$linelen) {
444     my $sline=substr($seq, $p, $linelen);
445     if (length($sline)==$linelen) {
446     print $sline."\n";
447     }
448     else {
449     $m_linerest=$sline;
450     last;
451     }
452     }
453     $seq='';
454     $seq_len=0;
455     return;
456     }
457     my @seqranges;
458     my $intr_ends=''; # 0 1 2 3
459     my @gfdata; #array of [exon_start, exon_len, cdsphase, diagram]
460     if ($range) { #extract range
461     my @intvs = sort { $main::a->[0] <=> $main::b->[0] } @$range;
462     #print STDERR length($seq)." (seq_len=$seq_len)\n";
463     #print STDERR "beginning: ".substr($seq,0,10)."\n";
464     #print STDERR " ending: ".substr($seq,-10)."\n";
465     my $i=0;
466     foreach my $rng (@intvs) {
467     $i++;
468     if ($$rng[1]>0) {
469     push(@seqranges, substr($seq, $$rng[0]-1, $$rng[1]));
470     my $ss_before=substr($seq, $$rng[0]-3,2);
471     $ss_before='NN' unless length($ss_before)==2;
472     my $ss_after=substr($seq, $$rng[0]+$$rng[1]-1,2);
473     $ss_after='NN' unless length($ss_after)==2;
474     $intr_ends.=$ss_before.$ss_after;
475     push(@gfdata, [$$rng[0], $$rng[1], 0,'']);
476     #print STDERR ">>$seq<<\n";
477     }
478     #elsif ($$rng[1]<0) { # DON"T use this or mix ranges it's not consistent!
479     # my $rseq=substr($seq, $$rng[0]-1, -$$rng[1]);
480     # push(@seqranges, reverseComplement($rseq));
481     # push(@gfdata, [$$rng[0], $$rng[1], 0, '']);
482     # }
483     else { #0 length means to the end of sequence
484     push(@seqranges, substr($seq, $$rng[0]-1));
485     push(@gfdata, [$$rng[0], length($seqranges[-1]), 0,'']);
486     }
487     }#for each range
488     #print STDERR "intr_ends=$intr_ends\n";
489     }
490     else { #no ranges requested, create a dummy one with the whole sequence
491     push(@seqranges, $seq);
492     push(@gfdata, [1, length($seqranges[-1]),0, '']);
493     }
494     # -- first and last codons
495     #there is that rare case when the start/stop codons can be
496     #broken/split by an intron
497     my ($fcodon, $lcodon);
498     if (@seqranges>1) {
499     my $l=$#seqranges;
500     ($fcodon, $lcodon)=( uc(substr($seqranges[0].$seqranges[1],0,3)),
501     uc(substr($seqranges[$l-1].$seqranges[$l],-3)) );
502     }
503     else { # single-exon
504     ($fcodon, $lcodon)=( uc(substr($seqranges[0],0,3)),
505     uc(substr($seqranges[-1],-3)) );
506     }
507     my $trframe=$Getopt::Std::opt_t || $Getopt::Std::opt_T;
508     my $splseq;
509     $revC=($revC || $complement);
510     foreach my $sq (@seqranges) {
511     # -- complement requested?
512     if ($revC) {
513     $sq=reverseComplement($sq);
514     #complement the range coordinates too!
515     #$rStart=length($seq)-($rStart+$rLen)+2 if $range;
516     $splseq=$sq.$splseq;
517     }
518     else {
519     $splseq.=$sq;
520     }
521     } # for each range
522    
523     if (length($intr_ends)>4) { # have introns
524     $intr_ends=substr($intr_ends,2,-2);
525     if ($revC) {
526     $intr_ends=reverseComplement($intr_ends);
527     }
528     }
529    
530     if ($revC) {
531     @gfdata=reverse(@gfdata);
532     ($fcodon, $lcodon)=(reverseComplement($lcodon),
533     reverseComplement($fcodon));
534     }
535     # determine CDS phases
536     my $initphase=0; #could be provided
537     my $acclen=3-$initphase;
538     foreach my $x (@gfdata) {
539     #$$x[2]=($initframe+$acclen) % 3;
540     $$x[2]=(3-$acclen%3) % 3;
541     $acclen+=$$x[1];
542     }
543     my $printflag=1; #should it be printed or not?
544     # $splseq has the actual sequence to be processed
545     # ========== vvvv - any other seq processing/trimming should be applied here:
546     if ($getOrf) {
547     my ($orf_start, $orf_end);
548     ($splseq, $orf_start, $orf_end)=getLongestOrf($splseq);
549     if ($orf_start<=0) {
550     $defline.=' ORF:none';
551     $orf_start=0;$orf_end=0;
552     $splseq='';
553     $printflag=0;
554     }
555     else {
556     $defline.=" ORF:$orf_start-$orf_end" if $defline;
557     }
558     }
559     my $transl;
560     if ($trframe) {
561     $transl=&dna2prot($splseq, $trframe); #<trframe>th frame translate request
562     $splseq=$transl;
563     }
564     elsif ($Getopt::Std::opt_Z) {
565     $splseq=&prot2dna($splseq); #back-translate request
566     }
567     # ========== ^^^^
568     #------ output processing here:
569     if ($validateCDS) {
570     $transl=&dna2prot($splseq, $trframe) unless $transl;
571     $printflag= ($splseq%3 == 0 && $codons{$fcodon} eq 'M'
572     && index($transl, '.')==length($transl)-1)
573     }
574     unless ($printflag) {
575     $seq=''; # signal that it was processed
576     $seq_len=0;
577     return;
578     }
579     unless ($gffout) {
580     if ($defline) { #format back to fasta
581     print $defline."\n" unless ($Getopt::Std::opt_X);
582     }
583     else { #no defline found in the original file
584     if ($Getopt::Std::opt_D && !$Getopt::Std::opt_X) {
585     #my $defl='>SEQ'.$seqcount.'_'.length($seq);
586     my $defl='>SEQ'.$seqcount.'_'.$seq_len;
587     print "$defl\n";
588     }
589     }
590     } #print defline here
591     unless ($gffout) {
592     if ($Getopt::Std::opt_L) { #one line printing
593     print $splseq."\n";
594     }
595     else { # fasta multi-line printing
596     for (my $p=0;$p<length($splseq);$p+=$linelen) {
597     my $pl=substr($splseq, $p, $linelen)."\n";
598     print( ($trframe && $ansicolors) ? &colorize($pl, '.') : $pl );
599     }
600     }
601     }
602     my @wmsg;
603     if ($exoninfo || $gffout) {
604     $transl=&dna2prot($splseq, $trframe) unless $transl;
605     push(@wmsg,'NoStartCodon') if $codons{$fcodon} ne 'M';
606     #$wmsg.=';NoStopCodon' if $codons{$lcodon} ne '.';
607     push(@wmsg,'NoStopCodon') if substr($transl,-1) ne '.';
608     }
609    
610     if ($exoninfo) { #show exon info:
611     print STDERR '----> Exon/CDS info ('.int(length($intr_ends)/4)." exons):\n";
612     my $afcodon=$fcodon;
613     $afcodon=$clgreen.$fcodon.$clreset if ($fcodon eq 'ATG');
614     my @gfd;;
615     my $pstrand=' ';
616     if ($revC) {
617     @gfd=map { $_->[0]+$_->[1]-1 } (@gfdata);
618     $pstrand='-';
619     }
620     else { @gfd=map { $_->[0] } @gfdata; }
621     printf STDERR '%8d%s [%s~~', $basepos+$gfd[0],$pstrand,$afcodon;
622     #print STDERR ' '.($basepos+int($gfdata[0]->[2])).' ['.$fcodon.'~~';
623     if (@gfdata>1) { # multi-exon
624     my @splsites=unpack('(A4)*',$intr_ends);
625     my $xn=1;
626     foreach my $pair (@splsites) {
627     my $don=substr($pair,0,2);
628     $don=$clred.$don.$clreset if ($ansicolors && $don ne 'GT');
629     my $acc=substr($pair,-2);
630     $acc=$clred.$acc.$clreset if ($ansicolors && $acc ne 'AG');
631     print STDERR '~~~]'.$don."\n";
632     #print STDERR ' '.($basepos+int($gfdata[$xn]->[2])).' '.
633     # $acc. '[~~~~~';
634     printf STDERR '%8d%s %s[~~~~~', $basepos+$gfd[$xn], $pstrand,$acc;
635     $xn++;
636     }
637     } #multi-exon
638     my $alcodon=$lcodon;
639     $alcodon=$clgreen.$lcodon.$clreset if $codons{uc($lcodon)} eq '.';
640     print STDERR $lcodon."]\n----------------------------";
641     if (@wmsg>0) {
642     print STDERR "Warning: ".$clred.join($clreset.', '.$clred,@wmsg).$clreset;
643     }
644     print STDERR "\n";
645     }
646     if ($gffout) {
647     #show GFF output
648     my $strand='+';
649     if ($revC) {
650     @gfdata=reverse(@gfdata); #restore in fact, because we reversed it earlier
651     $strand='-';
652     }
653     my ($seqid)=($defline=~m/^>(\S+)/);
654     my $warns='';
655     $warns=';'.join(';',@wmsg) if (@wmsg>0);
656     print join("\t", $seqid, 'jigsaw', 'mRNA', $basepos+$gfdata[0]->[0],
657     $basepos+$gfdata[-1]->[0]+$gfdata[-1]->[1]-1,'.',$strand,'.', 'ID=tjsm.XX;Name=tjsm.XX'.$warns)."\n";
658     foreach my $gfl (@gfdata) {
659     print join("\t", $seqid, 'jigsaw', 'CDS', $basepos+$$gfl[0],
660     $basepos+$$gfl[0]+$$gfl[1]-1,'.',$strand,$$gfl[2], 'Parent=tjsm.XX')."\n";
661     }
662     }
663     $seq=''; #processed!
664     $seq_len=0;
665     }
666    
667    
668     sub parseRange {
669     my $r=$_[0];
670     my ($rStart, $sep, $rLen)= ($r =~ m/^(\d+)([ \t\.\-\:_]+)(\d*)/);
671     if ($rLen) {
672     if ($sep ne ':') { # start-to-end format
673     ($rLen, $rStart) = ($rLen<$rStart) ? ($rStart-$rLen+1, $rLen) : ($rLen-$rStart+1, $rStart);
674     }
675     }
676     #$rLen=-$rLen if ($rLen<0 && $r=~m/\d+\s*\-\s*$/);
677     $rLen=-$rLen if ($rLen<0);
678     return ($rStart, $rLen); #$rLen could be empty
679     }
680    
681    
682     sub colorize {
683     #colorize all $p strings in $s
684     my ($s, $p)=@_;
685     #$red="\e[1;33;41m";
686     #$normal="\e[0m";
687     my $r="\e[1;33;41m"; # bright yellow on red
688     my $b="\e[0m"; #reset colors
689     $p=~s/\./\\./g;
690     $s=~s/($p)/$r$1$b/g;
691     return $s;
692     }
693    
694     sub reverseComplement {
695     my $s=reverse($_[0]);
696     $s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
697     return $s;
698     }
699    
700     sub dna2prot {
701     my $frame=$_[1] || 1;
702     my $s;
703     if ($frame<0) {
704     $s=reverseComplement($_[0]);
705     $frame=-$frame;
706     }
707     else { $s=$_[0]; }
708     if ($frame==1) { $s= uc($s); }
709     else { $s = uc(substr($s, $frame-1)); }
710     #my @cods = ($s =~ m/([A-Z][A-Z][A-Z])/g);
711     my @cods = unpack('(A3)*', $s);
712     my $r;
713     foreach my $c (@cods) {
714     my $aa=$codons{$c} || 'X';
715     $r.=$aa;
716     }
717     return $r;
718     }
719    
720     sub prot2dna {
721     my $s= uc($_[0]);
722     my @aa = ($s =~ m/([A-Z])/g);
723     my $r;
724     foreach my $a (@aa) {
725     my $codon=$toDNA{$a} || 'N';
726     $r.=$codon;
727     }
728     return $r;
729     }
730    
731     sub getLongestOrf {
732     my ($seq)=@_;
733     my $best=0;
734     my ($bests,$beste)=(-1,-1);
735     #my $bestorf="";
736     my $seqlen=length($seq);
737     my @starts=();
738     my @ends=();
739     if ($getOrfAny) { # include start frames if not a stop codon directly
740     for (my $frame=0;$frame<3;$frame++) {
741     unless ($seq=~m/^.{$frame}(taa|tga|tag)/i) {
742     push @starts,$frame+1;
743     }
744     }
745     }
746     while ($seq=~m/(atg)/gi) {
747     push @starts,pos($seq)-2;
748     }
749    
750     while ($seq=~m/(taa|tga|tag)/gi) {
751     push @ends,pos($seq)-2;
752     }
753     push @ends,($seqlen-2,$seqlen-1,$seqlen);
754     for my $s (@starts) {
755     for my $e (@ends) {
756     if ($e%3==$s%3 and $e>$s) {
757     if ($e-$s>$best) {
758     $best=$e-$s;
759     ($bests,$beste)=($s,$e);
760     #$bestorf=substr($seq,$s-1,$e-$s+1);
761     }
762     last
763     }
764     else {
765     next
766     }
767     } #for each end
768     } #for each start
769     if ($bests<=0) {
770     return ('',0,0);
771     }
772     $beste+=2 if $beste<=length($seq)-2; #make it so it includes the stop codon!
773     return (substr($seq,$bests-1,$beste-$bests+1), $bests, $beste);
774     }

Properties

Name Value
svn:executable *