ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2gaps.pl
Revision: 23
Committed: Tue Jul 26 21:44:38 2011 UTC (13 years, 2 months ago) by gpertea
Original Path: ann_bin/gff2gaps.pl
File size: 2089 byte(s)
Log Message:
adding misc scripts

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 my $usage=q/
4 gff2gaps.pl <gff_file> <gseq_w_gaps.fa>
5
6 It will output a fasta file with the same content in <gseq_w_gaps.fa>
7 but with exon nucleotides changed to E and introns to I
8 Only the first set of exons in the GFF file are considered..
9 /;
10
11 my $gffile=shift(@ARGV);
12 die("$usage\n") unless $gffile;
13 my @ex;
14 my @cd;
15 open(GFF, $gffile) || die ("Error opening $gffile!\n");
16 while (<GFF>) {
17 next if m/^#/;
18 my ($chr, $mt, $ftype, $fstart, $fend, $rest)=split(/\t/, $_, 6);
19 if ($ftype eq 'exon' || $ftype eq 'CDS') {
20 ($fstart, $fend)=($fend, $fstart) if ($fend<$fstart);
21
22 if ($ftype eq 'exon') { push(@ex, [$fstart, $fend]) }
23 else { push(@cd, [$fstart, $fend]) }
24 }
25 else {
26 last if @ex>0 || @cd>0; #assumes all exons/CDSs are grouped
27 }
28 }
29 close(GFF);
30 my @se = sort { $main::a->[0] <=> $main::b->[0] } @ex;
31 my @sc = sort { $main::a->[0] <=> $main::b->[0] } @cd;
32
33 #print STDERR scalar(@se)." exons found:\n";
34 #foreach my $e (@se) { print STDERR " $$e[0]..$$e[1] length: ".($$e[1]-$$e[0]+1)."\n"; }
35 #exit;
36 my $ei=0;
37 my $ci=0;
38 my $p=0;
39 my $ap=0;
40 my $isExon=0;
41 my $isCoding=0;
42 my $prevState=0;
43 while (<>) {
44 if (m/^>/) {
45 s/^>(\S+)/>$1\|refmsk/;
46 print $_;
47 next;
48 }
49 #sequence
50 tr/\n\r \t//d;
51 my $l=$_;
52 for (my $i=0;$i<length($l);$i++) {
53 my $c=substr($l,$i,1);
54 my $ich='.';
55 $ap++; #allignment position
56 goto IREST if ( $c eq '-');
57 $isExon=0;
58 $isCoding=0;
59 $p++;
60 goto IREST if ($ei >= @se); #beyond last exon
61 if (@sc>0) {
62 if ($p<=$sc[$ci]->[1]) { # before cur cdseg end
63 $isCoding=1 if ($p>=$sc[$ci]->[0]);
64 }
65 else { # beyond cur cdseg end
66 $ci++;
67 }
68 } # cds defined
69 if ($p<=$se[$ei]->[1]) { # before cur exon end
70 $isExon=1 if ($p>=$se[$ei]->[0]);
71 }
72 else { # beyond cur exon end
73 $ei++;
74 }
75 $ich=$c if ($ei>0 && ($p-$se[$ei-1]->[1]<3 || $se[$ei]->[0]-$p<3));
76 IREST:
77 if ($prevState != $isExon) {
78 my ($sep, $xp) = $prevState==1 ? ("\n",$ap-1):(' - ',$ap);
79 print STDERR $xp.$sep;
80 $prevState=$isExon;
81 }
82 substr($l, $i, 1)=$isExon ? ($isCoding?'C':'T') : $ich;
83 }
84 print $l."\n";
85 }

Properties

Name Value
svn:executable *