ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2gaps.pl
Revision: 23
Committed: Tue Jul 26 21:44:38 2011 UTC (13 years, 2 months ago) by gpertea
Original Path: ann_bin/gff2gaps.pl
File size: 2089 byte(s)
Log Message:
adding misc scripts

Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     my $usage=q/
4     gff2gaps.pl <gff_file> <gseq_w_gaps.fa>
5    
6     It will output a fasta file with the same content in <gseq_w_gaps.fa>
7     but with exon nucleotides changed to E and introns to I
8     Only the first set of exons in the GFF file are considered..
9     /;
10    
11     my $gffile=shift(@ARGV);
12     die("$usage\n") unless $gffile;
13     my @ex;
14     my @cd;
15     open(GFF, $gffile) || die ("Error opening $gffile!\n");
16     while (<GFF>) {
17     next if m/^#/;
18     my ($chr, $mt, $ftype, $fstart, $fend, $rest)=split(/\t/, $_, 6);
19     if ($ftype eq 'exon' || $ftype eq 'CDS') {
20     ($fstart, $fend)=($fend, $fstart) if ($fend<$fstart);
21    
22     if ($ftype eq 'exon') { push(@ex, [$fstart, $fend]) }
23     else { push(@cd, [$fstart, $fend]) }
24     }
25     else {
26     last if @ex>0 || @cd>0; #assumes all exons/CDSs are grouped
27     }
28     }
29     close(GFF);
30     my @se = sort { $main::a->[0] <=> $main::b->[0] } @ex;
31     my @sc = sort { $main::a->[0] <=> $main::b->[0] } @cd;
32    
33     #print STDERR scalar(@se)." exons found:\n";
34     #foreach my $e (@se) { print STDERR " $$e[0]..$$e[1] length: ".($$e[1]-$$e[0]+1)."\n"; }
35     #exit;
36     my $ei=0;
37     my $ci=0;
38     my $p=0;
39     my $ap=0;
40     my $isExon=0;
41     my $isCoding=0;
42     my $prevState=0;
43     while (<>) {
44     if (m/^>/) {
45     s/^>(\S+)/>$1\|refmsk/;
46     print $_;
47     next;
48     }
49     #sequence
50     tr/\n\r \t//d;
51     my $l=$_;
52     for (my $i=0;$i<length($l);$i++) {
53     my $c=substr($l,$i,1);
54     my $ich='.';
55     $ap++; #allignment position
56     goto IREST if ( $c eq '-');
57     $isExon=0;
58     $isCoding=0;
59     $p++;
60     goto IREST if ($ei >= @se); #beyond last exon
61     if (@sc>0) {
62     if ($p<=$sc[$ci]->[1]) { # before cur cdseg end
63     $isCoding=1 if ($p>=$sc[$ci]->[0]);
64     }
65     else { # beyond cur cdseg end
66     $ci++;
67     }
68     } # cds defined
69     if ($p<=$se[$ei]->[1]) { # before cur exon end
70     $isExon=1 if ($p>=$se[$ei]->[0]);
71     }
72     else { # beyond cur exon end
73     $ei++;
74     }
75     $ich=$c if ($ei>0 && ($p-$se[$ei-1]->[1]<3 || $se[$ei]->[0]-$p<3));
76     IREST:
77     if ($prevState != $isExon) {
78     my ($sep, $xp) = $prevState==1 ? ("\n",$ap-1):(' - ',$ap);
79     print STDERR $xp.$sep;
80     $prevState=$isExon;
81     }
82     substr($l, $i, 1)=$isExon ? ($isCoding?'C':'T') : $ich;
83     }
84     print $l."\n";
85     }

Properties

Name Value
svn:executable *