1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
my $usage=q/ |
4 |
gff2gaps.pl <gff_file> <gseq_w_gaps.fa> |
5 |
|
6 |
It will output a fasta file with the same content in <gseq_w_gaps.fa> |
7 |
but with exon nucleotides changed to E and introns to I |
8 |
Only the first set of exons in the GFF file are considered.. |
9 |
/; |
10 |
|
11 |
my $gffile=shift(@ARGV); |
12 |
die("$usage\n") unless $gffile; |
13 |
my @ex; |
14 |
my @cd; |
15 |
open(GFF, $gffile) || die ("Error opening $gffile!\n"); |
16 |
while (<GFF>) { |
17 |
next if m/^#/; |
18 |
my ($chr, $mt, $ftype, $fstart, $fend, $rest)=split(/\t/, $_, 6); |
19 |
if ($ftype eq 'exon' || $ftype eq 'CDS') { |
20 |
($fstart, $fend)=($fend, $fstart) if ($fend<$fstart); |
21 |
|
22 |
if ($ftype eq 'exon') { push(@ex, [$fstart, $fend]) } |
23 |
else { push(@cd, [$fstart, $fend]) } |
24 |
} |
25 |
else { |
26 |
last if @ex>0 || @cd>0; #assumes all exons/CDSs are grouped |
27 |
} |
28 |
} |
29 |
close(GFF); |
30 |
my @se = sort { $main::a->[0] <=> $main::b->[0] } @ex; |
31 |
my @sc = sort { $main::a->[0] <=> $main::b->[0] } @cd; |
32 |
|
33 |
#print STDERR scalar(@se)." exons found:\n"; |
34 |
#foreach my $e (@se) { print STDERR " $$e[0]..$$e[1] length: ".($$e[1]-$$e[0]+1)."\n"; } |
35 |
#exit; |
36 |
my $ei=0; |
37 |
my $ci=0; |
38 |
my $p=0; |
39 |
my $ap=0; |
40 |
my $isExon=0; |
41 |
my $isCoding=0; |
42 |
my $prevState=0; |
43 |
while (<>) { |
44 |
if (m/^>/) { |
45 |
s/^>(\S+)/>$1\|refmsk/; |
46 |
print $_; |
47 |
next; |
48 |
} |
49 |
#sequence |
50 |
tr/\n\r \t//d; |
51 |
my $l=$_; |
52 |
for (my $i=0;$i<length($l);$i++) { |
53 |
my $c=substr($l,$i,1); |
54 |
my $ich='.'; |
55 |
$ap++; #allignment position |
56 |
goto IREST if ( $c eq '-'); |
57 |
$isExon=0; |
58 |
$isCoding=0; |
59 |
$p++; |
60 |
goto IREST if ($ei >= @se); #beyond last exon |
61 |
if (@sc>0) { |
62 |
if ($p<=$sc[$ci]->[1]) { # before cur cdseg end |
63 |
$isCoding=1 if ($p>=$sc[$ci]->[0]); |
64 |
} |
65 |
else { # beyond cur cdseg end |
66 |
$ci++; |
67 |
} |
68 |
} # cds defined |
69 |
if ($p<=$se[$ei]->[1]) { # before cur exon end |
70 |
$isExon=1 if ($p>=$se[$ei]->[0]); |
71 |
} |
72 |
else { # beyond cur exon end |
73 |
$ei++; |
74 |
} |
75 |
$ich=$c if ($ei>0 && ($p-$se[$ei-1]->[1]<3 || $se[$ei]->[0]-$p<3)); |
76 |
IREST: |
77 |
if ($prevState != $isExon) { |
78 |
my ($sep, $xp) = $prevState==1 ? ("\n",$ap-1):(' - ',$ap); |
79 |
print STDERR $xp.$sep; |
80 |
$prevState=$isExon; |
81 |
} |
82 |
substr($l, $i, 1)=$isExon ? ($isCoding?'C':'T') : $ich; |
83 |
} |
84 |
print $l."\n"; |
85 |
} |