1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
my $usage=q/ |
4 |
|
|
gff2gaps.pl <gff_file> <gseq_w_gaps.fa> |
5 |
|
|
|
6 |
|
|
It will output a fasta file with the same content in <gseq_w_gaps.fa> |
7 |
|
|
but with exon nucleotides changed to E and introns to I |
8 |
|
|
Only the first set of exons in the GFF file are considered.. |
9 |
|
|
/; |
10 |
|
|
|
11 |
|
|
my $gffile=shift(@ARGV); |
12 |
|
|
die("$usage\n") unless $gffile; |
13 |
|
|
my @ex; |
14 |
|
|
my @cd; |
15 |
|
|
open(GFF, $gffile) || die ("Error opening $gffile!\n"); |
16 |
|
|
while (<GFF>) { |
17 |
|
|
next if m/^#/; |
18 |
|
|
my ($chr, $mt, $ftype, $fstart, $fend, $rest)=split(/\t/, $_, 6); |
19 |
|
|
if ($ftype eq 'exon' || $ftype eq 'CDS') { |
20 |
|
|
($fstart, $fend)=($fend, $fstart) if ($fend<$fstart); |
21 |
|
|
|
22 |
|
|
if ($ftype eq 'exon') { push(@ex, [$fstart, $fend]) } |
23 |
|
|
else { push(@cd, [$fstart, $fend]) } |
24 |
|
|
} |
25 |
|
|
else { |
26 |
|
|
last if @ex>0 || @cd>0; #assumes all exons/CDSs are grouped |
27 |
|
|
} |
28 |
|
|
} |
29 |
|
|
close(GFF); |
30 |
|
|
my @se = sort { $main::a->[0] <=> $main::b->[0] } @ex; |
31 |
|
|
my @sc = sort { $main::a->[0] <=> $main::b->[0] } @cd; |
32 |
|
|
|
33 |
|
|
#print STDERR scalar(@se)." exons found:\n"; |
34 |
|
|
#foreach my $e (@se) { print STDERR " $$e[0]..$$e[1] length: ".($$e[1]-$$e[0]+1)."\n"; } |
35 |
|
|
#exit; |
36 |
|
|
my $ei=0; |
37 |
|
|
my $ci=0; |
38 |
|
|
my $p=0; |
39 |
|
|
my $ap=0; |
40 |
|
|
my $isExon=0; |
41 |
|
|
my $isCoding=0; |
42 |
|
|
my $prevState=0; |
43 |
|
|
while (<>) { |
44 |
|
|
if (m/^>/) { |
45 |
|
|
s/^>(\S+)/>$1\|refmsk/; |
46 |
|
|
print $_; |
47 |
|
|
next; |
48 |
|
|
} |
49 |
|
|
#sequence |
50 |
|
|
tr/\n\r \t//d; |
51 |
|
|
my $l=$_; |
52 |
|
|
for (my $i=0;$i<length($l);$i++) { |
53 |
|
|
my $c=substr($l,$i,1); |
54 |
|
|
my $ich='.'; |
55 |
|
|
$ap++; #allignment position |
56 |
|
|
goto IREST if ( $c eq '-'); |
57 |
|
|
$isExon=0; |
58 |
|
|
$isCoding=0; |
59 |
|
|
$p++; |
60 |
|
|
goto IREST if ($ei >= @se); #beyond last exon |
61 |
|
|
if (@sc>0) { |
62 |
|
|
if ($p<=$sc[$ci]->[1]) { # before cur cdseg end |
63 |
|
|
$isCoding=1 if ($p>=$sc[$ci]->[0]); |
64 |
|
|
} |
65 |
|
|
else { # beyond cur cdseg end |
66 |
|
|
$ci++; |
67 |
|
|
} |
68 |
|
|
} # cds defined |
69 |
|
|
if ($p<=$se[$ei]->[1]) { # before cur exon end |
70 |
|
|
$isExon=1 if ($p>=$se[$ei]->[0]); |
71 |
|
|
} |
72 |
|
|
else { # beyond cur exon end |
73 |
|
|
$ei++; |
74 |
|
|
} |
75 |
|
|
$ich=$c if ($ei>0 && ($p-$se[$ei-1]->[1]<3 || $se[$ei]->[0]-$p<3)); |
76 |
|
|
IREST: |
77 |
|
|
if ($prevState != $isExon) { |
78 |
|
|
my ($sep, $xp) = $prevState==1 ? ("\n",$ap-1):(' - ',$ap); |
79 |
|
|
print STDERR $xp.$sep; |
80 |
|
|
$prevState=$isExon; |
81 |
|
|
} |
82 |
|
|
substr($l, $i, 1)=$isExon ? ($isCoding?'C':'T') : $ich; |
83 |
|
|
} |
84 |
|
|
print $l."\n"; |
85 |
|
|
} |