1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
my $usage = q/Usage: |
7 |
gff_get_introns.pl [-m <minlen>] <gff_data_stream..> |
8 |
Extract intron coordinates from given GFF data stream. |
9 |
Output this tabulated fomat: |
10 |
|
11 |
<chr><strand> <start> <end> <intron_id> <transcript_info> |
12 |
|
13 |
Options: |
14 |
-m minium intron length to extract (default 100000) |
15 |
/; |
16 |
umask 0002; |
17 |
getopts('m:o:') || die($usage."\n"); |
18 |
my $minlen=$Getopt::Std::opt_m || 100000; |
19 |
my $outfile=$Getopt::Std::opt_o; |
20 |
if ($outfile) { |
21 |
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n"); |
22 |
select(OUTF); |
23 |
} |
24 |
# --------------------- 0 1 2 3 4 5 6 7 |
25 |
my %recs; # recID => [ chr.strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ] |
26 |
# |
27 |
my $curtag; #chromosome and strand for current model |
28 |
my ($gname, $tdescr); |
29 |
my @exd; #exons for current model |
30 |
while (<>) { |
31 |
next if m/^\s*#/; |
32 |
chomp; |
33 |
my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
34 |
next unless $fstart>1 && $lnum; |
35 |
next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally |
36 |
my $gff3_ID; |
37 |
my $gff3_Parent; |
38 |
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
39 |
($gff3_ID)=($lnum=~m/\bID=([^;]+)/); |
40 |
($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/); |
41 |
if ($gff3_ID || $gff3_Parent) { # GFF format |
42 |
$gff3_ID=~tr/"//d; #" |
43 |
$gff3_Parent=~tr/"//d; #" |
44 |
$gff3_Parent='' if ($f eq 'mRNA'); |
45 |
if ($gff3_ID && !$gff3_Parent) { #top level feature |
46 |
if ($f=~m/RNA/i || $f=~/gene/) { |
47 |
# try to parse the description, if any |
48 |
$tdescr=''; |
49 |
$gname=''; |
50 |
if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) { |
51 |
$tdescr=$1; |
52 |
} |
53 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
54 |
$tdescr=$1; |
55 |
} |
56 |
if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) { |
57 |
$gname=$1; |
58 |
} |
59 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
60 |
$gname=$1; |
61 |
} |
62 |
$tdescr='' if ($tdescr eq $gname); |
63 |
$gname='' if $gname eq $gff3_ID; |
64 |
} |
65 |
die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"})); |
66 |
my $recID="$chr|$gff3_ID"; |
67 |
$curtag=$chr.$strand; |
68 |
$recs{$recID} = [$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [] ]; |
69 |
next; |
70 |
} # parent/top-level feature |
71 |
} #GFF |
72 |
else { #GTF format |
73 |
next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature |
74 |
} |
75 |
# -------------- exon/CDS line here: |
76 |
my $recID; |
77 |
if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) { |
78 |
$recID=$chr.'.jsm.'.$lnum; |
79 |
} |
80 |
elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) { |
81 |
$recID=$1; |
82 |
$recID=~tr/"//d; #" |
83 |
} |
84 |
elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) { |
85 |
$recID=$1; |
86 |
$recID=~tr/"//d; #" |
87 |
} |
88 |
else { |
89 |
die("Error: cannot parse locus/transcript name from input line:\n$_\n"); |
90 |
} |
91 |
if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) { |
92 |
$gname=$1; |
93 |
$gname=~tr/"//d; #" |
94 |
} |
95 |
$tdescr='' if index($recID, $tdescr)>=0; |
96 |
$gname='' if index($recID, $gname)>=0; |
97 |
$curtag=$chr.$strand; |
98 |
$recID=$chr.'|'.$recID; |
99 |
my $ld = $recs{$recID}; |
100 |
if ($ld) { #existing entry |
101 |
my $i=($f eq 'CDS') ? 7 : 6; |
102 |
my ($lstart, $lend)=($$ld[4], $$ld[5]); |
103 |
$$ld[4]=$fstart if $fstart<$lstart; |
104 |
$$ld[5]=$fend if $fend>$lend; |
105 |
push(@{$$ld[$i]}, [$fstart, $fend, $fscore]); |
106 |
} |
107 |
else { # first time seeing this locus/gene |
108 |
$recs{$recID} = ($f eq 'CDS') ? |
109 |
[$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] : |
110 |
[$curtag, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ; |
111 |
} |
112 |
} #while <> |
113 |
|
114 |
writeModels(); |
115 |
|
116 |
# -- |
117 |
if ($outfile) { |
118 |
select(STDOUT); |
119 |
close(OUTF); |
120 |
} |
121 |
|
122 |
#************ Subroutines ************** |
123 |
sub writeModels { |
124 |
return if keys(%recs)==0; |
125 |
while ( my ($l, $ld)=each(%recs) ) { |
126 |
# my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n"); |
127 |
my ($chrstrand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld; |
128 |
my ($mstart,$mend)=($lstart, $lend); |
129 |
my @ex; |
130 |
if (@$er<1 && @$cr>0) { |
131 |
@ex = sort { $main::a->[0] <=> $main::b->[0] } @$cr; |
132 |
} |
133 |
else { |
134 |
@ex = sort { $main::a->[0] <=> $main::b->[0] } @$er; |
135 |
} |
136 |
# -- now look at the introns |
137 |
next if @ex<2; |
138 |
my @in; #introns to be printed stored here |
139 |
#my $icount=0; #intron output counter |
140 |
($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); |
141 |
for (my $i=1;$i<@ex;$i++) { |
142 |
my ($istart, $iend)=($ex[$i-1]->[1]+1,$ex[$i]->[0]-1); |
143 |
next unless ($iend-$istart+1>=$minlen); |
144 |
push(@in, [$istart, $iend]); |
145 |
#$icount++; |
146 |
my $ann=""; |
147 |
$ann=" gene:$gname" if $gname; |
148 |
$ann.=" product:$descr" if $descr; |
149 |
print join("\t", $chrstrand, $istart, $iend, $l.".i$i", $ann)."\n"; |
150 |
} |
151 |
} #for each stored transcript |
152 |
} |