ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff_get_introns.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 5039 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 gff_get_introns.pl [-m <minlen>] <gff_data_stream..>
8 Extract intron coordinates from given GFF data stream.
9 Output this tabulated fomat:
10
11 <chr><strand> <start> <end> <intron_id> <transcript_info>
12
13 Options:
14 -m minium intron length to extract (default 100000)
15 /;
16 umask 0002;
17 getopts('m:o:') || die($usage."\n");
18 my $minlen=$Getopt::Std::opt_m || 100000;
19 my $outfile=$Getopt::Std::opt_o;
20 if ($outfile) {
21 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
22 select(OUTF);
23 }
24 # --------------------- 0 1 2 3 4 5 6 7
25 my %recs; # recID => [ chr.strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
26 #
27 my $curtag; #chromosome and strand for current model
28 my ($gname, $tdescr);
29 my @exd; #exons for current model
30 while (<>) {
31 next if m/^\s*#/;
32 chomp;
33 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
34 next unless $fstart>1 && $lnum;
35 next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally
36 my $gff3_ID;
37 my $gff3_Parent;
38 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
39 ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
40 ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
41 if ($gff3_ID || $gff3_Parent) { # GFF format
42 $gff3_ID=~tr/"//d; #"
43 $gff3_Parent=~tr/"//d; #"
44 $gff3_Parent='' if ($f eq 'mRNA');
45 if ($gff3_ID && !$gff3_Parent) { #top level feature
46 if ($f=~m/RNA/i || $f=~/gene/) {
47 # try to parse the description, if any
48 $tdescr='';
49 $gname='';
50 if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
51 $tdescr=$1;
52 }
53 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
54 $tdescr=$1;
55 }
56 if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
57 $gname=$1;
58 }
59 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
60 $gname=$1;
61 }
62 $tdescr='' if ($tdescr eq $gname);
63 $gname='' if $gname eq $gff3_ID;
64 }
65 die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"}));
66 my $recID="$chr|$gff3_ID";
67 $curtag=$chr.$strand;
68 $recs{$recID} = [$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [] ];
69 next;
70 } # parent/top-level feature
71 } #GFF
72 else { #GTF format
73 next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
74 }
75 # -------------- exon/CDS line here:
76 my $recID;
77 if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
78 $recID=$chr.'.jsm.'.$lnum;
79 }
80 elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
81 $recID=$1;
82 $recID=~tr/"//d; #"
83 }
84 elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
85 $recID=$1;
86 $recID=~tr/"//d; #"
87 }
88 else {
89 die("Error: cannot parse locus/transcript name from input line:\n$_\n");
90 }
91 if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
92 $gname=$1;
93 $gname=~tr/"//d; #"
94 }
95 $tdescr='' if index($recID, $tdescr)>=0;
96 $gname='' if index($recID, $gname)>=0;
97 $curtag=$chr.$strand;
98 $recID=$chr.'|'.$recID;
99 my $ld = $recs{$recID};
100 if ($ld) { #existing entry
101 my $i=($f eq 'CDS') ? 7 : 6;
102 my ($lstart, $lend)=($$ld[4], $$ld[5]);
103 $$ld[4]=$fstart if $fstart<$lstart;
104 $$ld[5]=$fend if $fend>$lend;
105 push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
106 }
107 else { # first time seeing this locus/gene
108 $recs{$recID} = ($f eq 'CDS') ?
109 [$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] :
110 [$curtag, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ;
111 }
112 } #while <>
113
114 writeModels();
115
116 # --
117 if ($outfile) {
118 select(STDOUT);
119 close(OUTF);
120 }
121
122 #************ Subroutines **************
123 sub writeModels {
124 return if keys(%recs)==0;
125 while ( my ($l, $ld)=each(%recs) ) {
126 # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
127 my ($chrstrand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld;
128 my ($mstart,$mend)=($lstart, $lend);
129 my @ex;
130 if (@$er<1 && @$cr>0) {
131 @ex = sort { $main::a->[0] <=> $main::b->[0] } @$cr;
132 }
133 else {
134 @ex = sort { $main::a->[0] <=> $main::b->[0] } @$er;
135 }
136 # -- now look at the introns
137 next if @ex<2;
138 my @in; #introns to be printed stored here
139 #my $icount=0; #intron output counter
140 ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
141 for (my $i=1;$i<@ex;$i++) {
142 my ($istart, $iend)=($ex[$i-1]->[1]+1,$ex[$i]->[0]-1);
143 next unless ($iend-$istart+1>=$minlen);
144 push(@in, [$istart, $iend]);
145 #$icount++;
146 my $ann="";
147 $ann=" gene:$gname" if $gname;
148 $ann.=" product:$descr" if $descr;
149 print join("\t", $chrstrand, $istart, $iend, $l.".i$i", $ann)."\n";
150 }
151 } #for each stored transcript
152 }

Properties

Name Value
svn:executable *