gclib/scripts/gff_get_introns.pl

#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

my $usage = q/Usage:
 gff_get_introns.pl [-m <minlen>] <gff_data_stream..>
 Extract intron coordinates from given GFF data stream.
 Output this tabulated fomat:
 
 <chr><strand> <start> <end> <intron_id> <transcript_info>
 
 Options:
  -m minium intron length to extract (default 100000)
/;
umask 0002;
getopts('m:o:') || die($usage."\n");
my $minlen=$Getopt::Std::opt_m || 100000;
my $outfile=$Getopt::Std::opt_o;
if ($outfile) {
  open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
  select(OUTF);
  }
# ---------------------      0         1          2      3        4      5       6       7
my %recs; # recID =>  [ chr.strand, feat_type,  gname, tdescr, fstart, fend, [@exons], [@cds] ]
             #   
my $curtag; #chromosome and strand for current model
my ($gname, $tdescr);
my @exd; #exons for current model
while (<>) {
   next if m/^\s*#/;
   chomp;
   my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
   next unless $fstart>1 && $lnum;
   next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally
   my $gff3_ID;
   my $gff3_Parent;
   ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
   ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
   ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
   if ($gff3_ID || $gff3_Parent) { # GFF format
      $gff3_ID=~tr/"//d; #"
      $gff3_Parent=~tr/"//d; #"
      $gff3_Parent='' if ($f eq 'mRNA');
      if ($gff3_ID && !$gff3_Parent) { #top level feature
         if ($f=~m/RNA/i || $f=~/gene/) {
           # try to parse the description, if any
           $tdescr='';
           $gname='';
           if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
             $tdescr=$1;
             }
            elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
             $tdescr=$1;
             }
           if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
             $gname=$1;
             }
            elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
             $gname=$1;
             }
           $tdescr='' if ($tdescr eq $gname);
           $gname='' if $gname eq $gff3_ID;
           }
         die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"}));
         my $recID="$chr|$gff3_ID";
         $curtag=$chr.$strand;
         $recs{$recID} = [$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [] ];
         next;
         } # parent/top-level feature
      } #GFF
     else { #GTF format
      next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
      }
   # -------------- exon/CDS line here:
   my $recID;
   if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
     $recID=$chr.'.jsm.'.$lnum;
     }
    elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
     $recID=$1;
     $recID=~tr/"//d; #"
     }
    elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
     $recID=$1;
     $recID=~tr/"//d; #"
     }
    else {
     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
     }
   if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
      $gname=$1;
      $gname=~tr/"//d; #"
      }
   $tdescr='' if index($recID, $tdescr)>=0;
   $gname='' if index($recID, $gname)>=0;
   $curtag=$chr.$strand;
   $recID=$chr.'|'.$recID;
   my $ld = $recs{$recID};
   if ($ld) { #existing entry
     my $i=($f eq 'CDS') ? 7 : 6;
     my ($lstart, $lend)=($$ld[4], $$ld[5]);
     $$ld[4]=$fstart if $fstart<$lstart;
     $$ld[5]=$fend if $fend>$lend;
     push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
     }
    else { # first time seeing this locus/gene
     $recs{$recID} = ($f eq 'CDS') ? 
           [$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] :
           [$curtag, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ;
     }
} #while <>

writeModels();

# --
if ($outfile) {
 select(STDOUT);
 close(OUTF);
 }

#************ Subroutines **************
sub writeModels {
 return if keys(%recs)==0;
 while ( my ($l, $ld)=each(%recs) ) {
   # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
   my ($chrstrand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld;
   my ($mstart,$mend)=($lstart, $lend);
   my @ex;
   if (@$er<1 && @$cr>0) { 
     @ex = sort { $main::a->[0] <=> $main::b->[0] } @$cr;
     }
    else {
     @ex = sort { $main::a->[0] <=> $main::b->[0] } @$er;
     }
   # -- now look at the introns
   next if @ex<2;
   my @in; #introns to be printed stored here
   #my $icount=0; #intron output counter
   ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
   for (my $i=1;$i<@ex;$i++) {
     my ($istart, $iend)=($ex[$i-1]->[1]+1,$ex[$i]->[0]-1);
     next unless ($iend-$istart+1>=$minlen);
     push(@in, [$istart, $iend]);
     #$icount++;
     my $ann="";
     $ann=" gene:$gname" if $gname;
     $ann.=" product:$descr" if $descr;
     print join("\t", $chrstrand, $istart, $iend, $l.".i$i", $ann)."\n";
     }
  } #for each stored transcript
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size:	5039 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	use FindBin;use lib $FindBin::Bin;
5
6	my $usage = q/Usage:
7	gff_get_introns.pl [-m <minlen>] <gff_data_stream..>
8	Extract intron coordinates from given GFF data stream.
9	Output this tabulated fomat:
10
11	<chr><strand> <start> <end> <intron_id> <transcript_info>
12
13	Options:
14	-m minium intron length to extract (default 100000)
15	/;
16	umask 0002;
17	getopts('m:o:') \|\| die($usage."\n");
18	my $minlen=$Getopt::Std::opt_m \|\| 100000;
19	my $outfile=$Getopt::Std::opt_o;
20	if ($outfile) {
21	open(OUTF, '>'.$outfile) \|\| die("Error creating output file $outfile\n");
22	select(OUTF);
23	}
24	# --------------------- 0 1 2 3 4 5 6 7
25	my %recs; # recID => [ chr.strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
26	#
27	my $curtag; #chromosome and strand for current model
28	my ($gname, $tdescr);
29	my @exd; #exons for current model
30	while (<>) {
31	next if m/^\s*#/;
32	chomp;
33	my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
34	next unless $fstart>1 && $lnum;
35	next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally
36	my $gff3_ID;
37	my $gff3_Parent;
38	($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
39	($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
40	($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
41	if ($gff3_ID \|\| $gff3_Parent) { # GFF format
42	$gff3_ID=~tr/"//d; #"
43	$gff3_Parent=~tr/"//d; #"
44	$gff3_Parent='' if ($f eq 'mRNA');
45	if ($gff3_ID && !$gff3_Parent) { #top level feature
46	if ($f=~m/RNA/i \|\| $f=~/gene/) {
47	# try to parse the description, if any
48	$tdescr='';
49	$gname='';
50	if ($lnum=~m/\b(?:descr\|tophit\|info\|product)\s=\s"?([^;"]+)/i) {
51	$tdescr=$1;
52	}
53	elsif ($lnum=~m/Name\s=\s"?([^;"]+)/) {
54	$tdescr=$1;
55	}
56	if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
57	$gname=$1;
58	}
59	elsif ($lnum=~m/Name\s=\s"?([^;"]+)/) {
60	$gname=$1;
61	}
62	$tdescr='' if ($tdescr eq $gname);
63	$gname='' if $gname eq $gff3_ID;
64	}
65	die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr\|$gff3_ID"}));
66	my $recID="$chr\|$gff3_ID";
67	$curtag=$chr.$strand;
68	$recs{$recID} = [$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [] ];
69	next;
70	} # parent/top-level feature
71	} #GFF
72	else { #GTF format
73	next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
74	}
75	# -------------- exon/CDS line here:
76	my $recID;
77	if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
78	$recID=$chr.'.jsm.'.$lnum;
79	}
80	elsif ($lnum=~m/Parent=(['"\:\w\\|\-\.]+)/) {
81	$recID=$1;
82	$recID=~tr/"//d; #"
83	}
84	elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\\|\-]+)/) {
85	$recID=$1;
86	$recID=~tr/"//d; #"
87	}
88	else {
89	die("Error: cannot parse locus/transcript name from input line:\n$_\n");
90	}
91	if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\\|\-]+)/) {
92	$gname=$1;
93	$gname=~tr/"//d; #"
94	}
95	$tdescr='' if index($recID, $tdescr)>=0;
96	$gname='' if index($recID, $gname)>=0;
97	$curtag=$chr.$strand;
98	$recID=$chr.'\|'.$recID;
99	my $ld = $recs{$recID};
100	if ($ld) { #existing entry
101	my $i=($f eq 'CDS') ? 7 : 6;
102	my ($lstart, $lend)=($$ld[4], $$ld[5]);
103	$$ld[4]=$fstart if $fstart<$lstart;
104	$$ld[5]=$fend if $fend>$lend;
105	push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
106	}
107	else { # first time seeing this locus/gene
108	$recs{$recID} = ($f eq 'CDS') ?
109	[$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] :
110	[$curtag, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ;
111	}
112	} #while <>
113
114	writeModels();
115
116	# --
117	if ($outfile) {
118	select(STDOUT);
119	close(OUTF);
120	}
121
122	#********** Subroutines ************
123	sub writeModels {
124	return if keys(%recs)==0;
125	while ( my ($l, $ld)=each(%recs) ) {
126	# my $ld=$recs{$l} \|\| die ("Error: locus $l found in list but not in hash!\n");
127	my ($chrstrand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld;
128	my ($mstart,$mend)=($lstart, $lend);
129	my @ex;
130	if (@$er<1 && @$cr>0) {
131	@ex = sort { $main::a->[0] <=> $main::b->[0] } @$cr;
132	}
133	else {
134	@ex = sort { $main::a->[0] <=> $main::b->[0] } @$er;
135	}
136	# -- now look at the introns
137	next if @ex<2;
138	my @in; #introns to be printed stored here
139	#my $icount=0; #intron output counter
140	($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
141	for (my $i=1;$i<@ex;$i++) {
142	my ($istart, $iend)=($ex[$i-1]->[1]+1,$ex[$i]->[0]-1);
143	next unless ($iend-$istart+1>=$minlen);
144	push(@in, [$istart, $iend]);
145	#$icount++;
146	my $ann="";
147	$ann=" gene:$gname" if $gname;
148	$ann.=" product:$descr" if $descr;
149	print join("\t", $chrstrand, $istart, $iend, $l.".i$i", $ann)."\n";
150	}
151	} #for each stored transcript
152	}