ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2bed
Revision: 23
Committed: Tue Jul 26 21:44:38 2011 UTC (13 years, 1 month ago) by gpertea
Original Path: ann_bin/gff2bed
File size: 5703 byte(s)
Log Message:
adding misc scripts

Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     gff2bed [-c:<rgb_color>] <input.gtf>
8     Convert GFF annotation to BED12 format
9     /;
10     umask 0002;
11     getopts('c:o:') || die($usage."\n");
12     my $outfile=$Getopt::Std::opt_o;
13     if ($outfile) {
14     open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
15     select(OUTF);
16     }
17     my $tcolor= $Getopt::Std::opt_c || 0;
18     #$tcolor=hex($tcolor) if ($tcolor=~m/^0x/ || $tcolor=~m/[a-f,A-F]/);
19     # --------------------- 0 1 2 3 4 5 6 7 8
20     my %recs; # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
21     #
22     my ($gname, $tdescr);
23     while (<>) {
24     next if m/^\s*#/;
25     chomp;
26     my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
27     next unless $fstart>1 && $lnum;
28     next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
29     my $gff3_ID;
30     my $gff3_Parent;
31     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
32     ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
33     ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
34     if ($gff3_ID || $gff3_Parent) { # GFF format
35     $gff3_ID=~tr/"//d; #"
36     $gff3_Parent=~tr/"//d; #"
37     $gff3_Parent='' if ($f eq 'mRNA');
38     if ($gff3_ID && !$gff3_Parent) { #top level feature
39     if ($f=~m/RNA/i || $f=~/gene/) {
40     # try to parse the description, if any
41     ($gname,$tdescr)=();
42     if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
43     $tdescr=$1;
44     }
45     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
46     $tdescr=$1;
47     }
48     if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
49     $gname=$1;
50     }
51     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
52     $gname=$1;
53     }
54     $tdescr='' if ($tdescr eq $gname);
55     $gname='' if $gname eq $gff3_ID;
56     }
57     die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"}));
58     my $recID="$chr|$gff3_ID";
59     $recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [] ];
60     next;
61     } # parent/top-level feature
62     } #GFF
63     else { #GTF format
64     next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
65     }
66     # -------------- exon/CDS line here:
67     my $recID;
68     ($gname, $tdescr)=();
69     if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
70     $recID=$chr.'.jsm.'.$lnum;
71     }
72     elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
73     $recID=$1;
74     $recID=~tr/"//d; #"
75     }
76     elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
77     $recID=$1;
78     $recID=~tr/"//d; #"
79     }
80     else {
81     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
82     }
83     if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
84     $gname=$1;
85     $gname=~tr/"//d; #"
86     }
87     $tdescr='' if index($recID, $tdescr)>=0;
88     $gname='' if index($recID, $gname)>=0;
89     $recID=$chr.'|'.$recID;
90     my $ld = $recs{$recID};
91     if ($ld) { #existing entry
92     my $i=($f eq 'CDS') ? 8 : 7;
93     my ($lstart, $lend)=($$ld[5], $$ld[6]);
94     $$ld[5]=$fstart if $fstart<$lstart;
95     $$ld[6]=$fend if $fend>$lend;
96     push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
97     }
98     else { # first time seeing this locus/gene
99     $recs{$recID} = ($f eq 'CDS') ?
100     [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] :
101     [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ;
102     # 0 1 2 3 4 5 6 7 (exons) 8 (CDS)
103     ($gname,$tdescr)=();
104     }
105     } #while <>
106    
107    
108     # -- sort features by chromosome:
109     print STDERR "GFF data loaded. Sorting by chromosome location..\n";
110     my @sorted_features=sort sortByLoc keys(%recs);
111     print STDERR "Writing BED file..\n";
112     writeModels(\@sorted_features);
113     print STDERR "Done.\n";
114     # --
115     if ($outfile) {
116     select(STDOUT);
117     close(OUTF);
118     }
119    
120     #************ Subroutines **************
121     sub sortByLoc {
122     my $da=$recs{$a};
123     my $db=$recs{$b};
124     if ($$da[0] eq $$db[0]) {
125     return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ;
126     }
127     else { return $$da[0] cmp $$db[0] ; }
128     }
129    
130     sub writeModels {
131     #return if keys(%recs)==0;
132     my $rlist=shift(@_);
133     my @recs_keys;
134     unless ($rlist) {
135     @recs_keys=keys(%recs);
136     $rlist=\@recs_keys;
137     }
138    
139     foreach my $gffid (@$rlist) {
140     my $ld=$recs{$gffid};
141     # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
142     # 0 1 2 3 4 5 6 7 8
143     my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld;
144     my ($mstart,$mend)=($lstart, $lend);
145     my $CDexons=0;
146     my @ex;
147     my @cds;
148     if (@$er<1 && @$cr>0) {
149     @ex = sort { $a->[0] <=> $b->[0] } @$cr;
150     $CDexons=1;
151     }
152     else {
153     @ex = sort { $a->[0] <=> $b->[0] } @$er;
154     if (@$cr>0) {
155     @cds= sort { $a->[0] <=> $b->[0] } @$cr;
156     }
157     }
158     ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); # get the more accurate version of the start-end coords for the feature
159     my ($cds_start, $cds_end) = ($CDexons || @cds==0) ? ($mstart, $mend) : ($cds[0]->[0], $cds[-1]->[1]) ;
160     my @bstarts;
161     my @blens;
162     foreach my $ed (@ex) {
163     push(@bstarts, $$ed[0]-$mstart);
164     push(@blens, $$ed[1]-$$ed[0]+1);
165     }
166     print join("\t", $chr, $mstart-1, $mend, "$gffid|$gname", 666, $strand, $cds_start-1, $cds_end, $tcolor,
167     scalar(@bstarts), join(',', @blens), join(',', @bstarts))."\n";
168     } #for each stored transcript
169     }

Properties

Name Value
svn:executable *