1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
use Getopt::Std; |
4 |
|
|
use FindBin;use lib $FindBin::Bin; |
5 |
|
|
|
6 |
|
|
my $usage = q/Usage: |
7 |
|
|
gff2bed [-c:<rgb_color>] <input.gtf> |
8 |
|
|
Convert GFF annotation to BED12 format |
9 |
|
|
/; |
10 |
|
|
umask 0002; |
11 |
|
|
getopts('c:o:') || die($usage."\n"); |
12 |
|
|
my $outfile=$Getopt::Std::opt_o; |
13 |
|
|
if ($outfile) { |
14 |
|
|
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n"); |
15 |
|
|
select(OUTF); |
16 |
|
|
} |
17 |
|
|
my $tcolor= $Getopt::Std::opt_c || 0; |
18 |
|
|
#$tcolor=hex($tcolor) if ($tcolor=~m/^0x/ || $tcolor=~m/[a-f,A-F]/); |
19 |
|
|
# --------------------- 0 1 2 3 4 5 6 7 8 |
20 |
|
|
my %recs; # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ] |
21 |
|
|
# |
22 |
|
|
my ($gname, $tdescr); |
23 |
|
|
while (<>) { |
24 |
|
|
next if m/^\s*#/; |
25 |
|
|
chomp; |
26 |
|
|
my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
27 |
|
|
next unless $fstart>1 && $lnum; |
28 |
|
|
next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally |
29 |
|
|
my $gff3_ID; |
30 |
|
|
my $gff3_Parent; |
31 |
|
|
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
32 |
|
|
($gff3_ID)=($lnum=~m/\bID=([^;]+)/); |
33 |
|
|
($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/); |
34 |
|
|
if ($gff3_ID || $gff3_Parent) { # GFF format |
35 |
|
|
$gff3_ID=~tr/"//d; #" |
36 |
|
|
$gff3_Parent=~tr/"//d; #" |
37 |
|
|
$gff3_Parent='' if ($f eq 'mRNA'); |
38 |
|
|
if ($gff3_ID && !$gff3_Parent) { #top level feature |
39 |
|
|
if ($f=~m/RNA/i || $f=~/gene/) { |
40 |
|
|
# try to parse the description, if any |
41 |
|
|
($gname,$tdescr)=(); |
42 |
|
|
if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) { |
43 |
|
|
$tdescr=$1; |
44 |
|
|
} |
45 |
|
|
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
46 |
|
|
$tdescr=$1; |
47 |
|
|
} |
48 |
|
|
if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) { |
49 |
|
|
$gname=$1; |
50 |
|
|
} |
51 |
|
|
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
52 |
|
|
$gname=$1; |
53 |
|
|
} |
54 |
|
|
$tdescr='' if ($tdescr eq $gname); |
55 |
|
|
$gname='' if $gname eq $gff3_ID; |
56 |
|
|
} |
57 |
|
|
die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"})); |
58 |
|
|
my $recID="$chr|$gff3_ID"; |
59 |
|
|
$recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [] ]; |
60 |
|
|
next; |
61 |
|
|
} # parent/top-level feature |
62 |
|
|
} #GFF |
63 |
|
|
else { #GTF format |
64 |
|
|
next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature |
65 |
|
|
} |
66 |
|
|
# -------------- exon/CDS line here: |
67 |
|
|
my $recID; |
68 |
|
|
($gname, $tdescr)=(); |
69 |
|
|
if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) { |
70 |
|
|
$recID=$chr.'.jsm.'.$lnum; |
71 |
|
|
} |
72 |
|
|
elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) { |
73 |
|
|
$recID=$1; |
74 |
|
|
$recID=~tr/"//d; #" |
75 |
|
|
} |
76 |
|
|
elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) { |
77 |
|
|
$recID=$1; |
78 |
|
|
$recID=~tr/"//d; #" |
79 |
|
|
} |
80 |
|
|
else { |
81 |
|
|
die("Error: cannot parse locus/transcript name from input line:\n$_\n"); |
82 |
|
|
} |
83 |
|
|
if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) { |
84 |
|
|
$gname=$1; |
85 |
|
|
$gname=~tr/"//d; #" |
86 |
|
|
} |
87 |
|
|
$tdescr='' if index($recID, $tdescr)>=0; |
88 |
|
|
$gname='' if index($recID, $gname)>=0; |
89 |
|
|
$recID=$chr.'|'.$recID; |
90 |
|
|
my $ld = $recs{$recID}; |
91 |
|
|
if ($ld) { #existing entry |
92 |
|
|
my $i=($f eq 'CDS') ? 8 : 7; |
93 |
|
|
my ($lstart, $lend)=($$ld[5], $$ld[6]); |
94 |
|
|
$$ld[5]=$fstart if $fstart<$lstart; |
95 |
|
|
$$ld[6]=$fend if $fend>$lend; |
96 |
|
|
push(@{$$ld[$i]}, [$fstart, $fend, $fscore]); |
97 |
|
|
} |
98 |
|
|
else { # first time seeing this locus/gene |
99 |
|
|
$recs{$recID} = ($f eq 'CDS') ? |
100 |
|
|
[$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] : |
101 |
|
|
[$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ; |
102 |
|
|
# 0 1 2 3 4 5 6 7 (exons) 8 (CDS) |
103 |
|
|
($gname,$tdescr)=(); |
104 |
|
|
} |
105 |
|
|
} #while <> |
106 |
|
|
|
107 |
|
|
|
108 |
|
|
# -- sort features by chromosome: |
109 |
|
|
print STDERR "GFF data loaded. Sorting by chromosome location..\n"; |
110 |
|
|
my @sorted_features=sort sortByLoc keys(%recs); |
111 |
|
|
print STDERR "Writing BED file..\n"; |
112 |
|
|
writeModels(\@sorted_features); |
113 |
|
|
print STDERR "Done.\n"; |
114 |
|
|
# -- |
115 |
|
|
if ($outfile) { |
116 |
|
|
select(STDOUT); |
117 |
|
|
close(OUTF); |
118 |
|
|
} |
119 |
|
|
|
120 |
|
|
#************ Subroutines ************** |
121 |
|
|
sub sortByLoc { |
122 |
|
|
my $da=$recs{$a}; |
123 |
|
|
my $db=$recs{$b}; |
124 |
|
|
if ($$da[0] eq $$db[0]) { |
125 |
|
|
return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ; |
126 |
|
|
} |
127 |
|
|
else { return $$da[0] cmp $$db[0] ; } |
128 |
|
|
} |
129 |
|
|
|
130 |
|
|
sub writeModels { |
131 |
|
|
#return if keys(%recs)==0; |
132 |
|
|
my $rlist=shift(@_); |
133 |
|
|
my @recs_keys; |
134 |
|
|
unless ($rlist) { |
135 |
|
|
@recs_keys=keys(%recs); |
136 |
|
|
$rlist=\@recs_keys; |
137 |
|
|
} |
138 |
|
|
|
139 |
|
|
foreach my $gffid (@$rlist) { |
140 |
|
|
my $ld=$recs{$gffid}; |
141 |
|
|
# my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n"); |
142 |
|
|
# 0 1 2 3 4 5 6 7 8 |
143 |
|
|
my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld; |
144 |
|
|
my ($mstart,$mend)=($lstart, $lend); |
145 |
|
|
my $CDexons=0; |
146 |
|
|
my @ex; |
147 |
|
|
my @cds; |
148 |
|
|
if (@$er<1 && @$cr>0) { |
149 |
|
|
@ex = sort { $a->[0] <=> $b->[0] } @$cr; |
150 |
|
|
$CDexons=1; |
151 |
|
|
} |
152 |
|
|
else { |
153 |
|
|
@ex = sort { $a->[0] <=> $b->[0] } @$er; |
154 |
|
|
if (@$cr>0) { |
155 |
|
|
@cds= sort { $a->[0] <=> $b->[0] } @$cr; |
156 |
|
|
} |
157 |
|
|
} |
158 |
|
|
($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); # get the more accurate version of the start-end coords for the feature |
159 |
|
|
my ($cds_start, $cds_end) = ($CDexons || @cds==0) ? ($mstart, $mend) : ($cds[0]->[0], $cds[-1]->[1]) ; |
160 |
|
|
my @bstarts; |
161 |
|
|
my @blens; |
162 |
|
|
foreach my $ed (@ex) { |
163 |
|
|
push(@bstarts, $$ed[0]-$mstart); |
164 |
|
|
push(@blens, $$ed[1]-$$ed[0]+1); |
165 |
|
|
} |
166 |
|
|
print join("\t", $chr, $mstart-1, $mend, "$gffid|$gname", 666, $strand, $cds_start-1, $cds_end, $tcolor, |
167 |
|
|
scalar(@bstarts), join(',', @blens), join(',', @bstarts))."\n"; |
168 |
|
|
} #for each stored transcript |
169 |
|
|
} |