1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
my $usage = q/Usage: |
7 |
tbl2gff [-F] [-g <chr_table>] [-t <track>] -p <prot2nuc.tab> <ncbi_tbl_file> |
8 |
|
9 |
Convert the given NCBI tbl file to GFF3 |
10 |
|
11 |
Options: |
12 |
-p protein_id to transcript_id mapping file (required) |
13 |
this file has 2 columns: |
14 |
GenPept accession and (nucleotide) accession |
15 |
it is required in order to assign CDS to their |
16 |
corresponding mRNA exons |
17 |
-t use given track name instead of 'gb_tbl' |
18 |
-g replace the genomic sequence names according |
19 |
to the translation table <chr_table> |
20 |
(only the first 2 columns in that file are used) |
21 |
-M only outputs mRNA entries (without gene lines) |
22 |
-F preserve all notes and comments |
23 |
/; |
24 |
umask 0002; |
25 |
my %kattrs; |
26 |
#key attributes to always keep |
27 |
@kattrs{'gene', 'product', 'transcript_id'}=(); |
28 |
getopts('FMt:g:p:') || die($usage."\n"); |
29 |
my $gxfile=$Getopt::Std::opt_g; |
30 |
my $p2nfile=$Getopt::Std::opt_p; |
31 |
my $mrnaonly=$Getopt::Std::opt_M; |
32 |
die("Error: -p is required -- to map protein_id to transcript_id\n") unless $p2nfile && -f $p2nfile; |
33 |
my %pmap; # protein_id => transcript_id |
34 |
open(PROTMAP, $p2nfile) || die("Error opening $p2nfile!\n"); |
35 |
while (<PROTMAP>) { |
36 |
chomp; |
37 |
my ($p, $n)=split; |
38 |
next unless $n; |
39 |
$p=~s/\.\d+$//; |
40 |
$n=~s/\.\d+$//; # discard versioning |
41 |
$pmap{$p}=$n; |
42 |
} |
43 |
close(PROTMAP); |
44 |
|
45 |
my $track=$Getopt::Std::opt_t || 'gb_tbl'; |
46 |
my $allattrs=$Getopt::Std::opt_F; |
47 |
my $last_line; |
48 |
my %gx; # translation table for genomic sequence names ('>Feature' line) |
49 |
if ($gxfile) { |
50 |
open(GX, $gxfile) || die("Error opening $gxfile\n"); |
51 |
while(<GX>) { |
52 |
chomp; |
53 |
my @t=split; |
54 |
if ($t[0] && $t[1]) { |
55 |
$gx{$t[0]}=$t[1]; |
56 |
$gx{$t[1]}=$t[0]; |
57 |
} |
58 |
} |
59 |
close(GX); |
60 |
} |
61 |
|
62 |
my $tblfile=$ARGV[0]; |
63 |
die($usage."\n") if $tblfile eq '-h' || ! -f $tblfile; |
64 |
|
65 |
my %gids; #ensure unique gene IDs |
66 |
#allow for duplication $gids{$gene}=[ [gstart1, gend1], [gstart2, gend2] ] |
67 |
my %tids; # and transcript IDs |
68 |
|
69 |
# -- per gbase: |
70 |
|
71 |
my %genes; # genes{$gene}=[$gbase, $strand, $gstart, $gend, [@gxrefs], \%gattrs, \%rnas, isPseudo] |
72 |
# 0 1 2 3 4 5 6 7 |
73 |
#my %rnas; |
74 |
# %rnas hash holds all mRNA data for a single gene (i.e. all isoforms) |
75 |
# 0 1 2 3 4 5 6 |
76 |
# $rnas{transcript_id}=[ $featname, [@exon], [@cds], \%tattrs, [@txrefs], $partial3, $partial5 ] |
77 |
#current values: |
78 |
my ($gbase, $curfeat, @exon, %cattrs, $gpseudo, $gstart, $gend, $partial5, $partial3); |
79 |
my $strand='+'; |
80 |
|
81 |
# product and transcript_id are found in %tattrs |
82 |
while (<>) { |
83 |
$last_line=$_; |
84 |
if (m/^>Feature\s+(\S+)/) { # misnomer - in fact this is the genomic sequence name |
85 |
my $newbase=$1; |
86 |
flushGSeq(); #write previous gene feature |
87 |
$gbase=''; |
88 |
if ($gxfile) { |
89 |
$gbase=$gx{$newbase}; |
90 |
unless ($gbase) { |
91 |
if ($newbase=~m/^[a-z]+\|([\w\.]+)/) { |
92 |
$newbase=$1; |
93 |
$gbase=$gx{$newbase}; |
94 |
unless ($gbase) { |
95 |
$newbase=~s/\.\d+$//; |
96 |
$gbase=$gx{$newbase}; |
97 |
} |
98 |
} |
99 |
} |
100 |
} #convert contig name |
101 |
$gbase=$newbase unless $gbase; |
102 |
next; |
103 |
} # base genomic sequence (contig/chromosome) established |
104 |
chomp; |
105 |
tr/\n\r//d; |
106 |
my @t=split(/\t/); |
107 |
my ($start, $end)=@t[0,1]; |
108 |
if ($start && $end) { # segment coordinates |
109 |
$partial5=1 if ($start=~s/^<//); |
110 |
$partial3=1 if ($end=~s/^>//); |
111 |
my $newstrand='+'; |
112 |
if ($end<$start) { |
113 |
$newstrand='-'; |
114 |
($start, $end)=($end, $start); |
115 |
} |
116 |
if ($t[2]) { # feature start (gene/mRNA/CDS) |
117 |
flushFeature() unless $t[2] eq 'exon'; #cleanup current gene/transcript data |
118 |
$strand=$newstrand; |
119 |
if ($t[2] eq 'gene') { |
120 |
#new gene feature starts |
121 |
$curfeat='gene'; |
122 |
$gstart=$start; |
123 |
$gend=$end; |
124 |
} # gene start |
125 |
else { #non-gene features starting |
126 |
if ($t[2] eq 'exon') { # special case, exons given one by one |
127 |
if ($curfeat ne 'RNA_exon') { #first exon |
128 |
flushFeature(); #just in case there was another mRNA (exons) given before |
129 |
} |
130 |
$curfeat='RNA_exon'; |
131 |
push(@exon, [$start, $end]); |
132 |
} |
133 |
elsif ($t[2] eq 'CDS' || $t[2]=~m/RNA/) { |
134 |
$curfeat=$t[2]; |
135 |
push(@exon, [$start, $end]); |
136 |
} |
137 |
else { $curfeat=''; next; } |
138 |
# -- skipping any other features here |
139 |
} #non-gene features starting |
140 |
} # feature start |
141 |
else { # feature continuation |
142 |
$strand=$newstrand; |
143 |
if ($curfeat eq 'CDS' || $curfeat=~/RNA/) { |
144 |
push(@exon, [$start, $end]); |
145 |
} |
146 |
else { |
147 |
die("Error: unexpected feature '$curfeat' continuation\n$last_line"); |
148 |
} |
149 |
} |
150 |
} # had segment coordinates |
151 |
else { # no segment coords => attributes for current feature |
152 |
my ($attr, $value)=@t[3,4]; |
153 |
$value=~tr/;/,/; |
154 |
if ($attr eq 'db_xref' || $attr eq 'gene_syn') { |
155 |
push(@{$cattrs{$attr}}, $value); |
156 |
} |
157 |
else { |
158 |
$cattrs{$attr}=$value if $attr ne 'number'; |
159 |
} |
160 |
} # attributes block |
161 |
} #while input lines |
162 |
|
163 |
flushGSeq(); |
164 |
|
165 |
#================ SUBROUTINES ============ |
166 |
|
167 |
sub writeGene { |
168 |
my ($gene, $gdata)=@_; |
169 |
my $geneid=$gene; |
170 |
my ($gene_name)=($geneid=~m/^([^~]+)/); |
171 |
$geneid=~tr/~/_/; |
172 |
($gbase, $strand, $gstart, $gend, $gpseudo)= |
173 |
($gdata->[0], $gdata->[1], $gdata->[2], $gdata->[3], $gdata->[7]); |
174 |
my $gfeature=($gpseudo)?'pseudogene' : 'gene'; |
175 |
#makeUniqID(\$gene, \%gids); # $gene will be updated if necessary |
176 |
#my $geneid=$gene; |
177 |
#$geneid=~tr/~/_/; |
178 |
my $attrs="ID=$geneid;Name=$gene_name"; |
179 |
my @gxrefs=@{$gdata->[4]}; |
180 |
$attrs.=';xrefs='.join(',', @gxrefs) if (@gxrefs>0); |
181 |
my $gattrs=$gdata->[5]; |
182 |
unless ($mrnaonly) { |
183 |
foreach my $k (keys(%$gattrs)) { |
184 |
$attrs.=";$k=".$gattrs->{$k}; |
185 |
} |
186 |
print join("\t", $gbase, $track, $gfeature, $gstart, $gend, '.', $strand, '.', $attrs)."\n"; |
187 |
} |
188 |
|
189 |
my $rnas=$gdata->[6]; |
190 |
foreach my $tid (keys(%$rnas)) { |
191 |
#die("Error: no transcript_id found at write_RNA for gene $gene ($geneid)\n$last_line\n") unless $tid; |
192 |
my $tdata=$rnas->{$tid}; |
193 |
my $rnafeat=$tdata->[0]; |
194 |
if ($rnafeat eq 'RNA_exon') { |
195 |
$rnafeat = $gpseudo ? 'pseudo_RNA' : 'misc_RNA'; |
196 |
} |
197 |
if ($mrnaonly) { |
198 |
$rnafeat=~s/misc_RNA/mRNA/; |
199 |
} |
200 |
|
201 |
next if $mrnaonly && $rnafeat ne 'mRNA'; |
202 |
my $texons=$tdata->[1]; |
203 |
my $tcds=$tdata->[2]; |
204 |
my $tahref=$tdata->[3]; #attributes |
205 |
my $txar=$tdata->[4]; # db_xrefs |
206 |
my ($part3, $part5)=($tdata->[5], $tdata->[6]); |
207 |
makeUniqID(\$tid, \%tids); |
208 |
@exon=sort { $main::a->[0] <=> $main::b->[0] } @$texons; |
209 |
my ($tstart, $tend)=($exon[0]->[0], $exon[-1]->[1]); |
210 |
my $attrs="ID=$tid"; |
211 |
$attrs.=";Parent=$geneid" unless $mrnaonly; |
212 |
$attrs.="Name=$gene_name;gene_name=$gene_name"; |
213 |
$attrs.=';partial=1' if ($part5 || $part3); |
214 |
my $product=$tahref->{'product'}; |
215 |
if ($product) { |
216 |
$product=~tr/"/'/; #" |
217 |
$attrs.=';product="'.$product.'"'; |
218 |
} |
219 |
if ($allattrs) { |
220 |
$attrs.=';xrefs='.join(',', @$txar) if (@$txar > 0); |
221 |
foreach my $k (keys(%$tahref)) { |
222 |
next if exists($kattrs{$k}); |
223 |
$attrs.=";$k=".$tahref->{$k}; |
224 |
} |
225 |
} |
226 |
print join("\t", $gbase, $track, $rnafeat, $tstart, $tend, '.', $strand, '.', $attrs)."\n"; |
227 |
foreach my $ex (@exon) { |
228 |
print join("\t", $gbase, $track, 'exon', $$ex[0], $$ex[1], '.', $strand, '.', "Parent=$tid")."\n"; |
229 |
} |
230 |
@exon=(); |
231 |
#write_CDS(); |
232 |
my @cds=@$tcds; |
233 |
if (@cds>0) { #have CDS, write it |
234 |
@cds=sort { $main::a->[0] <=> $main::b->[0] } @cds; |
235 |
my $codonstart=$tahref->{'codon_start'}; |
236 |
if ($codonstart>1) { |
237 |
$codonstart--; |
238 |
if ($strand eq '-') { |
239 |
$cds[-1]->[1]-=$codonstart; |
240 |
} |
241 |
else { |
242 |
$cds[0]->[0]+=$codonstart; |
243 |
} |
244 |
} |
245 |
else { $codonstart='0'; } |
246 |
|
247 |
foreach my $cd (@cds) { #TODO: calculate phase ? |
248 |
print join("\t", $gbase, $track, 'CDS', $$cd[0], $$cd[1], '.', $strand, '.', "Parent=$tid")."\n"; |
249 |
} |
250 |
} |
251 |
} #for each transcript |
252 |
} |
253 |
|
254 |
sub makeUniqID { |
255 |
my ($idref, $href)=@_; |
256 |
my $id=$$idref; |
257 |
$id=~s/~\d+$//; |
258 |
my $c = ++$$href{$id}; |
259 |
if ($c>1) { |
260 |
$c--; |
261 |
$$idref=$id.'.m'.$c; |
262 |
} |
263 |
return $$idref; |
264 |
} |
265 |
|
266 |
sub flushFeature { #called after an attribute block has been parsed into %cattrs |
267 |
return unless $curfeat && keys(%cattrs)>0; |
268 |
my $gene=$cattrs{'gene'}; |
269 |
die("Error: no gene attribute found for current feature $curfeat\n") |
270 |
unless $gene; |
271 |
if ($curfeat eq 'gene') { # gene attributes processing |
272 |
#make sure it's unique |
273 |
push(@{$gids{$gene}},[$gbase, $gstart, $gend]); |
274 |
my $gno=scalar(@{$gids{$gene}})-1; |
275 |
$gene.='~'.$gno if $gno>0; |
276 |
if (exists($genes{$gene})) { #should never happen, we just made sure it's unique! |
277 |
my $prev_gstart=$genes{$gene}->[2]; |
278 |
my $prev_gend=$genes{$gene}->[3]; |
279 |
# if (abs($prev_gstart-$gstart)>30000) { |
280 |
die("Error -- gene $gene is duplicated ($prev_gstart vs $gstart) !?\n"); |
281 |
# } |
282 |
# else { #update min..max coordinates |
283 |
# $genes{$gene}->[2]=$gstart if $gstart<$prev_gstart; |
284 |
# $genes{$gene}->[3]=$gend if $gend>$prev_gend; |
285 |
# ${$genes{$gene}->[5]}{'fragmented'}=1; |
286 |
# } |
287 |
} |
288 |
else |
289 |
{ # create a new gene entry here |
290 |
my $pseudo= delete($cattrs{'pseudo'}) ? 1 : 0; |
291 |
# 0 1 2 3 4 5 6 7 |
292 |
$genes{$gene}=[$gbase, $strand, $gstart, $gend, [], {}, {}, $pseudo]; |
293 |
} |
294 |
delete $cattrs{'gene'}; |
295 |
if ($allattrs) { |
296 |
if (exists($cattrs{'db_xref'})) { |
297 |
$genes{$gene}->[4]=[@{$cattrs{'db_xref'}}]; |
298 |
delete $cattrs{'db_xref'}; |
299 |
} |
300 |
#another special case: gene_syn |
301 |
if (exists($cattrs{'gene_syn'})) { |
302 |
#convert [list] to a string with comma delimited names |
303 |
$cattrs{'gene_syn'}=join(',',@{$cattrs{'gene_syn'}}); |
304 |
} |
305 |
my $garef=$genes{$gene}->[5]; |
306 |
my @k=keys(%cattrs); |
307 |
@{$garef}{@k}=(values(%cattrs)); |
308 |
} #all attributes are kept |
309 |
@exon=(); |
310 |
%cattrs=(); |
311 |
($partial3, $partial5)=(); |
312 |
return; #gene data added |
313 |
} # stored gene attributes |
314 |
# else |
315 |
# - storing RNA attributes |
316 |
# now make sure we place this RNA within the right gene - in case of gene duplications |
317 |
# locate the matching $gene |
318 |
@exon= sort { $main::a->[0] <=> $main::b->[0] } @exon; |
319 |
my ($xstart, $xend)=($exon[0]->[0], $exon[-1]->[1]); |
320 |
my $gxdata=$gids{$gene}; |
321 |
die("Error: gene $gene not found in \%gids !\n") unless $gxdata; #should never happen |
322 |
if (@$gxdata>1) { |
323 |
my $i=0; |
324 |
my $found; |
325 |
foreach my $gd (@$gxdata) { |
326 |
if ($gd->[0] eq $gbase && $gd->[1]<=$xstart && $gd->[2]>=$xend) { |
327 |
$found=1; |
328 |
last; |
329 |
} |
330 |
$i++; |
331 |
} |
332 |
die("Error: cannot find gene enclosing RNA data ". |
333 |
$cattrs{'transcript_id'}." ".$cattrs{'protein_id'}." (on $gbase, $xstart..$xend)\n") |
334 |
unless $found; |
335 |
$gene.='~'.$i if $i>0; |
336 |
} |
337 |
|
338 |
my $geneid=$gene; |
339 |
my $gdata=$genes{$gene}; |
340 |
die("Error: \$genes data for gene '$gene' cannot be found!\n") unless $gdata; |
341 |
my $rnas=$gdata->[6]; |
342 |
$geneid=~tr/~/_/; |
343 |
my ($tid, $pid); |
344 |
if ($curfeat eq 'CDS') { # just finished a CDS parsing |
345 |
my $pid=$cattrs{'protein_id'}; |
346 |
die("Error: no protein_id found for CDS feature (gene $geneid)!\n") unless $pid; |
347 |
#$pid=~s/^[a-z]+\|//i; # ref|NM_002314.1| |
348 |
#$pid=~s/\|(\w*)$//; # some entries have a suffix - e.g. ref|NM_006824.1|P40 |
349 |
my @t=split(/\|/,$pid); |
350 |
$pid=$t[1] if (@t>1); |
351 |
$pid=~s/\.\d+$//; #remove version |
352 |
$tid=$pmap{$pid}; |
353 |
die("Error: cannot get transcript for protein id $pid (gene $geneid)!\n") unless $tid; |
354 |
unless (exists($rnas->{$tid})) { #this should never happen - CDS without mRNA definition? |
355 |
# 0 1 2 3 4 5 6 |
356 |
#$rnas->{$tid}=['mRNA', [@exons], [@cds], {}, [], 0, 0]; |
357 |
$tid=~s/\.\d+$//; |
358 |
die("Error: cannot find mRNA entry for protein $pid (reported as coming from transcript $tid)\n") |
359 |
unless exists($rnas->{$tid}); |
360 |
} |
361 |
# #just add the CDS |
362 |
push(@{$rnas->{$tid}->[2]}, @exon); |
363 |
# } |
364 |
} #CDS |
365 |
else { # exons (RNA feature) |
366 |
$tid=$cattrs{'transcript_id'}; |
367 |
if ($tid) { |
368 |
my @t=split(/\|/,$tid); |
369 |
$tid=$t[1] if (@t>1); |
370 |
#$tid=~s/^[a-z]+\|//i; |
371 |
#$tid=~s/\|\w*$//; # some entries have a suffix - e.g. ref|NM_006824.1|P40 |
372 |
$tid=~s/\.\d+$//; #remove version! |
373 |
} |
374 |
else { |
375 |
$tid=$geneid.'_t'; #some special cases with 'exon' features for pseudo genes |
376 |
} |
377 |
unless (exists($rnas->{$tid})) { |
378 |
# 0 1 2 3 4 5 6 |
379 |
$rnas->{$tid}=[$curfeat, [@exon], [], {}, [], 0, 0]; |
380 |
} |
381 |
else { #just add the exons |
382 |
push(@{$rnas->{$tid}->[1]}, @exon); |
383 |
} |
384 |
} #exons |
385 |
my $tdata=$rnas->{$tid}; |
386 |
$tdata->[5]=1 if $partial5; |
387 |
$tdata->[6]=1 if $partial3; |
388 |
# add attributes |
389 |
delete @cattrs{'gene', 'transcript_id', 'protein_id'}; |
390 |
my $product = delete $cattrs{'product'}; |
391 |
my $ahref=$tdata->[3]; |
392 |
$ahref->{'product'}=$product if $product; |
393 |
if ($allattrs) { |
394 |
my %xrefs; |
395 |
my @cx; |
396 |
@cx=@{$cattrs{'db_xref'}} if exists $cattrs{'db_xref'}; |
397 |
push(@cx, @{$tdata->[4]}); |
398 |
@xrefs{@cx}=(); |
399 |
@cx=keys(%xrefs); |
400 |
$tdata->[4]=[@cx]; |
401 |
delete $cattrs{'db_xref'}; |
402 |
#add new attributes |
403 |
foreach my $k (keys %cattrs) { |
404 |
$ahref->{$k}=$cattrs{$k}; |
405 |
} |
406 |
} |
407 |
#--> clean up |
408 |
@exon=(); |
409 |
%cattrs=(); |
410 |
($partial3, $partial5)=(); |
411 |
} |
412 |
|
413 |
sub flushGSeq { |
414 |
return unless keys(%genes); |
415 |
flushFeature(); # write last RNA feature read |
416 |
my $gcount=keys(%genes); |
417 |
while (my ($g,$gd)=each(%genes)) { |
418 |
writeGene($g,$gd); |
419 |
} |
420 |
($gbase, $curfeat, @exon, %cattrs, $gpseudo, $gstart, $gend, |
421 |
$partial5, $partial3)=(); |
422 |
$strand='+'; |
423 |
@exon=(); |
424 |
%cattrs=(); |
425 |
%genes=(); |
426 |
#-- debug only: |
427 |
#exit; |
428 |
} |