1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
my $usage = q/Usage: |
7 |
gbs2gff_chr.pl -p <prot_id2transcript_id-file> [-m <seq_contig.md>] [-M][-T] \ |
8 |
[-f <mapflt>] [-c genome.fa.fai] [-t <track>][-A][-a <attr1,attr2,..>] \ |
9 |
<gbs_stream> |
10 |
Creates chromosome gff annotation files from NCBI GB data |
11 |
|
12 |
Takes the contig mappings (onto chromosomes) from <seq_contig.md> (optional), |
13 |
filtering it by group_label if requested (e.g. -f Primary_Assembly) |
14 |
and converts the input GenBank formatted data into a gff file. |
15 |
|
16 |
Note: only these features are parsed: |
17 |
gene, mRNA, CDS, misc_RNA, ncRNA, rRNA, tRNA |
18 |
Currently these are discarded: |
19 |
C_region, V_segment |
20 |
Options: |
21 |
-A write all attributes |
22 |
-a write these attributes (comma delimited list) |
23 |
-M parse and write mRNA annotations only |
24 |
-T do not write parent "gene" records |
25 |
-K for -c option, skip genome sequences for which the sequence |
26 |
length cannot be verified |
27 |
/; |
28 |
# -S write fasta files with the chromosome\/contig sequences |
29 |
|
30 |
umask 0002; |
31 |
getopts('p:c:m:f:o:t:a:KTMA') || die($usage."\n"); |
32 |
my $ctgmapfile=$Getopt::Std::opt_m; |
33 |
my $f_out=$Getopt::Std::opt_o; |
34 |
my $skip_no_len_check=$Getopt::Std::opt_K; |
35 |
my $writeseq=$Getopt::Std::opt_S; |
36 |
my $addattrs=$Getopt::Std::opt_a; |
37 |
my $mapflt=$Getopt::Std::opt_f; |
38 |
my $p2nfile=$Getopt::Std::opt_p; |
39 |
my $allattrs=$Getopt::Std::opt_A; |
40 |
my $t_only=$Getopt::Std::opt_T; # no gene parent, print transcripts only |
41 |
my $mrnaonly=$Getopt::Std::opt_M; |
42 |
my $chrlen_test=$Getopt::Std::opt_c; #must be a SAMTools faidx file |
43 |
my %chrsizes; |
44 |
if ($chrlen_test && -f $chrlen_test) { |
45 |
open(FAI, $chrlen_test) || die("Error opening $chrlen_test!\n"); |
46 |
while (<FAI>) { |
47 |
chomp; |
48 |
next unless $_; |
49 |
next if m/^\s*#/; |
50 |
my @t=split; |
51 |
$chrsizes{$t[0]}=$t[1]; |
52 |
} |
53 |
close(FAI); |
54 |
} |
55 |
die("Error: -p is required -- to map protein_id to transcript_id\n") |
56 |
unless $p2nfile && -f $p2nfile; |
57 |
my %pmap; # protein_id => transcript_id |
58 |
open(PROTMAP, $p2nfile) || die("Error opening $p2nfile!\n"); |
59 |
while (<PROTMAP>) { |
60 |
chomp; |
61 |
my ($p, $n)=split; |
62 |
next unless $n; |
63 |
$p=~s/\.\d+$//; |
64 |
$n=~s/\.\d+$//; # discard versioning |
65 |
$pmap{$p}=$n; |
66 |
} |
67 |
close(PROTMAP); |
68 |
|
69 |
my %haplotypes = ( |
70 |
'GL000250' => 'chr6_apd_hap1', |
71 |
'GL000251' => 'chr6_cox_hap2', |
72 |
'GL000252' => 'chr6_dbb_hap3', |
73 |
'GL000253' => 'chr6_mann_hap4', |
74 |
'GL000254' => 'chr6_mcf_hap5', |
75 |
'GL000255' => 'chr6_qbl_hap6', |
76 |
'GL000256' => 'chr6_ssto_hap7', |
77 |
'GL000257' => 'chr4_ctg9_hap1', |
78 |
'GL000258' => 'chr17_ctg5_hap1' |
79 |
); |
80 |
|
81 |
# 0 1 2 3 4 5 6 |
82 |
my %ctgmap; # $ctg=> [chr, strand, chr_start, chr_end, ctg_gl_acc, chr_name, seq_definition] |
83 |
if ($ctgmapfile) { |
84 |
open(CTGMAP, $ctgmapfile) || die("Error opening $ctgmapfile\n"); |
85 |
while (<CTGMAP>) { |
86 |
next if m/^#/; |
87 |
my @t=split(/\t/); |
88 |
next if $t[6] eq 'na'; |
89 |
next if $mapflt && $t[8]!~/$mapflt/i; |
90 |
my $ctgacc=$t[5]; |
91 |
$ctgacc=~s/\.\d+$//; # remove version |
92 |
my $chrname='chr'.$t[1]; |
93 |
$chrname=~s/\.\d+$//; |
94 |
my ($chrstart, $chrend)=($t[2], $t[3]); |
95 |
($chrstart, $chrend)=($chrend, $chrstart) if $chrend<$chrstart; |
96 |
my $chrstrand=($t[4] eq '-') ? '-' : '+'; |
97 |
my $ctgdata=[$chrname, $chrstrand, $chrstart, $chrend, '', '']; |
98 |
$ctgmap{$t[5]}=$ctgdata; |
99 |
$ctgmap{$ctgacc}=$ctgdata; |
100 |
} |
101 |
close(CTGMAP); |
102 |
} |
103 |
#key attributes to always keep : |
104 |
my %kattrs; @kattrs{'gene', 'product', 'transcript_id', 'codon_start'}=(); |
105 |
my %xattrs; #extra attributes to keep |
106 |
if ($addattrs) { |
107 |
$addattrs=~tr/ //d; |
108 |
my @add=split(/\,/,$addattrs); |
109 |
@xattrs{@add}=(); |
110 |
} |
111 |
my $moreattrs = ( $allattrs || $addattrs ); |
112 |
# --------------- |
113 |
my %gids; #ensure unique gene IDs |
114 |
#allow for duplication $gids{$gene}=[ [gstart1, gend1], [gstart2, gend2] ] |
115 |
my %tids; # and transcript IDs |
116 |
#---- |
117 |
# -- per gbase: |
118 |
|
119 |
my %genes; # genes{$gene}=[$gbase, $strand, $gstart, $gend, [@gxrefs], \%gattrs, \%rnas, isPseudo, [@genesyms] ] |
120 |
# 0 1 2 3 4 5 6 7 8 |
121 |
# %rnas hash holds all mRNA data for a single gene (i.e. all isoforms) |
122 |
# 0 1 2 3 4 5 6 7 |
123 |
# $rnas{transcript_id}=[ $featname, [@exon], [@cds], \%tattrs, [@txrefs], $partial3, $partial5, [@genesyms] ] |
124 |
#current values: |
125 |
my ($curctg, $curctglen, $curcmt); # per contig |
126 |
my $cursection; |
127 |
my ($curfeat, $curfstrand, @fsegs, %fattrs, $fpartial5, $fpartial3); # per feature |
128 |
# $gpseudo, $gstart, $gend, $partial5, $partial3); |
129 |
my $last_line; |
130 |
my $lastattr; |
131 |
my $track=$Getopt::Std::opt_t || 'ncbi'; |
132 |
#only parse these sections: |
133 |
my %sparse;@sparse{'FEATURES', 'COMMENT'}=(1,1); #sections for which to parse multiple rows |
134 |
|
135 |
#only parse these features (WARNING: check this everyonce in a while, NCBI may change these tags |
136 |
# gzip -cd *.gbs.gz | perl -ne 'print $1."\n" if m/^\s{4,8}(\S+)\s{2,}\S+/' | sort -u |
137 |
# |
138 |
my %fparse; |
139 |
@fparse{'gene','misc_RNA','mRNA','ncRNA', 'CDS', 'rRNA', |
140 |
'tRNA', 'snoRNA','miRNA','microRNA','micro_RNA', 'exon', |
141 |
'C_region', 'D_segment', 'J_segment', 'V_segment'}=(); |
142 |
my %fskip; @fskip{'source', 'gap', 'repeat'}=(); |
143 |
my $skipctg; |
144 |
if ($f_out) { |
145 |
open(F_OUT, '>'.$f_out) || die("Error creating output file $f_out\n"); |
146 |
select(F_OUT); |
147 |
} |
148 |
while (<>) { |
149 |
LSKIP: |
150 |
$last_line=$_; |
151 |
if (m/^ORIGIN\s*$/) { |
152 |
# read sequence here |
153 |
my $ctgdata=$ctgmap{$curctg}; |
154 |
my ($c_chr,$c_acc, $c_def)=($ctgdata->[0], $ctgdata->[4], $ctgdata->[6]); |
155 |
my $defline=">$c_chr $c_acc"; |
156 |
$defline.=" $c_def" if $c_def; |
157 |
my $fname=$c_chr.'.fa'; |
158 |
flushGSeq(); |
159 |
if ($writeseq) { |
160 |
open(FASEQ, '>'.$fname) || die ("Error creating fasta file $fname!\n"); |
161 |
print FASEQ $defline."\n"; |
162 |
} |
163 |
my $l; |
164 |
while (($l=<>)) { |
165 |
$l=~s/[\s\n\r]+$//s; |
166 |
last if $l eq '//'; |
167 |
my ($coord, $seq)=($l=~m/^\s*(\d+)\s+(\w[ \w]*)/); |
168 |
$seq=~tr/ \t//d; |
169 |
if ($writeseq) { |
170 |
print FASEQ uc($seq)."\n"; |
171 |
} |
172 |
} |
173 |
close(FASEQ) if $writeseq; |
174 |
next; |
175 |
} |
176 |
if (m/^([A-Z,0-9]+)\s+(\S.+)/) { |
177 |
($cursection, my $rest)=($1,$2); |
178 |
if ($cursection eq 'LOCUS') { |
179 |
flushGSeq(); # flush contig features accumulated so far |
180 |
my ($ctgacc, $ctglen)=($rest=~m/^(\w+)\s+(\d+)\s+bp/); |
181 |
die("Error: cannot parse the LOCUS line:\n$last_line\n") unless $ctglen>1; |
182 |
my $cd; |
183 |
if ($ctgmapfile) { |
184 |
$cd=$ctgmap{$ctgacc}; |
185 |
unless ($cd) { |
186 |
print STDERR "Skipping contig $ctgacc (no chromosome entry)\n"; |
187 |
$skipctg=1; |
188 |
next; |
189 |
} |
190 |
die("Error: mismatch between contig $ctgacc length and its chromosome positioning!\n") |
191 |
unless $ctglen == $cd->[3]-$cd->[2]+1; |
192 |
} |
193 |
$skipctg=0; |
194 |
$curctg=$ctgacc; |
195 |
$curctglen=$ctglen; |
196 |
next; |
197 |
} |
198 |
next if $skipctg; |
199 |
if ($cursection eq 'DEFINITION') { |
200 |
unless ($ctgmapfile) { |
201 |
my ($chrnum)=($rest=~m/chromosome\s+([\dXYM]+)/i); |
202 |
unless ($chrnum) { |
203 |
if (m/ mitochondrion/) { |
204 |
$chrnum='M'; |
205 |
} |
206 |
elsif (m/unplaced genomic contig/) { |
207 |
$chrnum='Un'; |
208 |
} |
209 |
} |
210 |
die("Error parsing chromosome name! ($rest)\n") unless $chrnum; |
211 |
$rest=~s/\.$//; |
212 |
my $chrid='chr'.$chrnum; |
213 |
$ctgmap{$curctg}=[$chrid, '+', 1, $curctglen, '', '', $rest]; |
214 |
} |
215 |
next; |
216 |
} |
217 |
if ($cursection eq 'VERSION') { |
218 |
unless ($ctgmapfile) { |
219 |
my ($accver)=($rest=~m/^\s*\w+(\.\d+)/); |
220 |
die("Error parsing accession.version! ($rest)\n") unless $accver; |
221 |
$ctgmap{$curctg}->[4]=$curctg.$accver; |
222 |
} |
223 |
next; |
224 |
} |
225 |
if ($cursection eq 'COMMENT') { |
226 |
$curcmt=$rest; |
227 |
} |
228 |
next; |
229 |
} #big section start |
230 |
next if $skipctg; |
231 |
next unless exists $sparse{$cursection}; |
232 |
if ($cursection eq 'COMMENT') { |
233 |
chomp; |
234 |
s/^\s+/ /; |
235 |
$curcmt.=$_; |
236 |
} |
237 |
elsif ($cursection eq 'FEATURES') { |
238 |
# current contig information has been parsed by now |
239 |
if ($curctg && $curcmt) { |
240 |
my $ctgdata=$ctgmap{$curctg}; |
241 |
die("Error: no chromosome mapping for contig $curctg!\n") unless $ctgdata; |
242 |
my ($glacc)=($curcmt=~m/sequence is identical to\s+(GL\d+)/s); |
243 |
my $chrname=$$ctgdata[0]; |
244 |
if ($glacc) { |
245 |
$$ctgdata[4]=$glacc; |
246 |
my $chr=$$ctgdata[0]; |
247 |
if ($chr=~m/^(chr\w+)/) { |
248 |
my $chrbase=$1; |
249 |
# try to match UCSC naming scheme |
250 |
if ($chrbase eq 'chrUn') { |
251 |
$chrname=$chrbase.'_'.lc($glacc); |
252 |
} |
253 |
else { |
254 |
if (exists($haplotypes{$glacc})) { |
255 |
$chrname=$haplotypes{$glacc}; |
256 |
} |
257 |
else { |
258 |
$chrname=$chrbase.'_'.lc($glacc).'_random'; |
259 |
} |
260 |
} |
261 |
} |
262 |
} |
263 |
$$ctgdata[5]=$chrname; |
264 |
if ($chrlen_test) { |
265 |
my $oldchrlen=$chrsizes{$chrname}; |
266 |
if ($oldchrlen) { |
267 |
die("Error: $chrname length mismatch (expected '$chrsizes{$chrname}' but found $curctglen) !\n") |
268 |
unless ($oldchrlen==$curctglen); |
269 |
} |
270 |
else { |
271 |
my $msg="Warning: couldn't check length of $chrname."; |
272 |
if ($skip_no_len_check) { |
273 |
$skipctg=1; |
274 |
delete $ctgmap{$curctg}; |
275 |
GMessage($msg." Skipped."); |
276 |
next; |
277 |
} |
278 |
GMessage($msg); |
279 |
} |
280 |
} |
281 |
} |
282 |
#big FEATURES block |
283 |
if (m/^\s{3,8}(['\-\w]+)\s{2,}(\S+)/) { # a feature starts here (gene/*RNA/CDS/exon |
284 |
my ($newfeat, $rest)=($1,$2); |
285 |
flushFeature() if $curfeat; |
286 |
$lastattr=''; |
287 |
$curfeat=$newfeat; |
288 |
|
289 |
# next unless exists($fparse{$curfeat}); |
290 |
next if exists($fskip{$curfeat}); |
291 |
unless (exists($fparse{$curfeat})) { |
292 |
GMessage("Warning: skipping unknown feature: $curfeat!"); |
293 |
next; |
294 |
} |
295 |
# check for coordinate definition |
296 |
$_=$rest; |
297 |
@fsegs=(); |
298 |
$curfstrand=(s/complement\(//)?'-':'+'; |
299 |
my $multi=(s/join\(//); |
300 |
$fpartial5=1 if m/^\<\d+/; |
301 |
s/\s+$//; |
302 |
my $readnext; |
303 |
do { |
304 |
my @s=(m/([\d+\.\>]+)/g); |
305 |
$readnext=m/\,$/; |
306 |
foreach my $seg (@s) { |
307 |
my ($x1,$x2)=($seg=~m/(\d+)/g); |
308 |
unless ($x2) { # microexons may be given as only one coordinate |
309 |
$x2=$x1; |
310 |
} |
311 |
else { |
312 |
($x1,$x2)=($x2,$x1) if $x1>$x2; |
313 |
} |
314 |
push(@fsegs, [$x1,$x2]); |
315 |
} |
316 |
$_=<> if $readnext; |
317 |
} while ($readnext); #parse feature segments (exons) |
318 |
$fpartial3=1 if m/^\.\.\>\d+\)*$/; |
319 |
die("Error: couldn't parse segment coordinates for feature $curfeat on $curctg\n$_\n") unless @fsegs>0; |
320 |
next; |
321 |
} # coordinate line for a gene/mRNA/CDS/exon/*RNA feature |
322 |
#next unless exists($fparse{$curfeat}); #only parse recognizable features |
323 |
next if exists($fskip{$curfeat}); |
324 |
unless (exists($fparse{$curfeat})) { |
325 |
GMessage("Warning: skipping unknown feature: $curfeat!"); |
326 |
next; |
327 |
} |
328 |
|
329 |
# -- here we are within a feature block (after coordinates were parsed) |
330 |
#-- attribute parsing |
331 |
chomp; |
332 |
if (m/^\s{9,}\/([^=]+)(.*)/) { # attribute start |
333 |
my ($attr, $value)=($1,$2); |
334 |
if ($value) { |
335 |
$value=~s/^=//; |
336 |
$value=~tr/;/,/d; #" |
337 |
$value=~s/^"//; |
338 |
$value=~s/"$//; |
339 |
} |
340 |
else { $value=1 }; |
341 |
$attr=~s/^gene_syn\w+/gene_syn/; |
342 |
if ($attr eq 'db_xref' || $attr eq 'gene_syn') { |
343 |
$value=~tr/ \n\t\r//d; |
344 |
my @vmulti=split(/[\;\,]/, $value); |
345 |
if ($attr eq 'gene_syn' && @vmulti>1) { |
346 |
push(@{$fattrs{$attr}}, @vmulti); |
347 |
} |
348 |
else { |
349 |
push(@{$fattrs{$attr}}, $value); |
350 |
} |
351 |
} |
352 |
else { |
353 |
$fattrs{$attr}=$value; |
354 |
} |
355 |
$lastattr=$attr; |
356 |
} #attribute start line |
357 |
else { #value continuation line for last attribute: |
358 |
die("Error: invalid line encountered when attribute continuation was expected!\n". |
359 |
"$last_line\ncurctg=$curctg, curfeat=$curfeat, lastattr=$lastattr\n") |
360 |
unless $lastattr && exists $fattrs{$lastattr}; |
361 |
s/^\s+/ /; |
362 |
s/"$//; |
363 |
my $v=$_; |
364 |
if ($lastattr eq 'gene_syn') { |
365 |
$v=~tr/ \n\t\r//d; |
366 |
my @vmulti=split(/[\;\,]/,$v); |
367 |
push(@{$fattrs{$lastattr}}, @vmulti); |
368 |
} |
369 |
else { |
370 |
$fattrs{$lastattr}.=$v; |
371 |
} |
372 |
} #attribute continuation |
373 |
#if ($lastattr eq 'note') { |
374 |
# my $v=$fattrs{$lastattr}; |
375 |
# if ($v=~s/Derived by automated(.+?)prediction method: ([^\.]+)\..*$/predicted by: $2/) |
376 |
# { $fattrs{$lastattr}=$v } |
377 |
# } |
378 |
|
379 |
}# we are in FEATURES block |
380 |
} #for each input line |
381 |
|
382 |
flushGSeq(); |
383 |
|
384 |
if ($f_out) { |
385 |
close(F_OUT); |
386 |
} |
387 |
|
388 |
|
389 |
#--------------------------------------------------------- |
390 |
sub flushGSeq { |
391 |
return unless $curctg; |
392 |
flushFeature(); # close last feature read |
393 |
return unless keys(%genes); |
394 |
|
395 |
my $gcount=keys(%genes); |
396 |
while (my ($g,$gd)=each(%genes)) { |
397 |
writeGene($g,$gd); |
398 |
} |
399 |
%genes=(); |
400 |
$curctg=''; |
401 |
$curctglen=0; |
402 |
$curcmt=''; |
403 |
$cursection=''; |
404 |
#-- debug only: |
405 |
#exit; |
406 |
} |
407 |
|
408 |
|
409 |
sub makeUniqID { |
410 |
my ($idref, $href)=@_; |
411 |
my $id=$$idref; |
412 |
$id=~s/~\d+$//; |
413 |
my $c = ++$$href{$id}; |
414 |
if ($c>1) { |
415 |
$c--; |
416 |
$$idref=$id.'.m'.$c; |
417 |
} |
418 |
return $$idref; |
419 |
} |
420 |
|
421 |
sub createGeneEntry { |
422 |
my ($gene, $gstart, $gend)=@_; |
423 |
push(@{$gids{$gene}},[$curctg, $gstart, $gend]); |
424 |
my $gno=scalar(@{$gids{$gene}})-1; |
425 |
$gene.='~'.$gno if $gno>0; |
426 |
if (exists($genes{$gene})) { #should never happen, we just made sure it's unique! |
427 |
my $prev_gstart=$genes{$gene}->[2]; |
428 |
my $prev_gend=$genes{$gene}->[3]; |
429 |
# if (abs($prev_gstart-$gstart)>30000) { |
430 |
die("Error -- gene $gene is duplicated ($prev_gstart vs $gstart) !?\n"); |
431 |
# } |
432 |
# else { #update min..max coordinates |
433 |
# $genes{$gene}->[2]=$gstart if $gstart<$prev_gstart; |
434 |
# $genes{$gene}->[3]=$gend if $gend>$prev_gend; |
435 |
# ${$genes{$gene}->[5]}{'fragmented'}=1; |
436 |
# } |
437 |
} |
438 |
else |
439 |
{ # create a new gene entry here |
440 |
my $pseudo= delete($fattrs{'pseudo'}) ? 1 : 0; |
441 |
# 0 1 2 3 4 5 6 7 |
442 |
$genes{$gene}=[$curctg, $curfstrand, $gstart, $gend, [], {}, {}, $pseudo, []]; |
443 |
} |
444 |
delete $fattrs{'gene'}; |
445 |
if ($moreattrs) { |
446 |
if (keepAttr('db_xref', \%fattrs)) { |
447 |
$genes{$gene}->[4]=[@{$fattrs{'db_xref'}}]; |
448 |
delete $fattrs{'db_xref'}; |
449 |
} |
450 |
#another special case: gene_syn |
451 |
if (keepAttr('gene_syn', \%fattrs)) { |
452 |
$genes{$gene}->[8]=[@{$fattrs{'gene_syn'}}]; |
453 |
delete $fattrs{'gene_syn'}; |
454 |
} |
455 |
my $garef=$genes{$gene}->[5]; |
456 |
if ($allattrs) { |
457 |
my @k=keys(%fattrs); |
458 |
@{$garef}{@k}=(values(%fattrs)); |
459 |
} else{ |
460 |
foreach my $k (keys(%fattrs)) { |
461 |
$garef->{$k}=$fattrs{$k} if exists($xattrs{$k}); |
462 |
} |
463 |
} |
464 |
} #more attributes are kept |
465 |
return $gene; |
466 |
} |
467 |
|
468 |
|
469 |
sub flushFeature { #called after a feature attributes block has been parsed into %fattrs |
470 |
return unless $curfeat && keys(%fattrs)>0; |
471 |
my $v=$fattrs{'note'}; |
472 |
if ($v && |
473 |
$v=~s/Derived by automated(.+?)prediction method: ([^\.]+)\..*$/predicted by: $2/) { |
474 |
$fattrs{'note'}=$v; |
475 |
} |
476 |
my $gene=$fattrs{'gene'}; |
477 |
my @exon= sort { $main::a->[0] <=> $main::b->[0] } @fsegs; |
478 |
my ($xstart, $xend)=($exon[0]->[0], $exon[-1]->[1]); |
479 |
if (!$gene && $curfeat=~m/[^m]RNA/) { |
480 |
my $t=$fattrs{'product'}; |
481 |
if (length($t)<14) { |
482 |
$t=~tr/- ,/___/s; |
483 |
$gene=$t; |
484 |
} |
485 |
else { |
486 |
$gene=$curfeat; |
487 |
} |
488 |
$gene='NOT_GENE|'.$gene; |
489 |
# clearly this is an orphan RNA entry (with no gene) |
490 |
# so we create a dummy gene entry AND a transcript entry here |
491 |
createGeneEntry($gene, $xstart, $xend); |
492 |
} |
493 |
|
494 |
die("Error: no gene attribute found for current feature $curfeat\n") |
495 |
unless $gene; |
496 |
if ($curfeat eq 'gene') { # gene attributes processing |
497 |
#make sure it's unique |
498 |
die("Error: gene feature ($gene) has multiple segments?\n") if @fsegs>1; |
499 |
die("Error: gene feature ($gene) has no coordinates?\n") if @fsegs==0; |
500 |
createGeneEntry($gene, $xstart, $xend); |
501 |
@fsegs=(); |
502 |
%fattrs=(); |
503 |
($fpartial3, $fpartial5)=(); |
504 |
return; #gene data added |
505 |
} # stored gene attributes |
506 |
# else |
507 |
# - storing RNA attributes |
508 |
# now make sure we place this RNA within the right gene - in case of gene duplications |
509 |
# locate the matching $gene |
510 |
my $gxdata=$gids{$gene}; |
511 |
unless ($gxdata) { |
512 |
# strange, this looks like another orphan xRNA entry |
513 |
createGeneEntry($gene, $xstart, $xend); |
514 |
$gxdata=$gids{$gene}; |
515 |
} |
516 |
|
517 |
die("Error: gene $gene not found in \%gids !\n") unless $gxdata; #should never happen |
518 |
if (@$gxdata>1) { |
519 |
my $i=0; |
520 |
my $found; |
521 |
foreach my $gd (@$gxdata) { |
522 |
if ($gd->[0] eq $curctg && $gd->[1]<=$xstart && $gd->[2]>=$xend) { |
523 |
$found=1; |
524 |
last; |
525 |
} |
526 |
$i++; |
527 |
} |
528 |
die("Error: cannot find gene enclosing RNA data ". |
529 |
$fattrs{'transcript_id'}." ".$fattrs{'protein_id'}." (on $curctg, $xstart..$xend)\n") |
530 |
unless $found; |
531 |
$gene.='~'.$i if $i>0; |
532 |
} |
533 |
|
534 |
my $geneid=$gene; |
535 |
my $gdata=$genes{$gene}; |
536 |
die("Error: \$genes data for gene '$gene' cannot be found!\n") unless $gdata; |
537 |
my $rnas=$gdata->[6]; |
538 |
my $gpseudo=$gdata->[7]; |
539 |
$geneid=~tr/~/_/; |
540 |
my ($tid, $pid); |
541 |
my $isMito=($ctgmap{$curctg}->[0] eq 'chrM'); #different format for mitochondrial entries |
542 |
$pid=$fattrs{'protein_id'}; |
543 |
my $isCDS = ($curfeat eq 'CDS'); |
544 |
if ($isCDS && $pid) { # just finished a CDS parsing |
545 |
#unless ($pid) { |
546 |
# print STDERR "Warning: no protein_id found for CDS feature of gene $geneid (on $curctg).\n"; |
547 |
# @fsegs=(); |
548 |
# %fattrs=(); |
549 |
# ($fpartial3, $fpartial5)=(); |
550 |
# return; |
551 |
# } |
552 |
my @t=split(/\|/,$pid); |
553 |
$pid=$t[1] if (@t>1); |
554 |
$pid=~s/\.\d+$//; #remove version |
555 |
$tid=$pmap{$pid}; |
556 |
die("Error: cannot get transcript for protein id $pid (gene $geneid)!\n") unless $tid; |
557 |
unless (exists($rnas->{$tid})) { #this should never happen - CDS without mRNA definition? |
558 |
# --> yeah, it does happen for Mitochondrial chromosomes! |
559 |
if ($isMito) { |
560 |
$tid=$pid; |
561 |
# 0 1 2 3 4 5 6 7 |
562 |
#$rnas->{$tid}=['mRNA', [@exon], [], {}, [], 0, 0, []]; |
563 |
$rnas->{$tid}=['mRNA', [@exon], [@exon], {}, [], 0, 0, []]; |
564 |
} |
565 |
else { |
566 |
$tid=~s/\.\d+$//; |
567 |
die("Error: no mRNA entry for CDS/protein $pid (reported as coming from transcript $tid)\n") |
568 |
unless exists($rnas->{$tid}); |
569 |
} |
570 |
} |
571 |
# #just add the CDS |
572 |
push(@{$rnas->{$tid}->[2]}, @exon); |
573 |
} #CDS |
574 |
else { # exons, other RNA features or CDS without protein_id |
575 |
if ($isCDS) { |
576 |
my $msg="Warning: no protein_id for CDS feature of gene $geneid (on $curctg)"; |
577 |
if ($gpseudo) { |
578 |
# discard CDS segments for pseudogenes |
579 |
@fsegs=(); |
580 |
%fattrs=(); |
581 |
($fpartial3, $fpartial5)=(); |
582 |
$msg=~s/ gene/ pseudogene/; |
583 |
GMessage("$msg. Discarded."); |
584 |
return; |
585 |
} |
586 |
GMessage("$msg."); |
587 |
} |
588 |
$tid=$fattrs{'transcript_id'}; |
589 |
if ($tid) { |
590 |
my @t=split(/\|/,$tid); |
591 |
$tid=$t[1] if (@t>1); |
592 |
#$tid=~s/^[a-z]+\|//i; |
593 |
#$tid=~s/\|\w*$//; # some entries have a suffix - e.g. ref|NM_006824.1|P40 |
594 |
$tid=~s/\.\d+$//; #remove version! |
595 |
} |
596 |
else { |
597 |
$tid=$geneid.'.t'; #some special cases with features parented directly by [pseudo] genes |
598 |
} |
599 |
unless (exists($rnas->{$tid})) { |
600 |
my $frna=$curfeat; |
601 |
if ($frna!~m/RNA/) { |
602 |
$frna= $gpseudo? 'pseudoRNA' : 'misc_RNA'; |
603 |
} |
604 |
if ($isCDS) { |
605 |
# 0 1 2 3 4 5 6 7 |
606 |
$rnas->{$tid}=[$frna, [], [@exon], {}, [], 0, 0, [] ]; |
607 |
} |
608 |
else { |
609 |
# 0 1 2 3 4 5 6 7 |
610 |
$rnas->{$tid}=[$frna, [@exon], [], {}, [], 0, 0, [] ]; |
611 |
} |
612 |
} |
613 |
else { #just add the exons |
614 |
my $segidx = $isCDS ? 2 : 1; |
615 |
push(@{$rnas->{$tid}->[$segidx]}, @exon); |
616 |
} |
617 |
} #exon segments |
618 |
my $tdata=$rnas->{$tid}; |
619 |
$tdata->[5]=1 if $fpartial5; |
620 |
$tdata->[6]=1 if $fpartial3; |
621 |
# add attributes |
622 |
delete @fattrs{'gene', 'transcript_id'}; |
623 |
delete $fattrs{'protein_id'} unless exists($xattrs{'protein_id'}); |
624 |
my $product = delete $fattrs{'product'}; |
625 |
my $ahref=$tdata->[3]; |
626 |
$ahref->{'product'}=$product if $product; |
627 |
if (my $ncRNAclass=delete $fattrs{'ncRNA_class'}) { |
628 |
$ahref->{'ncRNA_class'}=$ncRNAclass; |
629 |
} |
630 |
if ($moreattrs) { |
631 |
my %xrefs; |
632 |
my @cx; |
633 |
@cx=@{$fattrs{'db_xref'}} if keepAttr('db_xref', \%fattrs); #if exists $fattrs{'db_xref'}; |
634 |
push(@cx, @{$tdata->[4]}); |
635 |
@xrefs{@cx}=(); |
636 |
@cx=keys(%xrefs); |
637 |
$tdata->[4]=[@cx]; |
638 |
delete $fattrs{'db_xref'}; |
639 |
#if (exists $fattrs{'gene_syn'}) { |
640 |
if ($isCDS && (my $v=$fattrs{'note'})) { |
641 |
if ($v=~s/isoform .+? encoded by transcript [\w \-\,\.]+//) { |
642 |
$v=~s/^\;?\s+//; |
643 |
$v=~s/\;?\s+$//; |
644 |
$v=~s/\;\s*/, /; |
645 |
if (length($v)>2) { $fattrs{'note'}=$v } |
646 |
else { delete $fattrs{'note'} } |
647 |
} |
648 |
} |
649 |
if (keepAttr('gene_syn', \%fattrs)) { |
650 |
%xrefs=(); |
651 |
@cx=@{$fattrs{'gene_syn'}}; |
652 |
push(@cx, @{$tdata->[7]}); |
653 |
@xrefs{@cx}=(); |
654 |
@cx=keys(%xrefs); |
655 |
$tdata->[7]=[@cx]; |
656 |
delete $fattrs{'gene_syn'}; |
657 |
} |
658 |
#add new attributes |
659 |
if ($allattrs) { |
660 |
foreach my $k (keys %fattrs) { |
661 |
$ahref->{$k}=$fattrs{$k}; |
662 |
} |
663 |
} |
664 |
else { # $addattrs |
665 |
foreach my $k (keys %fattrs) { |
666 |
$ahref->{$k}=$fattrs{$k} if exists($xattrs{$k}); |
667 |
} |
668 |
} |
669 |
} |
670 |
#--> clean up |
671 |
@fsegs=(); |
672 |
%fattrs=(); |
673 |
($fpartial3, $fpartial5)=(); |
674 |
$lastattr=''; |
675 |
$curfeat=''; |
676 |
} |
677 |
|
678 |
sub writeGene { |
679 |
my ($gene, $gdata)=@_; |
680 |
my $notgene=($gene=~s/^NOT_GENE\|//); |
681 |
my $geneid=$gene; |
682 |
my ($gene_name)=($geneid=~m/^([^~]+)/); |
683 |
$geneid=~tr/~/_/; |
684 |
my ($cbase, $cstrand, $gstart, $gend, $gpseudo)= |
685 |
($gdata->[0], $gdata->[1], $gdata->[2], $gdata->[3], $gdata->[7]); |
686 |
my $gfeature=($gpseudo)?'pseudogene' : 'gene'; |
687 |
#makeUniqID(\$gene, \%gids); # $gene will be updated if necessary |
688 |
#my $geneid=$gene; |
689 |
#$geneid=~tr/~/_/; |
690 |
my $attrs="ID=$geneid;Name=$gene_name"; |
691 |
my @gxrefs=@{$gdata->[4]}; |
692 |
$attrs.=';xrefs='.join(',', @gxrefs) if (@gxrefs>0); |
693 |
my @gsyn=@{$gdata->[8]}; |
694 |
$attrs.=';gene_syn='.join(',', @gsyn) if (@gsyn>0); |
695 |
my $gattrs=$gdata->[5]; |
696 |
my $rnas=$gdata->[6]; |
697 |
my $gcoords; |
698 |
my ($gbase, $strand, $gchrlen); |
699 |
($gbase, $strand, $gcoords, $gchrlen)=contig2chr($cbase, $cstrand, [[$gstart, $gend]]); |
700 |
($gstart, $gend)=($$gcoords[0]->[0], $$gcoords[0]->[1]); |
701 |
my $showpseudo = $gpseudo && (keys(%$rnas)==0); |
702 |
unless (!$showpseudo && ($mrnaonly || $t_only || $notgene)) { |
703 |
foreach my $k (keys(%$gattrs)) { |
704 |
$attrs.=";$k=".$gattrs->{$k}; |
705 |
} |
706 |
print join("\t", $gbase, $track, $gfeature, $gstart, $gend, '.', $strand, '.', $attrs)."\n"; |
707 |
} |
708 |
|
709 |
foreach my $tid (keys(%$rnas)) { |
710 |
#die("Error: no transcript_id found at write_RNA for gene $gene ($geneid)\n$last_line\n") unless $tid; |
711 |
my $tdata=$rnas->{$tid}; |
712 |
my $rnafeat=$tdata->[0]; |
713 |
if ($rnafeat eq 'RNA_exon' || $rnafeat eq 'misc_RNA') { |
714 |
$rnafeat = $gpseudo ? 'pseudoRNA' : 'misc_RNA'; |
715 |
} |
716 |
if ($mrnaonly) { |
717 |
$rnafeat=~s/misc_RNA/mRNA/; |
718 |
} |
719 |
|
720 |
next if $mrnaonly && $rnafeat ne 'mRNA'; |
721 |
# my $texons=$tdata->[1]; |
722 |
my $texons; |
723 |
($gbase, $strand, $texons)=contig2chr($cbase, $cstrand, $tdata->[1]); |
724 |
my $tcds=$tdata->[2]; |
725 |
if ($tcds && @$tcds>0) { |
726 |
($gbase, $strand, $tcds)=contig2chr($cbase, $cstrand, $tdata->[2]); |
727 |
} |
728 |
else { |
729 |
$tcds=''; |
730 |
} |
731 |
my $tahref=$tdata->[3]; #attributes |
732 |
my $txar=$tdata->[4]; # db_xrefs |
733 |
my $txgsyn=$tdata->[7]; # gene_syns |
734 |
my $ncRNA; |
735 |
if (my $rnaclass=$tahref->{'ncRNA_class'}) { |
736 |
$rnafeat=$rnaclass; |
737 |
delete($$tahref{'ncRNA_class'}); |
738 |
$ncRNA=1; |
739 |
} |
740 |
my ($part3, $part5)=($tdata->[5], $tdata->[6]); |
741 |
makeUniqID(\$tid, \%tids); |
742 |
my @exon=sort { $main::a->[0] <=> $main::b->[0] } @$texons; |
743 |
my ($tstart, $tend)=($exon[0]->[0], $exon[-1]->[1]); |
744 |
$tid=~s/^NOT_GENE\|//; |
745 |
my $attrs="ID=$tid"; |
746 |
my $gffname=$tid; |
747 |
if ($notgene) { |
748 |
$attrs.=";Name=$tid"; |
749 |
} |
750 |
else { |
751 |
$attrs.=";Parent=$geneid" unless ($mrnaonly || $t_only); |
752 |
$gffname=$gene_name; |
753 |
$attrs.=";Name=$gene_name;gene_name=$gene_name"; |
754 |
} |
755 |
|
756 |
my $product=$tahref->{'product'}; |
757 |
$ncRNA=1 if $rnafeat eq 'tRNA' || $gffname=~m/NCRNA\d+/; |
758 |
$attrs.=';ncRNA=1' if $ncRNA; |
759 |
$attrs.=';partial=1' if ($part5 || $part3); |
760 |
if ($product) { |
761 |
$product=~tr/"/'/; #" |
762 |
$product=~tr/;/./; |
763 |
# $attrs.=';product="'.$product.'"'; |
764 |
$attrs.=";product=$product"; |
765 |
} |
766 |
if ($moreattrs) { |
767 |
$attrs.=';xrefs='.join(',', @$txar) if ($txar && @$txar>0); |
768 |
$attrs.=';gene_syn='.join(',', @$txgsyn) if ($txgsyn && @$txgsyn>0); |
769 |
foreach my $k (keys(%$tahref)) { |
770 |
next if exists($kattrs{$k}); |
771 |
$attrs.=";$k=".$tahref->{$k}; |
772 |
} |
773 |
} |
774 |
print join("\t", $gbase, $track, $rnafeat, $tstart, $tend, '.', $strand, '.', $attrs)."\n"; |
775 |
unless($ncRNA && @exon==1) { |
776 |
foreach my $ex (@exon) { |
777 |
print join("\t", $gbase, $track, 'exon', $$ex[0], $$ex[1], '.', $strand, '.', "Parent=$tid")."\n"; |
778 |
} |
779 |
} |
780 |
@exon=(); |
781 |
#write_CDS(); |
782 |
if ($tcds) { #have CDS, write it |
783 |
my @cds= sort { $main::a->[0] <=> $main::b->[0] } @$tcds; |
784 |
my $codonstart=$tahref->{'codon_start'}; |
785 |
if ($codonstart>1) { |
786 |
$codonstart--; |
787 |
if ($strand eq '-') { |
788 |
$cds[-1]->[1]-=$codonstart; |
789 |
} |
790 |
else { |
791 |
$cds[0]->[0]+=$codonstart; |
792 |
} |
793 |
} |
794 |
else { $codonstart='0'; } |
795 |
|
796 |
foreach my $cd (@cds) { #TODO: calculate phase ? |
797 |
print join("\t", $gbase, $track, 'CDS', $$cd[0], $$cd[1], '.', $strand, '.', "Parent=$tid")."\n"; |
798 |
} |
799 |
} |
800 |
} #for each transcript |
801 |
} |
802 |
|
803 |
sub contig2chr { |
804 |
my ($ctg, $ctgstrand, $ctgsegs)=@_; |
805 |
my $cd=$ctgmap{$ctg}; |
806 |
die("Error: cannot find chromosome mapping for contig $ctg!\n") unless $ctg; |
807 |
my ($chr, $chrstrand, $chrstart, $chrend, $glacc, $chrname)=@$cd; |
808 |
unless ($ctgmapfile) { |
809 |
return ($chrname, $ctgstrand, [@$ctgsegs], $ctgmap{$ctg}->[3]); |
810 |
} |
811 |
my $clen=$chrend-$chrstart+1; |
812 |
my @chrsegs; |
813 |
foreach my $c (@$ctgsegs) { |
814 |
my ($start, $end)=($chrstrand eq '-') ? ($clen+$chrstart-$$c[1],$clen+$chrstart-$$c[0]) : |
815 |
($$c[0]+$chrstart-1, $$c[1]+$chrstart-1); |
816 |
push(@chrsegs, [$start ,$end]); |
817 |
} |
818 |
my $nstrand=($ctgstrand eq $chrstrand) ? '+' : '-' ; |
819 |
return ($chr, $nstrand, [@chrsegs], $clen, $chrname); |
820 |
} |
821 |
|
822 |
|
823 |
sub keepAttr { |
824 |
my ($attr, $href)=@_; |
825 |
|
826 |
if ($allattrs) { |
827 |
if ($href) { return exists($href->{$attr}); } |
828 |
else { return 1; } |
829 |
} |
830 |
elsif ($addattrs) { |
831 |
if (exists($xattrs{$attr})) { |
832 |
if ($href) { return exists($href->{$attr}); } |
833 |
else { return 1; } |
834 |
} |
835 |
else { return 0; } |
836 |
} |
837 |
else { return 0; } |
838 |
} |
839 |
|
840 |
sub GMessage { |
841 |
print STDERR join("\n",@_)."\n"; |
842 |
} |