ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gbs2gff_chr.pl
Revision: 117
Committed: Wed Nov 23 21:27:59 2011 UTC (12 years, 8 months ago) by gpertea
File size: 26725 byte(s)
Log Message:
gbs2gff fixes

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 gbs2gff_chr.pl -p <prot_id2transcript_id-file> [-m <seq_contig.md>] [-M][-T] \
8 [-f <mapflt>] [-c genome.fa.fai] [-t <track>][-A][-a <attr1,attr2,..>] \
9 <gbs_stream>
10 Creates chromosome gff annotation files from NCBI GB data
11
12 Takes the contig mappings (onto chromosomes) from <seq_contig.md> (optional),
13 filtering it by group_label if requested (e.g. -f Primary_Assembly)
14 and converts the input GenBank formatted data into a gff file.
15
16 Note: only these features are parsed:
17 gene, mRNA, CDS, misc_RNA, ncRNA, rRNA, tRNA
18 Currently these are discarded:
19 C_region, V_segment
20 Options:
21 -A write all attributes
22 -a write these attributes (comma delimited list)
23 -M parse and write mRNA annotations only
24 -T do not write parent "gene" records
25 -K for -c option, skip genome sequences for which the sequence
26 length cannot be verified
27 /;
28 # -S write fasta files with the chromosome\/contig sequences
29
30 umask 0002;
31 getopts('p:c:m:f:o:t:a:KTMA') || die($usage."\n");
32 my $ctgmapfile=$Getopt::Std::opt_m;
33 my $f_out=$Getopt::Std::opt_o;
34 my $skip_no_len_check=$Getopt::Std::opt_K;
35 my $writeseq=$Getopt::Std::opt_S;
36 my $addattrs=$Getopt::Std::opt_a;
37 my $mapflt=$Getopt::Std::opt_f;
38 my $p2nfile=$Getopt::Std::opt_p;
39 my $allattrs=$Getopt::Std::opt_A;
40 my $t_only=$Getopt::Std::opt_T; # no gene parent, print transcripts only
41 my $mrnaonly=$Getopt::Std::opt_M;
42 my $chrlen_test=$Getopt::Std::opt_c; #must be a SAMTools faidx file
43 my %chrsizes;
44 if ($chrlen_test && -f $chrlen_test) {
45 open(FAI, $chrlen_test) || die("Error opening $chrlen_test!\n");
46 while (<FAI>) {
47 chomp;
48 next unless $_;
49 next if m/^\s*#/;
50 my @t=split;
51 $chrsizes{$t[0]}=$t[1];
52 }
53 close(FAI);
54 }
55 die("Error: -p is required -- to map protein_id to transcript_id\n")
56 unless $p2nfile && -f $p2nfile;
57 my %pmap; # protein_id => transcript_id
58 open(PROTMAP, $p2nfile) || die("Error opening $p2nfile!\n");
59 while (<PROTMAP>) {
60 chomp;
61 my ($p, $n)=split;
62 next unless $n;
63 $p=~s/\.\d+$//;
64 $n=~s/\.\d+$//; # discard versioning
65 $pmap{$p}=$n;
66 }
67 close(PROTMAP);
68
69 my %haplotypes = (
70 'GL000250' => 'chr6_apd_hap1',
71 'GL000251' => 'chr6_cox_hap2',
72 'GL000252' => 'chr6_dbb_hap3',
73 'GL000253' => 'chr6_mann_hap4',
74 'GL000254' => 'chr6_mcf_hap5',
75 'GL000255' => 'chr6_qbl_hap6',
76 'GL000256' => 'chr6_ssto_hap7',
77 'GL000257' => 'chr4_ctg9_hap1',
78 'GL000258' => 'chr17_ctg5_hap1'
79 );
80
81 # 0 1 2 3 4 5 6
82 my %ctgmap; # $ctg=> [chr, strand, chr_start, chr_end, ctg_gl_acc, chr_name, seq_definition]
83 if ($ctgmapfile) {
84 open(CTGMAP, $ctgmapfile) || die("Error opening $ctgmapfile\n");
85 while (<CTGMAP>) {
86 next if m/^#/;
87 my @t=split(/\t/);
88 next if $t[6] eq 'na';
89 next if $mapflt && $t[8]!~/$mapflt/i;
90 my $ctgacc=$t[5];
91 $ctgacc=~s/\.\d+$//; # remove version
92 my $chrname='chr'.$t[1];
93 $chrname=~s/\.\d+$//;
94 my ($chrstart, $chrend)=($t[2], $t[3]);
95 ($chrstart, $chrend)=($chrend, $chrstart) if $chrend<$chrstart;
96 my $chrstrand=($t[4] eq '-') ? '-' : '+';
97 my $ctgdata=[$chrname, $chrstrand, $chrstart, $chrend, '', ''];
98 $ctgmap{$t[5]}=$ctgdata;
99 $ctgmap{$ctgacc}=$ctgdata;
100 }
101 close(CTGMAP);
102 }
103 #key attributes to always keep :
104 my %kattrs; @kattrs{'gene', 'product', 'transcript_id', 'codon_start'}=();
105 my %xattrs; #extra attributes to keep
106 if ($addattrs) {
107 $addattrs=~tr/ //d;
108 my @add=split(/\,/,$addattrs);
109 @xattrs{@add}=();
110 }
111 my $moreattrs = ( $allattrs || $addattrs );
112 # ---------------
113 my %gids; #ensure unique gene IDs
114 #allow for duplication $gids{$gene}=[ [gstart1, gend1], [gstart2, gend2] ]
115 my %tids; # and transcript IDs
116 #----
117 # -- per gbase:
118
119 my %genes; # genes{$gene}=[$gbase, $strand, $gstart, $gend, [@gxrefs], \%gattrs, \%rnas, isPseudo, [@genesyms] ]
120 # 0 1 2 3 4 5 6 7 8
121 # %rnas hash holds all mRNA data for a single gene (i.e. all isoforms)
122 # 0 1 2 3 4 5 6 7
123 # $rnas{transcript_id}=[ $featname, [@exon], [@cds], \%tattrs, [@txrefs], $partial3, $partial5, [@genesyms] ]
124 #current values:
125 my ($curctg, $curctglen, $curcmt); # per contig
126 my $cursection;
127 my ($curfeat, $curfstrand, @fsegs, %fattrs, $fpartial5, $fpartial3); # per feature
128 # $gpseudo, $gstart, $gend, $partial5, $partial3);
129 my $last_line;
130 my $lastattr;
131 my $track=$Getopt::Std::opt_t || 'ncbi';
132 #only parse these sections:
133 my %sparse;@sparse{'FEATURES', 'COMMENT'}=(1,1); #sections for which to parse multiple rows
134
135 #only parse these features (WARNING: check this everyonce in a while, NCBI may change these tags
136 # gzip -cd *.gbs.gz | perl -ne 'print $1."\n" if m/^\s{4,8}(\S+)\s{2,}\S+/' | sort -u
137 #
138 my %fparse;
139 @fparse{'gene','misc_RNA','mRNA','ncRNA', 'CDS', 'rRNA',
140 'tRNA', 'snoRNA','miRNA','microRNA','micro_RNA', 'exon',
141 'C_region', 'D_segment', 'J_segment', 'V_segment'}=();
142 my %fskip; @fskip{'source', 'gap', 'repeat'}=();
143 my $skipctg;
144 if ($f_out) {
145 open(F_OUT, '>'.$f_out) || die("Error creating output file $f_out\n");
146 select(F_OUT);
147 }
148 while (<>) {
149 LSKIP:
150 $last_line=$_;
151 if (m/^ORIGIN\s*$/) {
152 # read sequence here
153 my $ctgdata=$ctgmap{$curctg};
154 my ($c_chr,$c_acc, $c_def)=($ctgdata->[0], $ctgdata->[4], $ctgdata->[6]);
155 my $defline=">$c_chr $c_acc";
156 $defline.=" $c_def" if $c_def;
157 my $fname=$c_chr.'.fa';
158 flushGSeq();
159 if ($writeseq) {
160 open(FASEQ, '>'.$fname) || die ("Error creating fasta file $fname!\n");
161 print FASEQ $defline."\n";
162 }
163 my $l;
164 while (($l=<>)) {
165 $l=~s/[\s\n\r]+$//s;
166 last if $l eq '//';
167 my ($coord, $seq)=($l=~m/^\s*(\d+)\s+(\w[ \w]*)/);
168 $seq=~tr/ \t//d;
169 if ($writeseq) {
170 print FASEQ uc($seq)."\n";
171 }
172 }
173 close(FASEQ) if $writeseq;
174 next;
175 }
176 if (m/^([A-Z,0-9]+)\s+(\S.+)/) {
177 ($cursection, my $rest)=($1,$2);
178 if ($cursection eq 'LOCUS') {
179 flushGSeq(); # flush contig features accumulated so far
180 my ($ctgacc, $ctglen)=($rest=~m/^(\w+)\s+(\d+)\s+bp/);
181 die("Error: cannot parse the LOCUS line:\n$last_line\n") unless $ctglen>1;
182 my $cd;
183 if ($ctgmapfile) {
184 $cd=$ctgmap{$ctgacc};
185 unless ($cd) {
186 print STDERR "Skipping contig $ctgacc (no chromosome entry)\n";
187 $skipctg=1;
188 next;
189 }
190 die("Error: mismatch between contig $ctgacc length and its chromosome positioning!\n")
191 unless $ctglen == $cd->[3]-$cd->[2]+1;
192 }
193 $skipctg=0;
194 $curctg=$ctgacc;
195 $curctglen=$ctglen;
196 next;
197 }
198 next if $skipctg;
199 if ($cursection eq 'DEFINITION') {
200 unless ($ctgmapfile) {
201 my ($chrnum)=($rest=~m/chromosome\s+([\dXYM]+)/i);
202 unless ($chrnum) {
203 if (m/ mitochondrion/) {
204 $chrnum='M';
205 }
206 elsif (m/unplaced genomic contig/) {
207 $chrnum='Un';
208 }
209 }
210 die("Error parsing chromosome name! ($rest)\n") unless $chrnum;
211 $rest=~s/\.$//;
212 my $chrid='chr'.$chrnum;
213 $ctgmap{$curctg}=[$chrid, '+', 1, $curctglen, '', '', $rest];
214 }
215 next;
216 }
217 if ($cursection eq 'VERSION') {
218 unless ($ctgmapfile) {
219 my ($accver)=($rest=~m/^\s*\w+(\.\d+)/);
220 die("Error parsing accession.version! ($rest)\n") unless $accver;
221 $ctgmap{$curctg}->[4]=$curctg.$accver;
222 }
223 next;
224 }
225 if ($cursection eq 'COMMENT') {
226 $curcmt=$rest;
227 }
228 next;
229 } #big section start
230 next if $skipctg;
231 next unless exists $sparse{$cursection};
232 if ($cursection eq 'COMMENT') {
233 chomp;
234 s/^\s+/ /;
235 $curcmt.=$_;
236 }
237 elsif ($cursection eq 'FEATURES') {
238 # current contig information has been parsed by now
239 if ($curctg && $curcmt) {
240 my $ctgdata=$ctgmap{$curctg};
241 die("Error: no chromosome mapping for contig $curctg!\n") unless $ctgdata;
242 my ($glacc)=($curcmt=~m/sequence is identical to\s+(GL\d+)/s);
243 my $chrname=$$ctgdata[0];
244 if ($glacc) {
245 $$ctgdata[4]=$glacc;
246 my $chr=$$ctgdata[0];
247 if ($chr=~m/^(chr\w+)/) {
248 my $chrbase=$1;
249 # try to match UCSC naming scheme
250 if ($chrbase eq 'chrUn') {
251 $chrname=$chrbase.'_'.lc($glacc);
252 }
253 else {
254 if (exists($haplotypes{$glacc})) {
255 $chrname=$haplotypes{$glacc};
256 }
257 else {
258 $chrname=$chrbase.'_'.lc($glacc).'_random';
259 }
260 }
261 }
262 }
263 $$ctgdata[5]=$chrname;
264 if ($chrlen_test) {
265 my $oldchrlen=$chrsizes{$chrname};
266 if ($oldchrlen) {
267 die("Error: $chrname length mismatch (expected '$chrsizes{$chrname}' but found $curctglen) !\n")
268 unless ($oldchrlen==$curctglen);
269 }
270 else {
271 my $msg="Warning: couldn't check length of $chrname.";
272 if ($skip_no_len_check) {
273 $skipctg=1;
274 delete $ctgmap{$curctg};
275 GMessage($msg." Skipped.");
276 next;
277 }
278 GMessage($msg);
279 }
280 }
281 }
282 #big FEATURES block
283 if (m/^\s{3,8}(['\-\w]+)\s{2,}(\S+)/) { # a feature starts here (gene/*RNA/CDS/exon
284 my ($newfeat, $rest)=($1,$2);
285 flushFeature() if $curfeat;
286 $lastattr='';
287 $curfeat=$newfeat;
288
289 # next unless exists($fparse{$curfeat});
290 next if exists($fskip{$curfeat});
291 unless (exists($fparse{$curfeat})) {
292 GMessage("Warning: skipping unknown feature: $curfeat!");
293 next;
294 }
295 # check for coordinate definition
296 $_=$rest;
297 @fsegs=();
298 $curfstrand=(s/complement\(//)?'-':'+';
299 my $multi=(s/join\(//);
300 $fpartial5=1 if m/^\<\d+/;
301 s/\s+$//;
302 my $readnext;
303 do {
304 my @s=(m/([\d+\.\>]+)/g);
305 $readnext=m/\,$/;
306 foreach my $seg (@s) {
307 my ($x1,$x2)=($seg=~m/(\d+)/g);
308 unless ($x2) { # microexons may be given as only one coordinate
309 $x2=$x1;
310 }
311 else {
312 ($x1,$x2)=($x2,$x1) if $x1>$x2;
313 }
314 push(@fsegs, [$x1,$x2]);
315 }
316 $_=<> if $readnext;
317 } while ($readnext); #parse feature segments (exons)
318 $fpartial3=1 if m/^\.\.\>\d+\)*$/;
319 die("Error: couldn't parse segment coordinates for feature $curfeat on $curctg\n$_\n") unless @fsegs>0;
320 next;
321 } # coordinate line for a gene/mRNA/CDS/exon/*RNA feature
322 #next unless exists($fparse{$curfeat}); #only parse recognizable features
323 next if exists($fskip{$curfeat});
324 unless (exists($fparse{$curfeat})) {
325 GMessage("Warning: skipping unknown feature: $curfeat!");
326 next;
327 }
328
329 # -- here we are within a feature block (after coordinates were parsed)
330 #-- attribute parsing
331 chomp;
332 if (m/^\s{9,}\/([^=]+)(.*)/) { # attribute start
333 my ($attr, $value)=($1,$2);
334 if ($value) {
335 $value=~s/^=//;
336 $value=~tr/;/,/d; #"
337 $value=~s/^"//;
338 $value=~s/"$//;
339 }
340 else { $value=1 };
341 $attr=~s/^gene_syn\w+/gene_syn/;
342 if ($attr eq 'db_xref' || $attr eq 'gene_syn') {
343 $value=~tr/ \n\t\r//d;
344 my @vmulti=split(/[\;\,]/, $value);
345 if ($attr eq 'gene_syn' && @vmulti>1) {
346 push(@{$fattrs{$attr}}, @vmulti);
347 }
348 else {
349 push(@{$fattrs{$attr}}, $value);
350 }
351 }
352 else {
353 $fattrs{$attr}=$value;
354 }
355 $lastattr=$attr;
356 } #attribute start line
357 else { #value continuation line for last attribute:
358 die("Error: invalid line encountered when attribute continuation was expected!\n".
359 "$last_line\ncurctg=$curctg, curfeat=$curfeat, lastattr=$lastattr\n")
360 unless $lastattr && exists $fattrs{$lastattr};
361 s/^\s+/ /;
362 s/"$//;
363 my $v=$_;
364 if ($lastattr eq 'gene_syn') {
365 $v=~tr/ \n\t\r//d;
366 my @vmulti=split(/[\;\,]/,$v);
367 push(@{$fattrs{$lastattr}}, @vmulti);
368 }
369 else {
370 $fattrs{$lastattr}.=$v;
371 }
372 } #attribute continuation
373 #if ($lastattr eq 'note') {
374 # my $v=$fattrs{$lastattr};
375 # if ($v=~s/Derived by automated(.+?)prediction method: ([^\.]+)\..*$/predicted by: $2/)
376 # { $fattrs{$lastattr}=$v }
377 # }
378
379 }# we are in FEATURES block
380 } #for each input line
381
382 flushGSeq();
383
384 if ($f_out) {
385 close(F_OUT);
386 }
387
388
389 #---------------------------------------------------------
390 sub flushGSeq {
391 return unless $curctg;
392 flushFeature(); # close last feature read
393 return unless keys(%genes);
394
395 my $gcount=keys(%genes);
396 while (my ($g,$gd)=each(%genes)) {
397 writeGene($g,$gd);
398 }
399 %genes=();
400 $curctg='';
401 $curctglen=0;
402 $curcmt='';
403 $cursection='';
404 #-- debug only:
405 #exit;
406 }
407
408
409 sub makeUniqID {
410 my ($idref, $href)=@_;
411 my $id=$$idref;
412 $id=~s/~\d+$//;
413 my $c = ++$$href{$id};
414 if ($c>1) {
415 $c--;
416 $$idref=$id.'.m'.$c;
417 }
418 return $$idref;
419 }
420
421 sub createGeneEntry {
422 my ($gene, $gstart, $gend)=@_;
423 push(@{$gids{$gene}},[$curctg, $gstart, $gend]);
424 my $gno=scalar(@{$gids{$gene}})-1;
425 $gene.='~'.$gno if $gno>0;
426 if (exists($genes{$gene})) { #should never happen, we just made sure it's unique!
427 my $prev_gstart=$genes{$gene}->[2];
428 my $prev_gend=$genes{$gene}->[3];
429 # if (abs($prev_gstart-$gstart)>30000) {
430 die("Error -- gene $gene is duplicated ($prev_gstart vs $gstart) !?\n");
431 # }
432 # else { #update min..max coordinates
433 # $genes{$gene}->[2]=$gstart if $gstart<$prev_gstart;
434 # $genes{$gene}->[3]=$gend if $gend>$prev_gend;
435 # ${$genes{$gene}->[5]}{'fragmented'}=1;
436 # }
437 }
438 else
439 { # create a new gene entry here
440 my $pseudo= delete($fattrs{'pseudo'}) ? 1 : 0;
441 # 0 1 2 3 4 5 6 7
442 $genes{$gene}=[$curctg, $curfstrand, $gstart, $gend, [], {}, {}, $pseudo, []];
443 }
444 delete $fattrs{'gene'};
445 if ($moreattrs) {
446 if (keepAttr('db_xref', \%fattrs)) {
447 $genes{$gene}->[4]=[@{$fattrs{'db_xref'}}];
448 delete $fattrs{'db_xref'};
449 }
450 #another special case: gene_syn
451 if (keepAttr('gene_syn', \%fattrs)) {
452 $genes{$gene}->[8]=[@{$fattrs{'gene_syn'}}];
453 delete $fattrs{'gene_syn'};
454 }
455 my $garef=$genes{$gene}->[5];
456 if ($allattrs) {
457 my @k=keys(%fattrs);
458 @{$garef}{@k}=(values(%fattrs));
459 } else{
460 foreach my $k (keys(%fattrs)) {
461 $garef->{$k}=$fattrs{$k} if exists($xattrs{$k});
462 }
463 }
464 } #more attributes are kept
465 return $gene;
466 }
467
468
469 sub flushFeature { #called after a feature attributes block has been parsed into %fattrs
470 return unless $curfeat && keys(%fattrs)>0;
471 my $v=$fattrs{'note'};
472 if ($v &&
473 $v=~s/Derived by automated(.+?)prediction method: ([^\.]+)\..*$/predicted by: $2/) {
474 $fattrs{'note'}=$v;
475 }
476 my $gene=$fattrs{'gene'};
477 my @exon= sort { $main::a->[0] <=> $main::b->[0] } @fsegs;
478 my ($xstart, $xend)=($exon[0]->[0], $exon[-1]->[1]);
479 if (!$gene && $curfeat=~m/[^m]RNA/) {
480 my $t=$fattrs{'product'};
481 if (length($t)<14) {
482 $t=~tr/- ,/___/s;
483 $gene=$t;
484 }
485 else {
486 $gene=$curfeat;
487 }
488 $gene='NOT_GENE|'.$gene;
489 # clearly this is an orphan RNA entry (with no gene)
490 # so we create a dummy gene entry AND a transcript entry here
491 createGeneEntry($gene, $xstart, $xend);
492 }
493
494 die("Error: no gene attribute found for current feature $curfeat\n")
495 unless $gene;
496 if ($curfeat eq 'gene') { # gene attributes processing
497 #make sure it's unique
498 die("Error: gene feature ($gene) has multiple segments?\n") if @fsegs>1;
499 die("Error: gene feature ($gene) has no coordinates?\n") if @fsegs==0;
500 createGeneEntry($gene, $xstart, $xend);
501 @fsegs=();
502 %fattrs=();
503 ($fpartial3, $fpartial5)=();
504 return; #gene data added
505 } # stored gene attributes
506 # else
507 # - storing RNA attributes
508 # now make sure we place this RNA within the right gene - in case of gene duplications
509 # locate the matching $gene
510 my $gxdata=$gids{$gene};
511 unless ($gxdata) {
512 # strange, this looks like another orphan xRNA entry
513 createGeneEntry($gene, $xstart, $xend);
514 $gxdata=$gids{$gene};
515 }
516
517 die("Error: gene $gene not found in \%gids !\n") unless $gxdata; #should never happen
518 if (@$gxdata>1) {
519 my $i=0;
520 my $found;
521 foreach my $gd (@$gxdata) {
522 if ($gd->[0] eq $curctg && $gd->[1]<=$xstart && $gd->[2]>=$xend) {
523 $found=1;
524 last;
525 }
526 $i++;
527 }
528 die("Error: cannot find gene enclosing RNA data ".
529 $fattrs{'transcript_id'}." ".$fattrs{'protein_id'}." (on $curctg, $xstart..$xend)\n")
530 unless $found;
531 $gene.='~'.$i if $i>0;
532 }
533
534 my $geneid=$gene;
535 my $gdata=$genes{$gene};
536 die("Error: \$genes data for gene '$gene' cannot be found!\n") unless $gdata;
537 my $rnas=$gdata->[6];
538 my $gpseudo=$gdata->[7];
539 $geneid=~tr/~/_/;
540 my ($tid, $pid);
541 my $isMito=($ctgmap{$curctg}->[0] eq 'chrM'); #different format for mitochondrial entries
542 $pid=$fattrs{'protein_id'};
543 my $isCDS = ($curfeat eq 'CDS');
544 if ($isCDS && $pid) { # just finished a CDS parsing
545 #unless ($pid) {
546 # print STDERR "Warning: no protein_id found for CDS feature of gene $geneid (on $curctg).\n";
547 # @fsegs=();
548 # %fattrs=();
549 # ($fpartial3, $fpartial5)=();
550 # return;
551 # }
552 my @t=split(/\|/,$pid);
553 $pid=$t[1] if (@t>1);
554 $pid=~s/\.\d+$//; #remove version
555 $tid=$pmap{$pid};
556 die("Error: cannot get transcript for protein id $pid (gene $geneid)!\n") unless $tid;
557 unless (exists($rnas->{$tid})) { #this should never happen - CDS without mRNA definition?
558 # --> yeah, it does happen for Mitochondrial chromosomes!
559 if ($isMito) {
560 $tid=$pid;
561 # 0 1 2 3 4 5 6 7
562 #$rnas->{$tid}=['mRNA', [@exon], [], {}, [], 0, 0, []];
563 $rnas->{$tid}=['mRNA', [@exon], [@exon], {}, [], 0, 0, []];
564 }
565 else {
566 $tid=~s/\.\d+$//;
567 die("Error: no mRNA entry for CDS/protein $pid (reported as coming from transcript $tid)\n")
568 unless exists($rnas->{$tid});
569 }
570 }
571 # #just add the CDS
572 push(@{$rnas->{$tid}->[2]}, @exon);
573 } #CDS
574 else { # exons, other RNA features or CDS without protein_id
575 if ($isCDS) {
576 my $msg="Warning: no protein_id for CDS feature of gene $geneid (on $curctg)";
577 if ($gpseudo) {
578 # discard CDS segments for pseudogenes
579 @fsegs=();
580 %fattrs=();
581 ($fpartial3, $fpartial5)=();
582 $msg=~s/ gene/ pseudogene/;
583 GMessage("$msg. Discarded.");
584 return;
585 }
586 GMessage("$msg.");
587 }
588 $tid=$fattrs{'transcript_id'};
589 if ($tid) {
590 my @t=split(/\|/,$tid);
591 $tid=$t[1] if (@t>1);
592 #$tid=~s/^[a-z]+\|//i;
593 #$tid=~s/\|\w*$//; # some entries have a suffix - e.g. ref|NM_006824.1|P40
594 $tid=~s/\.\d+$//; #remove version!
595 }
596 else {
597 $tid=$geneid.'.t'; #some special cases with features parented directly by [pseudo] genes
598 }
599 unless (exists($rnas->{$tid})) {
600 my $frna=$curfeat;
601 if ($frna!~m/RNA/) {
602 $frna= $gpseudo? 'pseudoRNA' : 'misc_RNA';
603 }
604 if ($isCDS) {
605 # 0 1 2 3 4 5 6 7
606 $rnas->{$tid}=[$frna, [], [@exon], {}, [], 0, 0, [] ];
607 }
608 else {
609 # 0 1 2 3 4 5 6 7
610 $rnas->{$tid}=[$frna, [@exon], [], {}, [], 0, 0, [] ];
611 }
612 }
613 else { #just add the exons
614 my $segidx = $isCDS ? 2 : 1;
615 push(@{$rnas->{$tid}->[$segidx]}, @exon);
616 }
617 } #exon segments
618 my $tdata=$rnas->{$tid};
619 $tdata->[5]=1 if $fpartial5;
620 $tdata->[6]=1 if $fpartial3;
621 # add attributes
622 delete @fattrs{'gene', 'transcript_id'};
623 delete $fattrs{'protein_id'} unless exists($xattrs{'protein_id'});
624 my $product = delete $fattrs{'product'};
625 my $ahref=$tdata->[3];
626 $ahref->{'product'}=$product if $product;
627 if (my $ncRNAclass=delete $fattrs{'ncRNA_class'}) {
628 $ahref->{'ncRNA_class'}=$ncRNAclass;
629 }
630 if ($moreattrs) {
631 my %xrefs;
632 my @cx;
633 @cx=@{$fattrs{'db_xref'}} if keepAttr('db_xref', \%fattrs); #if exists $fattrs{'db_xref'};
634 push(@cx, @{$tdata->[4]});
635 @xrefs{@cx}=();
636 @cx=keys(%xrefs);
637 $tdata->[4]=[@cx];
638 delete $fattrs{'db_xref'};
639 #if (exists $fattrs{'gene_syn'}) {
640 if ($isCDS && (my $v=$fattrs{'note'})) {
641 if ($v=~s/isoform .+? encoded by transcript [\w \-\,\.]+//) {
642 $v=~s/^\;?\s+//;
643 $v=~s/\;?\s+$//;
644 $v=~s/\;\s*/, /;
645 if (length($v)>2) { $fattrs{'note'}=$v }
646 else { delete $fattrs{'note'} }
647 }
648 }
649 if (keepAttr('gene_syn', \%fattrs)) {
650 %xrefs=();
651 @cx=@{$fattrs{'gene_syn'}};
652 push(@cx, @{$tdata->[7]});
653 @xrefs{@cx}=();
654 @cx=keys(%xrefs);
655 $tdata->[7]=[@cx];
656 delete $fattrs{'gene_syn'};
657 }
658 #add new attributes
659 if ($allattrs) {
660 foreach my $k (keys %fattrs) {
661 $ahref->{$k}=$fattrs{$k};
662 }
663 }
664 else { # $addattrs
665 foreach my $k (keys %fattrs) {
666 $ahref->{$k}=$fattrs{$k} if exists($xattrs{$k});
667 }
668 }
669 }
670 #--> clean up
671 @fsegs=();
672 %fattrs=();
673 ($fpartial3, $fpartial5)=();
674 $lastattr='';
675 $curfeat='';
676 }
677
678 sub writeGene {
679 my ($gene, $gdata)=@_;
680 my $notgene=($gene=~s/^NOT_GENE\|//);
681 my $geneid=$gene;
682 my ($gene_name)=($geneid=~m/^([^~]+)/);
683 $geneid=~tr/~/_/;
684 my ($cbase, $cstrand, $gstart, $gend, $gpseudo)=
685 ($gdata->[0], $gdata->[1], $gdata->[2], $gdata->[3], $gdata->[7]);
686 my $gfeature=($gpseudo)?'pseudogene' : 'gene';
687 #makeUniqID(\$gene, \%gids); # $gene will be updated if necessary
688 #my $geneid=$gene;
689 #$geneid=~tr/~/_/;
690 my $attrs="ID=$geneid;Name=$gene_name";
691 my @gxrefs=@{$gdata->[4]};
692 $attrs.=';xrefs='.join(',', @gxrefs) if (@gxrefs>0);
693 my @gsyn=@{$gdata->[8]};
694 $attrs.=';gene_syn='.join(',', @gsyn) if (@gsyn>0);
695 my $gattrs=$gdata->[5];
696 my $rnas=$gdata->[6];
697 my $gcoords;
698 my ($gbase, $strand, $gchrlen);
699 ($gbase, $strand, $gcoords, $gchrlen)=contig2chr($cbase, $cstrand, [[$gstart, $gend]]);
700 ($gstart, $gend)=($$gcoords[0]->[0], $$gcoords[0]->[1]);
701 my $showpseudo = $gpseudo && (keys(%$rnas)==0);
702 unless (!$showpseudo && ($mrnaonly || $t_only || $notgene)) {
703 foreach my $k (keys(%$gattrs)) {
704 $attrs.=";$k=".$gattrs->{$k};
705 }
706 print join("\t", $gbase, $track, $gfeature, $gstart, $gend, '.', $strand, '.', $attrs)."\n";
707 }
708
709 foreach my $tid (keys(%$rnas)) {
710 #die("Error: no transcript_id found at write_RNA for gene $gene ($geneid)\n$last_line\n") unless $tid;
711 my $tdata=$rnas->{$tid};
712 my $rnafeat=$tdata->[0];
713 if ($rnafeat eq 'RNA_exon' || $rnafeat eq 'misc_RNA') {
714 $rnafeat = $gpseudo ? 'pseudoRNA' : 'misc_RNA';
715 }
716 if ($mrnaonly) {
717 $rnafeat=~s/misc_RNA/mRNA/;
718 }
719
720 next if $mrnaonly && $rnafeat ne 'mRNA';
721 # my $texons=$tdata->[1];
722 my $texons;
723 ($gbase, $strand, $texons)=contig2chr($cbase, $cstrand, $tdata->[1]);
724 my $tcds=$tdata->[2];
725 if ($tcds && @$tcds>0) {
726 ($gbase, $strand, $tcds)=contig2chr($cbase, $cstrand, $tdata->[2]);
727 }
728 else {
729 $tcds='';
730 }
731 my $tahref=$tdata->[3]; #attributes
732 my $txar=$tdata->[4]; # db_xrefs
733 my $txgsyn=$tdata->[7]; # gene_syns
734 my $ncRNA;
735 if (my $rnaclass=$tahref->{'ncRNA_class'}) {
736 $rnafeat=$rnaclass;
737 delete($$tahref{'ncRNA_class'});
738 $ncRNA=1;
739 }
740 my ($part3, $part5)=($tdata->[5], $tdata->[6]);
741 makeUniqID(\$tid, \%tids);
742 my @exon=sort { $main::a->[0] <=> $main::b->[0] } @$texons;
743 my ($tstart, $tend)=($exon[0]->[0], $exon[-1]->[1]);
744 $tid=~s/^NOT_GENE\|//;
745 my $attrs="ID=$tid";
746 my $gffname=$tid;
747 if ($notgene) {
748 $attrs.=";Name=$tid";
749 }
750 else {
751 $attrs.=";Parent=$geneid" unless ($mrnaonly || $t_only);
752 $gffname=$gene_name;
753 $attrs.=";Name=$gene_name;gene_name=$gene_name";
754 }
755
756 my $product=$tahref->{'product'};
757 $ncRNA=1 if $rnafeat eq 'tRNA' || $gffname=~m/NCRNA\d+/;
758 $attrs.=';ncRNA=1' if $ncRNA;
759 $attrs.=';partial=1' if ($part5 || $part3);
760 if ($product) {
761 $product=~tr/"/'/; #"
762 $product=~tr/;/./;
763 # $attrs.=';product="'.$product.'"';
764 $attrs.=";product=$product";
765 }
766 if ($moreattrs) {
767 $attrs.=';xrefs='.join(',', @$txar) if ($txar && @$txar>0);
768 $attrs.=';gene_syn='.join(',', @$txgsyn) if ($txgsyn && @$txgsyn>0);
769 foreach my $k (keys(%$tahref)) {
770 next if exists($kattrs{$k});
771 $attrs.=";$k=".$tahref->{$k};
772 }
773 }
774 print join("\t", $gbase, $track, $rnafeat, $tstart, $tend, '.', $strand, '.', $attrs)."\n";
775 unless($ncRNA && @exon==1) {
776 foreach my $ex (@exon) {
777 print join("\t", $gbase, $track, 'exon', $$ex[0], $$ex[1], '.', $strand, '.', "Parent=$tid")."\n";
778 }
779 }
780 @exon=();
781 #write_CDS();
782 if ($tcds) { #have CDS, write it
783 my @cds= sort { $main::a->[0] <=> $main::b->[0] } @$tcds;
784 my $codonstart=$tahref->{'codon_start'};
785 if ($codonstart>1) {
786 $codonstart--;
787 if ($strand eq '-') {
788 $cds[-1]->[1]-=$codonstart;
789 }
790 else {
791 $cds[0]->[0]+=$codonstart;
792 }
793 }
794 else { $codonstart='0'; }
795
796 foreach my $cd (@cds) { #TODO: calculate phase ?
797 print join("\t", $gbase, $track, 'CDS', $$cd[0], $$cd[1], '.', $strand, '.', "Parent=$tid")."\n";
798 }
799 }
800 } #for each transcript
801 }
802
803 sub contig2chr {
804 my ($ctg, $ctgstrand, $ctgsegs)=@_;
805 my $cd=$ctgmap{$ctg};
806 die("Error: cannot find chromosome mapping for contig $ctg!\n") unless $ctg;
807 my ($chr, $chrstrand, $chrstart, $chrend, $glacc, $chrname)=@$cd;
808 unless ($ctgmapfile) {
809 return ($chrname, $ctgstrand, [@$ctgsegs], $ctgmap{$ctg}->[3]);
810 }
811 my $clen=$chrend-$chrstart+1;
812 my @chrsegs;
813 foreach my $c (@$ctgsegs) {
814 my ($start, $end)=($chrstrand eq '-') ? ($clen+$chrstart-$$c[1],$clen+$chrstart-$$c[0]) :
815 ($$c[0]+$chrstart-1, $$c[1]+$chrstart-1);
816 push(@chrsegs, [$start ,$end]);
817 }
818 my $nstrand=($ctgstrand eq $chrstrand) ? '+' : '-' ;
819 return ($chr, $nstrand, [@chrsegs], $clen, $chrname);
820 }
821
822
823 sub keepAttr {
824 my ($attr, $href)=@_;
825
826 if ($allattrs) {
827 if ($href) { return exists($href->{$attr}); }
828 else { return 1; }
829 }
830 elsif ($addattrs) {
831 if (exists($xattrs{$attr})) {
832 if ($href) { return exists($href->{$attr}); }
833 else { return 1; }
834 }
835 else { return 0; }
836 }
837 else { return 0; }
838 }
839
840 sub GMessage {
841 print STDERR join("\n",@_)."\n";
842 }

Properties

Name Value
svn:executable *