[ViewVC] Annotation of: gclib/scripts/tbl2gff

#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

my $usage = q/Usage:
 tbl2gff [-F] [-g <chr_table>] [-t <track>] -p <prot2nuc.tab> <ncbi_tbl_file> 

 Convert the given NCBI tbl file to GFF3
 
 Options:
 -p  protein_id to transcript_id mapping file (required)
     this file has 2 columns: 
        GenPept accession and (nucleotide) accession 
     it is required in order to assign CDS to their 
     corresponding mRNA exons
 -t  use given track name instead of 'gb_tbl'
 -g  replace the genomic sequence names according
     to the translation table <chr_table>
     (only the first 2 columns in that file are used)
 -M  only outputs mRNA entries (without gene lines)
 -F  preserve all notes and comments
/;
umask 0002;
my %kattrs;
#key attributes to always keep 
@kattrs{'gene', 'product', 'transcript_id'}=();
getopts('FMt:g:p:') || die($usage."\n");
my $gxfile=$Getopt::Std::opt_g;
my $p2nfile=$Getopt::Std::opt_p;
my $mrnaonly=$Getopt::Std::opt_M;
die("Error: -p is required -- to map protein_id to transcript_id\n") unless $p2nfile && -f $p2nfile;
my %pmap; # protein_id => transcript_id
open(PROTMAP, $p2nfile) || die("Error opening $p2nfile!\n");
while (<PROTMAP>) {
 chomp;
 my ($p, $n)=split;
 next unless $n;
 $p=~s/\.\d+$//;
 $n=~s/\.\d+$//; # discard versioning
 $pmap{$p}=$n;
}
close(PROTMAP);

my $track=$Getopt::Std::opt_t || 'gb_tbl';
my $allattrs=$Getopt::Std::opt_F;
my $last_line;
my %gx; # translation table for genomic sequence names ('>Feature' line)
if ($gxfile) {
 open(GX, $gxfile) || die("Error opening $gxfile\n");
 while(<GX>) {
  chomp;
  my @t=split;
  if ($t[0] && $t[1]) {
   $gx{$t[0]}=$t[1];
   $gx{$t[1]}=$t[0];
   }
  }
 close(GX); 
}

my $tblfile=$ARGV[0];
die($usage."\n") if $tblfile eq '-h' || ! -f $tblfile;

my %gids; #ensure unique gene IDs
          #allow for duplication $gids{$gene}=[ [gstart1, gend1], [gstart2, gend2] ]
my %tids; # and transcript IDs

# -- per gbase: 

my %genes; # genes{$gene}=[$gbase, $strand, $gstart, $gend, [@gxrefs], \%gattrs,  \%rnas,  isPseudo]
           #                  0        1        2      3        4         5          6        7
#my %rnas;
# %rnas hash holds all mRNA data for a single gene (i.e. all isoforms)
#                             0         1      2         3        4          5         6
# $rnas{transcript_id}=[ $featname, [@exon], [@cds], \%tattrs, [@txrefs], $partial3, $partial5 ]
#current values:
my ($gbase, $curfeat, @exon, %cattrs, $gpseudo,  $gstart, $gend, $partial5, $partial3);
my $strand='+';

# product and transcript_id are found in %tattrs
while (<>) {
 $last_line=$_;
 if (m/^>Feature\s+(\S+)/) { # misnomer - in fact this is the genomic sequence name
   my $newbase=$1;
   flushGSeq(); #write previous gene feature
   $gbase='';
   if ($gxfile) {
     $gbase=$gx{$newbase};
     unless ($gbase) {
       if ($newbase=~m/^[a-z]+\|([\w\.]+)/) {
          $newbase=$1;
          $gbase=$gx{$newbase};
          unless ($gbase) {
            $newbase=~s/\.\d+$//;
            $gbase=$gx{$newbase};
            }
          }
       }
     } #convert contig name
   $gbase=$newbase unless $gbase;
   next;
   } # base genomic sequence (contig/chromosome) established
 chomp;
 tr/\n\r//d;
 my @t=split(/\t/);
 my ($start, $end)=@t[0,1];
 if ($start && $end) { # segment coordinates
   $partial5=1 if ($start=~s/^<//);
   $partial3=1 if ($end=~s/^>//);
   my $newstrand='+';
   if ($end<$start) {
     $newstrand='-';
     ($start, $end)=($end, $start);
     }
   if ($t[2]) { # feature start (gene/mRNA/CDS)
     flushFeature() unless $t[2] eq 'exon'; #cleanup current gene/transcript data
     $strand=$newstrand;
     if ($t[2] eq 'gene') {
       #new gene feature starts
       $curfeat='gene';
       $gstart=$start;
       $gend=$end;
       } # gene start
     else { #non-gene features starting
       if ($t[2] eq 'exon') { # special case, exons given one by one
         if ($curfeat ne 'RNA_exon') { #first exon
           flushFeature(); #just in case there was another mRNA (exons) given before
           }
         $curfeat='RNA_exon';
         push(@exon, [$start, $end]);
         }
        elsif ($t[2] eq 'CDS' || $t[2]=~m/RNA/) {
         $curfeat=$t[2];
         push(@exon, [$start, $end]);
         }
        else { $curfeat=''; next; } 
      # -- skipping any other features here
      } #non-gene features starting
     } # feature start 
    else { # feature continuation
     $strand=$newstrand;
     if ($curfeat eq 'CDS' || $curfeat=~/RNA/) {
        push(@exon, [$start, $end]);
        }
       else {
        die("Error: unexpected feature '$curfeat' continuation\n$last_line");
        }
     }
   } # had segment coordinates
  else { # no segment coords => attributes for current feature
   my ($attr, $value)=@t[3,4];
   $value=~tr/;/,/;
   if ($attr eq 'db_xref' || $attr eq 'gene_syn') {
      push(@{$cattrs{$attr}}, $value);
      }
     else { 
      $cattrs{$attr}=$value if $attr ne 'number';
      }
   } # attributes block
 } #while input lines

 flushGSeq();

#================ SUBROUTINES ============

sub writeGene {
 my ($gene, $gdata)=@_;
 my $geneid=$gene;
 my ($gene_name)=($geneid=~m/^([^~]+)/);
 $geneid=~tr/~/_/;
 ($gbase,       $strand,      $gstart,     $gend,      $gpseudo)=
  ($gdata->[0], $gdata->[1], $gdata->[2], $gdata->[3], $gdata->[7]);
 my $gfeature=($gpseudo)?'pseudogene' : 'gene';
 #makeUniqID(\$gene, \%gids); # $gene will be updated if necessary
 #my $geneid=$gene;
 #$geneid=~tr/~/_/;
 my $attrs="ID=$geneid;Name=$gene_name";
 my @gxrefs=@{$gdata->[4]};
 $attrs.=';xrefs='.join(',', @gxrefs) if (@gxrefs>0);
 my $gattrs=$gdata->[5];
 unless ($mrnaonly) {
   foreach my $k (keys(%$gattrs)) {
       $attrs.=";$k=".$gattrs->{$k};
       }
   print join("\t", $gbase, $track, $gfeature, $gstart, $gend, '.', $strand, '.', $attrs)."\n";
   }
 
 my $rnas=$gdata->[6];
 foreach my $tid (keys(%$rnas)) {
    #die("Error: no transcript_id found at write_RNA for gene $gene ($geneid)\n$last_line\n") unless $tid;
    my $tdata=$rnas->{$tid};
    my $rnafeat=$tdata->[0];
    if ($rnafeat eq 'RNA_exon') {
       $rnafeat = $gpseudo ? 'pseudo_RNA' : 'misc_RNA';
       }
    if ($mrnaonly) {
      $rnafeat=~s/misc_RNA/mRNA/;
      }
      
    next if $mrnaonly && $rnafeat ne 'mRNA';
    my $texons=$tdata->[1];
    my $tcds=$tdata->[2];
    my $tahref=$tdata->[3]; #attributes
    my $txar=$tdata->[4]; # db_xrefs
    my ($part3, $part5)=($tdata->[5], $tdata->[6]);
    makeUniqID(\$tid, \%tids);
    @exon=sort { $main::a->[0] <=> $main::b->[0] } @$texons;
    my ($tstart, $tend)=($exon[0]->[0], $exon[-1]->[1]);
    my $attrs="ID=$tid";
    $attrs.=";Parent=$geneid" unless $mrnaonly;
    $attrs.="Name=$gene_name;gene_name=$gene_name";
    $attrs.=';partial=1' if ($part5 || $part3);
    my $product=$tahref->{'product'};
    if ($product) {
      $product=~tr/"/'/; #"
      $attrs.=';product="'.$product.'"';
      }
    if ($allattrs) {
      $attrs.=';xrefs='.join(',', @$txar) if (@$txar > 0);
      foreach my $k (keys(%$tahref)) {
        next if exists($kattrs{$k});
        $attrs.=";$k=".$tahref->{$k};
        }
      }
    print join("\t", $gbase, $track, $rnafeat, $tstart, $tend, '.', $strand, '.', $attrs)."\n";
    foreach my $ex (@exon) {
     print join("\t", $gbase, $track, 'exon', $$ex[0], $$ex[1], '.', $strand, '.', "Parent=$tid")."\n";
     }
    @exon=();
    #write_CDS();
    my @cds=@$tcds;    
    if (@cds>0) { #have CDS, write it
      @cds=sort { $main::a->[0] <=> $main::b->[0] } @cds;
      my $codonstart=$tahref->{'codon_start'};
      if ($codonstart>1) {
        $codonstart--;
        if ($strand eq '-') {
          $cds[-1]->[1]-=$codonstart;
          }
         else {
          $cds[0]->[0]+=$codonstart;
          }
        }
       else { $codonstart='0'; }

      foreach my $cd (@cds) { #TODO: calculate phase ?
        print join("\t", $gbase, $track, 'CDS', $$cd[0], $$cd[1], '.', $strand, '.', "Parent=$tid")."\n";        
        }
      }
  } #for each transcript
}

sub makeUniqID {
 my ($idref, $href)=@_;
 my $id=$$idref;
 $id=~s/~\d+$//;
 my $c = ++$$href{$id};
 if ($c>1) {
   $c--;
   $$idref=$id.'.m'.$c;
   }
 return $$idref;  
}

sub flushFeature { #called after an attribute block has been parsed into %cattrs
 return unless $curfeat && keys(%cattrs)>0;
 my $gene=$cattrs{'gene'};
 die("Error: no gene attribute found for current feature $curfeat\n") 
    unless $gene;
 if ($curfeat eq 'gene') { # gene attributes processing
   #make sure it's unique
   push(@{$gids{$gene}},[$gbase, $gstart, $gend]);
   my $gno=scalar(@{$gids{$gene}})-1;
   $gene.='~'.$gno if $gno>0;
   if (exists($genes{$gene})) { #should never happen, we just made sure it's unique!
         my $prev_gstart=$genes{$gene}->[2];
         my $prev_gend=$genes{$gene}->[3];
        # if (abs($prev_gstart-$gstart)>30000) {
           die("Error -- gene $gene is duplicated ($prev_gstart vs $gstart) !?\n");
        #   }
        # else { #update min..max coordinates
        #   $genes{$gene}->[2]=$gstart if $gstart<$prev_gstart;
        #   $genes{$gene}->[3]=$gend if $gend>$prev_gend;
        #   ${$genes{$gene}->[5]}{'fragmented'}=1;
        #   }
        }
      else 
      { # create a new gene entry here
       my $pseudo= delete($cattrs{'pseudo'}) ? 1 : 0;
       #                 0       1         2       3    4    5  6      7 
       $genes{$gene}=[$gbase, $strand, $gstart, $gend, [],  {}, {}, $pseudo];
       }
   delete $cattrs{'gene'};
   if ($allattrs) {
     if (exists($cattrs{'db_xref'})) {
        $genes{$gene}->[4]=[@{$cattrs{'db_xref'}}];
        delete $cattrs{'db_xref'};
        }
    #another special case: gene_syn
     if (exists($cattrs{'gene_syn'})) {
        #convert [list] to a string with comma delimited names
        $cattrs{'gene_syn'}=join(',',@{$cattrs{'gene_syn'}});
        }
    my $garef=$genes{$gene}->[5];
    my @k=keys(%cattrs);
    @{$garef}{@k}=(values(%cattrs));
    } #all attributes are kept
   @exon=();
   %cattrs=(); 
   ($partial3, $partial5)=();
   return; #gene data added
   } # stored gene attributes
# else 
# - storing RNA attributes
# now make sure we place this RNA within the right gene - in case of gene duplications
# locate the matching $gene
@exon= sort { $main::a->[0] <=> $main::b->[0] } @exon;
my ($xstart, $xend)=($exon[0]->[0], $exon[-1]->[1]);
my $gxdata=$gids{$gene};
die("Error: gene $gene not found in \%gids !\n") unless $gxdata; #should never happen
if (@$gxdata>1) {
  my $i=0;
  my $found;
  foreach my $gd (@$gxdata) {
    if ($gd->[0] eq $gbase && $gd->[1]<=$xstart && $gd->[2]>=$xend) {
      $found=1;
      last;
      }
    $i++;
    }
  die("Error: cannot find gene enclosing RNA data ".
      $cattrs{'transcript_id'}." ".$cattrs{'protein_id'}." (on $gbase, $xstart..$xend)\n")
       unless $found;
  $gene.='~'.$i if $i>0; 
  }

my $geneid=$gene;
my $gdata=$genes{$gene};
die("Error: \$genes data for gene '$gene' cannot be found!\n") unless $gdata;
my $rnas=$gdata->[6];
$geneid=~tr/~/_/;
my ($tid, $pid);
if ($curfeat eq 'CDS') { # just finished a CDS parsing
  my $pid=$cattrs{'protein_id'};
  die("Error: no protein_id found for CDS feature (gene $geneid)!\n") unless $pid;
  #$pid=~s/^[a-z]+\|//i; # ref|NM_002314.1|
  #$pid=~s/\|(\w*)$//; # some entries have a suffix - e.g. ref|NM_006824.1|P40
  my @t=split(/\|/,$pid);
  $pid=$t[1] if (@t>1);
  $pid=~s/\.\d+$//; #remove version
  $tid=$pmap{$pid}; 
  die("Error: cannot get transcript for protein id $pid (gene $geneid)!\n") unless $tid;
  unless (exists($rnas->{$tid})) { #this should never happen - CDS without mRNA definition?
    #               0         1       2     3   4   5  6
    #$rnas->{$tid}=['mRNA', [@exons], [@cds], {}, [], 0, 0];
    $tid=~s/\.\d+$//;
    die("Error: cannot find mRNA entry for protein $pid (reported as coming from transcript $tid)\n")
       unless exists($rnas->{$tid});
    }
   # #just add the CDS
   push(@{$rnas->{$tid}->[2]}, @exon);
   #   }
  } #CDS
 else { # exons (RNA feature)
 $tid=$cattrs{'transcript_id'};
 if ($tid) {
    my @t=split(/\|/,$tid);
    $tid=$t[1] if (@t>1);
    #$tid=~s/^[a-z]+\|//i;
    #$tid=~s/\|\w*$//; # some entries have a suffix - e.g. ref|NM_006824.1|P40
    $tid=~s/\.\d+$//; #remove version!
    }
   else {
    $tid=$geneid.'_t'; #some special cases with 'exon' features for pseudo genes
    }
  unless (exists($rnas->{$tid})) {
    #               0         1      2   3   4   5  6
    $rnas->{$tid}=[$curfeat, [@exon], [], {}, [], 0, 0];
    }
   else { #just add the exons
    push(@{$rnas->{$tid}->[1]}, @exon);
    }
  } #exons
my $tdata=$rnas->{$tid};
$tdata->[5]=1 if $partial5;
$tdata->[6]=1 if $partial3;
# add attributes
delete @cattrs{'gene', 'transcript_id', 'protein_id'};
my $product = delete $cattrs{'product'};
my $ahref=$tdata->[3];
$ahref->{'product'}=$product if $product;
if ($allattrs) {
 my %xrefs;
 my @cx;
 @cx=@{$cattrs{'db_xref'}} if exists $cattrs{'db_xref'};
 push(@cx, @{$tdata->[4]});
 @xrefs{@cx}=();
 @cx=keys(%xrefs);
 $tdata->[4]=[@cx];
 delete $cattrs{'db_xref'};
 #add new attributes 
 foreach my $k (keys %cattrs) {
   $ahref->{$k}=$cattrs{$k};
   }
 }  
#--> clean up
@exon=();
%cattrs=();
($partial3, $partial5)=();
}

sub flushGSeq {
 return unless keys(%genes);
 flushFeature(); # write last RNA feature read
 my $gcount=keys(%genes);
 while (my ($g,$gd)=each(%genes)) {
    writeGene($g,$gd);
 }
 ($gbase, $curfeat, @exon, %cattrs, $gpseudo,  $gstart, $gend, 
  $partial5, $partial3)=(); 
 $strand='+';
 @exon=();
 %cattrs=();
 %genes=();
 #-- debug only:
 #exit;
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size:	13595 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	use FindBin;use lib $FindBin::Bin;
5
6	my $usage = q/Usage:
7	tbl2gff [-F] [-g <chr_table>] [-t <track>] -p <prot2nuc.tab> <ncbi_tbl_file>
8
9	Convert the given NCBI tbl file to GFF3
10
11	Options:
12	-p protein_id to transcript_id mapping file (required)
13	this file has 2 columns:
14	GenPept accession and (nucleotide) accession
15	it is required in order to assign CDS to their
16	corresponding mRNA exons
17	-t use given track name instead of 'gb_tbl'
18	-g replace the genomic sequence names according
19	to the translation table <chr_table>
20	(only the first 2 columns in that file are used)
21	-M only outputs mRNA entries (without gene lines)
22	-F preserve all notes and comments
23	/;
24	umask 0002;
25	my %kattrs;
26	#key attributes to always keep
27	@kattrs{'gene', 'product', 'transcript_id'}=();
28	getopts('FMt:g:p:') \|\| die($usage."\n");
29	my $gxfile=$Getopt::Std::opt_g;
30	my $p2nfile=$Getopt::Std::opt_p;
31	my $mrnaonly=$Getopt::Std::opt_M;
32	die("Error: -p is required -- to map protein_id to transcript_id\n") unless $p2nfile && -f $p2nfile;
33	my %pmap; # protein_id => transcript_id
34	open(PROTMAP, $p2nfile) \|\| die("Error opening $p2nfile!\n");
35	while (<PROTMAP>) {
36	chomp;
37	my ($p, $n)=split;
38	next unless $n;
39	$p=~s/\.\d+$//;
40	$n=~s/\.\d+$//; # discard versioning
41	$pmap{$p}=$n;
42	}
43	close(PROTMAP);
44
45	my $track=$Getopt::Std::opt_t \|\| 'gb_tbl';
46	my $allattrs=$Getopt::Std::opt_F;
47	my $last_line;
48	my %gx; # translation table for genomic sequence names ('>Feature' line)
49	if ($gxfile) {
50	open(GX, $gxfile) \|\| die("Error opening $gxfile\n");
51	while(<GX>) {
52	chomp;
53	my @t=split;
54	if ($t[0] && $t[1]) {
55	$gx{$t[0]}=$t[1];
56	$gx{$t[1]}=$t[0];
57	}
58	}
59	close(GX);
60	}
61
62	my $tblfile=$ARGV[0];
63	die($usage."\n") if $tblfile eq '-h' \|\| ! -f $tblfile;
64
65	my %gids; #ensure unique gene IDs
66	#allow for duplication $gids{$gene}=[ [gstart1, gend1], [gstart2, gend2] ]
67	my %tids; # and transcript IDs
68
69	# -- per gbase:
70
71	my %genes; # genes{$gene}=[$gbase, $strand, $gstart, $gend, [@gxrefs], \%gattrs, \%rnas, isPseudo]
72	# 0 1 2 3 4 5 6 7
73	#my %rnas;
74	# %rnas hash holds all mRNA data for a single gene (i.e. all isoforms)
75	# 0 1 2 3 4 5 6
76	# $rnas{transcript_id}=[ $featname, [@exon], [@cds], \%tattrs, [@txrefs], $partial3, $partial5 ]
77	#current values:
78	my ($gbase, $curfeat, @exon, %cattrs, $gpseudo, $gstart, $gend, $partial5, $partial3);
79	my $strand='+';
80
81	# product and transcript_id are found in %tattrs
82	while (<>) {
83	$last_line=$_;
84	if (m/^>Feature\s+(\S+)/) { # misnomer - in fact this is the genomic sequence name
85	my $newbase=$1;
86	flushGSeq(); #write previous gene feature
87	$gbase='';
88	if ($gxfile) {
89	$gbase=$gx{$newbase};
90	unless ($gbase) {
91	if ($newbase=~m/^[a-z]+\\|([\w\.]+)/) {
92	$newbase=$1;
93	$gbase=$gx{$newbase};
94	unless ($gbase) {
95	$newbase=~s/\.\d+$//;
96	$gbase=$gx{$newbase};
97	}
98	}
99	}
100	} #convert contig name
101	$gbase=$newbase unless $gbase;
102	next;
103	} # base genomic sequence (contig/chromosome) established
104	chomp;
105	tr/\n\r//d;
106	my @t=split(/\t/);
107	my ($start, $end)=@t[0,1];
108	if ($start && $end) { # segment coordinates
109	$partial5=1 if ($start=~s/^<//);
110	$partial3=1 if ($end=~s/^>//);
111	my $newstrand='+';
112	if ($end<$start) {
113	$newstrand='-';
114	($start, $end)=($end, $start);
115	}
116	if ($t[2]) { # feature start (gene/mRNA/CDS)
117	flushFeature() unless $t[2] eq 'exon'; #cleanup current gene/transcript data
118	$strand=$newstrand;
119	if ($t[2] eq 'gene') {
120	#new gene feature starts
121	$curfeat='gene';
122	$gstart=$start;
123	$gend=$end;
124	} # gene start
125	else { #non-gene features starting
126	if ($t[2] eq 'exon') { # special case, exons given one by one
127	if ($curfeat ne 'RNA_exon') { #first exon
128	flushFeature(); #just in case there was another mRNA (exons) given before
129	}
130	$curfeat='RNA_exon';
131	push(@exon, [$start, $end]);
132	}
133	elsif ($t[2] eq 'CDS' \|\| $t[2]=~m/RNA/) {
134	$curfeat=$t[2];
135	push(@exon, [$start, $end]);
136	}
137	else { $curfeat=''; next; }
138	# -- skipping any other features here
139	} #non-gene features starting
140	} # feature start
141	else { # feature continuation
142	$strand=$newstrand;
143	if ($curfeat eq 'CDS' \|\| $curfeat=~/RNA/) {
144	push(@exon, [$start, $end]);
145	}
146	else {
147	die("Error: unexpected feature '$curfeat' continuation\n$last_line");
148	}
149	}
150	} # had segment coordinates
151	else { # no segment coords => attributes for current feature
152	my ($attr, $value)=@t[3,4];
153	$value=~tr/;/,/;
154	if ($attr eq 'db_xref' \|\| $attr eq 'gene_syn') {
155	push(@{$cattrs{$attr}}, $value);
156	}
157	else {
158	$cattrs{$attr}=$value if $attr ne 'number';
159	}
160	} # attributes block
161	} #while input lines
162
163	flushGSeq();
164
165	#================ SUBROUTINES ============
166
167	sub writeGene {
168	my ($gene, $gdata)=@_;
169	my $geneid=$gene;
170	my ($gene_name)=($geneid=~m/^([^~]+)/);
171	$geneid=~tr/~/_/;
172	($gbase, $strand, $gstart, $gend, $gpseudo)=
173	($gdata->[0], $gdata->[1], $gdata->[2], $gdata->[3], $gdata->[7]);
174	my $gfeature=($gpseudo)?'pseudogene' : 'gene';
175	#makeUniqID(\$gene, \%gids); # $gene will be updated if necessary
176	#my $geneid=$gene;
177	#$geneid=~tr/~/_/;
178	my $attrs="ID=$geneid;Name=$gene_name";
179	my @gxrefs=@{$gdata->[4]};
180	$attrs.=';xrefs='.join(',', @gxrefs) if (@gxrefs>0);
181	my $gattrs=$gdata->[5];
182	unless ($mrnaonly) {
183	foreach my $k (keys(%$gattrs)) {
184	$attrs.=";$k=".$gattrs->{$k};
185	}
186	print join("\t", $gbase, $track, $gfeature, $gstart, $gend, '.', $strand, '.', $attrs)."\n";
187	}
188
189	my $rnas=$gdata->[6];
190	foreach my $tid (keys(%$rnas)) {
191	#die("Error: no transcript_id found at write_RNA for gene $gene ($geneid)\n$last_line\n") unless $tid;
192	my $tdata=$rnas->{$tid};
193	my $rnafeat=$tdata->[0];
194	if ($rnafeat eq 'RNA_exon') {
195	$rnafeat = $gpseudo ? 'pseudo_RNA' : 'misc_RNA';
196	}
197	if ($mrnaonly) {
198	$rnafeat=~s/misc_RNA/mRNA/;
199	}
200
201	next if $mrnaonly && $rnafeat ne 'mRNA';
202	my $texons=$tdata->[1];
203	my $tcds=$tdata->[2];
204	my $tahref=$tdata->[3]; #attributes
205	my $txar=$tdata->[4]; # db_xrefs
206	my ($part3, $part5)=($tdata->[5], $tdata->[6]);
207	makeUniqID(\$tid, \%tids);
208	@exon=sort { $main::a->[0] <=> $main::b->[0] } @$texons;
209	my ($tstart, $tend)=($exon[0]->[0], $exon[-1]->[1]);
210	my $attrs="ID=$tid";
211	$attrs.=";Parent=$geneid" unless $mrnaonly;
212	$attrs.="Name=$gene_name;gene_name=$gene_name";
213	$attrs.=';partial=1' if ($part5 \|\| $part3);
214	my $product=$tahref->{'product'};
215	if ($product) {
216	$product=~tr/"/'/; #"
217	$attrs.=';product="'.$product.'"';
218	}
219	if ($allattrs) {
220	$attrs.=';xrefs='.join(',', @$txar) if (@$txar > 0);
221	foreach my $k (keys(%$tahref)) {
222	next if exists($kattrs{$k});
223	$attrs.=";$k=".$tahref->{$k};
224	}
225	}
226	print join("\t", $gbase, $track, $rnafeat, $tstart, $tend, '.', $strand, '.', $attrs)."\n";
227	foreach my $ex (@exon) {
228	print join("\t", $gbase, $track, 'exon', $$ex[0], $$ex[1], '.', $strand, '.', "Parent=$tid")."\n";
229	}
230	@exon=();
231	#write_CDS();
232	my @cds=@$tcds;
233	if (@cds>0) { #have CDS, write it
234	@cds=sort { $main::a->[0] <=> $main::b->[0] } @cds;
235	my $codonstart=$tahref->{'codon_start'};
236	if ($codonstart>1) {
237	$codonstart--;
238	if ($strand eq '-') {
239	$cds[-1]->[1]-=$codonstart;
240	}
241	else {
242	$cds[0]->[0]+=$codonstart;
243	}
244	}
245	else { $codonstart='0'; }
246
247	foreach my $cd (@cds) { #TODO: calculate phase ?
248	print join("\t", $gbase, $track, 'CDS', $$cd[0], $$cd[1], '.', $strand, '.', "Parent=$tid")."\n";
249	}
250	}
251	} #for each transcript
252	}
253
254	sub makeUniqID {
255	my ($idref, $href)=@_;
256	my $id=$$idref;
257	$id=~s/~\d+$//;
258	my $c = ++$$href{$id};
259	if ($c>1) {
260	$c--;
261	$$idref=$id.'.m'.$c;
262	}
263	return $$idref;
264	}
265
266	sub flushFeature { #called after an attribute block has been parsed into %cattrs
267	return unless $curfeat && keys(%cattrs)>0;
268	my $gene=$cattrs{'gene'};
269	die("Error: no gene attribute found for current feature $curfeat\n")
270	unless $gene;
271	if ($curfeat eq 'gene') { # gene attributes processing
272	#make sure it's unique
273	push(@{$gids{$gene}},[$gbase, $gstart, $gend]);
274	my $gno=scalar(@{$gids{$gene}})-1;
275	$gene.='~'.$gno if $gno>0;
276	if (exists($genes{$gene})) { #should never happen, we just made sure it's unique!
277	my $prev_gstart=$genes{$gene}->[2];
278	my $prev_gend=$genes{$gene}->[3];
279	# if (abs($prev_gstart-$gstart)>30000) {
280	die("Error -- gene $gene is duplicated ($prev_gstart vs $gstart) !?\n");
281	# }
282	# else { #update min..max coordinates
283	# $genes{$gene}->[2]=$gstart if $gstart<$prev_gstart;
284	# $genes{$gene}->[3]=$gend if $gend>$prev_gend;
285	# ${$genes{$gene}->[5]}{'fragmented'}=1;
286	# }
287	}
288	else
289	{ # create a new gene entry here
290	my $pseudo= delete($cattrs{'pseudo'}) ? 1 : 0;
291	# 0 1 2 3 4 5 6 7
292	$genes{$gene}=[$gbase, $strand, $gstart, $gend, [], {}, {}, $pseudo];
293	}
294	delete $cattrs{'gene'};
295	if ($allattrs) {
296	if (exists($cattrs{'db_xref'})) {
297	$genes{$gene}->[4]=[@{$cattrs{'db_xref'}}];
298	delete $cattrs{'db_xref'};
299	}
300	#another special case: gene_syn
301	if (exists($cattrs{'gene_syn'})) {
302	#convert [list] to a string with comma delimited names
303	$cattrs{'gene_syn'}=join(',',@{$cattrs{'gene_syn'}});
304	}
305	my $garef=$genes{$gene}->[5];
306	my @k=keys(%cattrs);
307	@{$garef}{@k}=(values(%cattrs));
308	} #all attributes are kept
309	@exon=();
310	%cattrs=();
311	($partial3, $partial5)=();
312	return; #gene data added
313	} # stored gene attributes
314	# else
315	# - storing RNA attributes
316	# now make sure we place this RNA within the right gene - in case of gene duplications
317	# locate the matching $gene
318	@exon= sort { $main::a->[0] <=> $main::b->[0] } @exon;
319	my ($xstart, $xend)=($exon[0]->[0], $exon[-1]->[1]);
320	my $gxdata=$gids{$gene};
321	die("Error: gene $gene not found in \%gids !\n") unless $gxdata; #should never happen
322	if (@$gxdata>1) {
323	my $i=0;
324	my $found;
325	foreach my $gd (@$gxdata) {
326	if ($gd->[0] eq $gbase && $gd->[1]<=$xstart && $gd->[2]>=$xend) {
327	$found=1;
328	last;
329	}
330	$i++;
331	}
332	die("Error: cannot find gene enclosing RNA data ".
333	$cattrs{'transcript_id'}." ".$cattrs{'protein_id'}." (on $gbase, $xstart..$xend)\n")
334	unless $found;
335	$gene.='~'.$i if $i>0;
336	}
337
338	my $geneid=$gene;
339	my $gdata=$genes{$gene};
340	die("Error: \$genes data for gene '$gene' cannot be found!\n") unless $gdata;
341	my $rnas=$gdata->[6];
342	$geneid=~tr/~/_/;
343	my ($tid, $pid);
344	if ($curfeat eq 'CDS') { # just finished a CDS parsing
345	my $pid=$cattrs{'protein_id'};
346	die("Error: no protein_id found for CDS feature (gene $geneid)!\n") unless $pid;
347	#$pid=~s/^[a-z]+\\|//i; # ref\|NM_002314.1\|
348	#$pid=~s/\\|(\w*)$//; # some entries have a suffix - e.g. ref\|NM_006824.1\|P40
349	my @t=split(/\\|/,$pid);
350	$pid=$t[1] if (@t>1);
351	$pid=~s/\.\d+$//; #remove version
352	$tid=$pmap{$pid};
353	die("Error: cannot get transcript for protein id $pid (gene $geneid)!\n") unless $tid;
354	unless (exists($rnas->{$tid})) { #this should never happen - CDS without mRNA definition?
355	# 0 1 2 3 4 5 6
356	#$rnas->{$tid}=['mRNA', [@exons], [@cds], {}, [], 0, 0];
357	$tid=~s/\.\d+$//;
358	die("Error: cannot find mRNA entry for protein $pid (reported as coming from transcript $tid)\n")
359	unless exists($rnas->{$tid});
360	}
361	# #just add the CDS
362	push(@{$rnas->{$tid}->[2]}, @exon);
363	# }
364	} #CDS
365	else { # exons (RNA feature)
366	$tid=$cattrs{'transcript_id'};
367	if ($tid) {
368	my @t=split(/\\|/,$tid);
369	$tid=$t[1] if (@t>1);
370	#$tid=~s/^[a-z]+\\|//i;
371	#$tid=~s/\\|\w*$//; # some entries have a suffix - e.g. ref\|NM_006824.1\|P40
372	$tid=~s/\.\d+$//; #remove version!
373	}
374	else {
375	$tid=$geneid.'_t'; #some special cases with 'exon' features for pseudo genes
376	}
377	unless (exists($rnas->{$tid})) {
378	# 0 1 2 3 4 5 6
379	$rnas->{$tid}=[$curfeat, [@exon], [], {}, [], 0, 0];
380	}
381	else { #just add the exons
382	push(@{$rnas->{$tid}->[1]}, @exon);
383	}
384	} #exons
385	my $tdata=$rnas->{$tid};
386	$tdata->[5]=1 if $partial5;
387	$tdata->[6]=1 if $partial3;
388	# add attributes
389	delete @cattrs{'gene', 'transcript_id', 'protein_id'};
390	my $product = delete $cattrs{'product'};
391	my $ahref=$tdata->[3];
392	$ahref->{'product'}=$product if $product;
393	if ($allattrs) {
394	my %xrefs;
395	my @cx;
396	@cx=@{$cattrs{'db_xref'}} if exists $cattrs{'db_xref'};
397	push(@cx, @{$tdata->[4]});
398	@xrefs{@cx}=();
399	@cx=keys(%xrefs);
400	$tdata->[4]=[@cx];
401	delete $cattrs{'db_xref'};
402	#add new attributes
403	foreach my $k (keys %cattrs) {
404	$ahref->{$k}=$cattrs{$k};
405	}
406	}
407	#--> clean up
408	@exon=();
409	%cattrs=();
410	($partial3, $partial5)=();
411	}
412
413	sub flushGSeq {
414	return unless keys(%genes);
415	flushFeature(); # write last RNA feature read
416	my $gcount=keys(%genes);
417	while (my ($g,$gd)=each(%genes)) {
418	writeGene($g,$gd);
419	}
420	($gbase, $curfeat, @exon, %cattrs, $gpseudo, $gstart, $gend,
421	$partial5, $partial3)=();
422	$strand='+';
423	@exon=();
424	%cattrs=();
425	%genes=();
426	#-- debug only:
427	#exit;
428	}