gclib/scripts/ncbitax2bcp.pl

#!/usr/bin/perl
use strict;
#use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

# Usage text: printed for a help-style argument and prepended to the
# "cannot find nodes.dmp" error below.
# qq{} is used (instead of q{}) so that $0 is interpolated and the message
# shows the name this script was actually invoked as, not a literal "$0".
my $usage = qq{Usage:

Download ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
And unpack it in the current directory

Then run $0 which will create
the bcp files for loading into the database.
};
# Mask out only the other-write permission bit for files created below.
umask 0002;
# Filled from merged.dmp: second-column id => [first-column ids, ...].
my %merged; #insert "merged" entries too, even though their data is duplicate
#getopts('o:') || die($usage."\n");
#my $outfile=$Getopt::Std::opt_o;
# Help request: the first argument is a dash followed only by characters
# from the set "-help" (so '-', '--', '-h', '-help', '--help', ... all match).
# The defined() guard makes the no-argument case explicit.
die("$usage\n") if defined($ARGV[0]) && $ARGV[0]=~m/^\-[\-help]*$/;
# All input files are read from the current directory; bail out early when
# the main one is missing.
die("$usage Error: cannot find nodes.dmp!\n") unless -f 'nodes.dmp';
# Build the rank table.
# cut splits on tabs, and the '|' separators of nodes.dmp occupy tab-separated
# fields of their own, so field 5 is the same column that the nodes.dmp loop
# later reads as $t[2] and looks up in %ranks.
# sort -u reduces it to the distinct values; ids are assigned 1,2,3,... in the
# order the lines arrive from the pipe.
my $cmd='cut -f5 nodes.dmp | sort -u |';
# taxon_cat.bcp receives the 'R' (rank) rows here and stays open for the
# 'D' (division) rows written further down.
open(TAXC, '>taxon_cat.bcp') || die "Error creating taxon_cat.bcp file!\n";

my %ranks; #rank_name => rank_id
open(RANK, $cmd) || die "Error opening $cmd pipe!\n";
my ($rank_id, $rank_name);
while (<RANK>) {
 chomp;
 s/^\s+//;s/\s+$//; # trim surrounding whitespace
 next unless $_;
 $rank_id++;
 $rank_name=$_;
 $ranks{$rank_name}=$rank_id;
 # row layout: 'R', rank_id, <empty>, rank_name, <empty>
 print TAXC join("\t", 'R', $rank_id, '', $rank_name, '')."\n";
 }
# For a piped handle, close() waits for the command and returns false when it
# exited with a non-zero status (left in $?). Without this check a failed
# pipeline would leave %ranks empty or partial and only surface later as a
# misleading "invalid rank" error.
close(RANK) || die "Error: '$cmd' pipe failed (wait status $?)!\n";

# Load merged.dmp into %merged.
# Each non-empty line has its trailing "<tab>|" removed and is split on
# "<tab>|<tab>"; the first column is collected under the second column's key,
# so that %merged maps an id to the list of ids that alias it.
open(MRG, 'merged.dmp') || die("Error opening merged.dmp!\n");
while (my $line = <MRG>) {
    chomp $line;
    next unless $line;
    $line =~ s/\t\|$//;
    my ($alias_id, $target_id) = split(/\t\|\t/, $line);
    push @{ $merged{$target_id} }, $alias_id;
}
close(MRG);

# Append the division rows to taxon_cat.bcp (handle TAXC, opened earlier).
# Every non-empty line of division.dmp becomes one tab-separated row:
# the literal 'D' followed by the first four columns of the line.
open(DIV, 'division.dmp') || die("Error opening division.dmp!\n");
while (my $row = <DIV>) {
    chomp $row;
    next unless $row;
    $row =~ s/\t\|$//;                      # drop the trailing "<tab>|"
    my @cols   = split(/\t\|\t/, $row);
    my $record = join("\t", 'D', @cols[0..3]);
    print TAXC "$record\n";
}

# Both the input and the category output are finished at this point.
close(DIV);
close(TAXC);
# Convert names.dmp into taxon_names.bcp and remember, per taxon, the
# scientific name and a common name for use when writing taxon.bcp.
open(NAMES, 'names.dmp') || die("Error opening names.dmp!\n");
open(TAXN, '>taxon_names.bcp') || die("Error creating taxon_names.bcp!\n");
my %names; # tax_id => [sci_name, com_name]
while (my $line = <NAMES>) {
    chomp $line;
    next unless $line;
    $line =~ s/\t\|$//;                     # drop the trailing "<tab>|"
    my @fields = split(/\t\|\t/, $line);
    # Every row is copied to the output with plain tabs as separators.
    print TAXN join("\t", @fields)."\n";

    # Column 0 is the key, column 1 the name text, column 3 the name class.
    my ($tax_id, $name, $class) = @fields[0, 1, 3];
    my $known = $names{$tax_id};
    if ($class =~ /\bcommon name/) {
        # Keep any scientific name already stored; the latest common-name
        # row for a taxon replaces an earlier one.
        $names{$tax_id} = [ ($known ? $$known[0] : ''), $name ];
    }
    elsif ($class =~ /\bscientific name/) {
        # Keep any common name already stored.
        $names{$tax_id} = $known ? [ $name, $$known[1] ] : [ $name ];
    }
}
close(NAMES);
close(TAXN);
# Write taxon.bcp: one row per nodes.dmp line, plus one duplicate row for
# every id that %merged lists for that node.
# Row layout (tab-separated): id, column 1, rank_id, column 4, column 5,
# scientific name, common name.
open(TAX, '>taxon.bcp') || die("Error creating taxon.bcp!\n");
# The message now names the file that actually failed to open (it used to
# say names.dmp).
open(NODES, 'nodes.dmp') || die("Error opening nodes.dmp!\n");
while(<NODES>) {
 chomp;
 next unless $_;
 s/\t\|$//; # drop the trailing "<tab>|"
 my @t=split(/\t\|\t/);
 # Translate the rank name (column 2) into the id assigned in %ranks.
 my $rank_id=$ranks{$t[2]};
 die("Error: invalid rank $t[2] from nodes.dmp line:\n$_\n")
   unless defined($rank_id);
 # A scientific name is mandatory; the common name is optional.
 my $d=$names{$t[0]};
 die("Error: names not found for taxon $t[0] at nodes.dmp line:\n$_\n")
   unless $d && $$d[0];
 my $sci_name=$$d[0];
 my $com_name=$$d[1] || '';
 # Emit the node under its own id first, then under each merged id with
 # otherwise identical data.
 foreach my $tid ($t[0], @{ $merged{$t[0]} || [] }) {
   print TAX join("\t",$tid,$t[1],$rank_id, $t[4], $t[5],
                       $sci_name, $com_name)."\n";
   }
}
close(NODES);
close(TAX);
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size:	2762 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	#use Getopt::Std;
4	use FindBin;use lib $FindBin::Bin;
5
6	my $usage = q{Usage:
7
8	Download ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
9	And unpack it in the current directory
10
11	Then run $0 which will create
12	the bcp files for loading into the database.
13	};
14	umask 0002;
15	my %merged; #insert "merged" entries too, even though their data is duplicate
16	#getopts('o:') \|\| die($usage."\n");
17	#my $outfile=$Getopt::Std::opt_o;
18	die("$usage\n") if $ARGV[0]=~m/^\-[\-help]*$/;
19	die("$usage Error: cannot find nodes.dmp!\n") unless -f 'nodes.dmp';
20	my $cmd='cut -f5 nodes.dmp \| sort -u \|';
21	open(TAXC, '>taxon_cat.bcp') \|\| die "Error creating taxon_cat.bcp file!\n";
22
23	my %ranks; #rank_name => rank_id
24	open(RANK, $cmd) \|\| die "Error opening $cmd pipe!\n";
25	my ($rank_id, $rank_name);
26	while (<RANK>) {
27	chomp;
28	s/^\s+//;s/\s+$//;
29	next unless $_;
30	$rank_id++;
31	$rank_name=$_;
32	$ranks{$rank_name}=$rank_id;
33	print TAXC join("\t", 'R', $rank_id, '', $rank_name, '')."\n";
34	}
35	close(RANK);
36
37	open(MRG, 'merged.dmp') \|\| die("Error opening merged.dmp!\n");
38	while (<MRG>) {
39	chomp;
40	next unless $_;
41	s/\t\\|$//;
42	my @t=split(/\t\\|\t/);
43	push(@{$merged{$t[1]}},$t[0]);
44	}
45	close(MRG);
46
47	open(DIV, 'division.dmp') \|\| die("Error opening division.dmp!\n");
48	while (<DIV>) {
49	chomp;
50	next unless $_;
51	s/\t\\|$//;
52	my @t=split(/\t\\|\t/);
53	print TAXC join("\t", 'D', @t[0..3])."\n";
54	}
55
56	close(DIV);
57	close(TAXC);
58	open(NAMES, 'names.dmp') \|\| die("Error opening names.dmp!\n");
59	open(TAXN, '>taxon_names.bcp') \|\| die("Error creating taxon_names.bcp!\n");
60	my %names; # tax_id => [sci_name, com_name]
61	while(<NAMES>) {
62	chomp;
63	next unless $_;
64	s/\t\\|$//;
65	my @t=split(/\t\\|\t/);
66	print TAXN join("\t", @t)."\n";
67	if ($t[3]=~/\bcommon name/) {
68	my $d=$names{$t[0]};
69	$names{$t[0]}= $d ? [$$d[0], $t[1]] : ['',$t[1]];
70	}
71	elsif ($t[3]=~/\bscientific name/) {
72	my $d=$names{$t[0]};
73	$names{$t[0]}= $d ? [$t[1], $$d[1]] : [$t[1]];
74	}
75	}
76	close(NAMES);
77	close(TAXN);
78	open(TAX, '>taxon.bcp') \|\| die("Error creating taxon.bcp!\n");
79	open(NODES, 'nodes.dmp') \|\| die("Error opening names.dmp!\n");
80	while(<NODES>) {
81	chomp;
82	next unless $_;
83	s/\t\\|$//;
84	my @t=split(/\t\\|\t/);
85	my $rank_id=$ranks{$t[2]};
86	die("Error: invalid rank $t[2] from nodes.dmp line:\n$_\n")
87	unless defined($rank_id);
88	my $d=$names{$t[0]};
89	die("Error: names not found for taxon $t[0] at nodes.dmp line:\n$_\n")
90	unless $d && $$d[0];
91	my $sci_name=$$d[0];
92	my $com_name=$$d[1] \|\| '';
93	print TAX join("\t",$t[0],$t[1],$rank_id, $t[4], $t[5],
94	$sci_name, $com_name)."\n";
95	if (my $md=$merged{$t[0]}) {
96	foreach my $tid (@$md) {
97	print TAX join("\t",$tid,$t[1],$rank_id, $t[4], $t[5],
98	$sci_name, $com_name)."\n";
99	}
100	}
101	}
102	close(NODES);
103	close(TAX);