#!/usr/bin/perl
use strict;
use warnings;
#use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

# Convert an unpacked NCBI taxonomy dump (nodes.dmp, names.dmp, merged.dmp,
# division.dmp in the current directory) into three tab-separated bcp files:
#   taxon_cat.bcp   - 'R' rows (ranks) and 'D' rows (divisions)
#   taxon_names.bcp - every names.dmp record, re-delimited with tabs
#   taxon.bcp       - one row per node, plus one duplicate row per merged id
#
# *.dmp records look like "f1\t|\tf2\t|\t...\t|"; each reader below strips the
# trailing "\t|" and splits on "\t|\t".

# qq{} (not q{}) so that $0 is replaced by the script name in the help text.
my $usage = qq{Usage:

Download ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
And unpack it in the current directory

Then run $0 which will create
the bcp files for loading into the database.
};
umask 0002;
#getopts('o:') || die($usage."\n");
#my $outfile=$Getopt::Std::opt_o;

# Any argument made only of '-', 'h', 'e', 'l', 'p' after a leading dash
# (e.g. -h, --help) prints the usage. Guarded so that running with no
# arguments does not match against an undefined value.
die("$usage\n") if @ARGV && $ARGV[0] =~ m/^\-[\-help]*$/;
die("$usage Error: cannot find nodes.dmp!\n") unless -f 'nodes.dmp';

open(my $taxc_fh, '>', 'taxon_cat.bcp')
    or die "Error creating taxon_cat.bcp file!\n";

# --- ranks -----------------------------------------------------------------
# The distinct values of tab-field 5 of nodes.dmp (the rank column) are
# numbered 1..N in the order the external sort emits them. The shell pipeline
# is kept as-is because the numbering depends on that ordering.
my $rank_cmd = 'cut -f5 nodes.dmp | sort -u';
my %ranks;              # rank_name => rank_id
my $last_rank_id = 0;
open(my $rank_fh, '-|', $rank_cmd) or die "Error opening $rank_cmd | pipe!\n";
while (my $rank_name = <$rank_fh>) {
    chomp $rank_name;
    $rank_name =~ s/^\s+//;
    $rank_name =~ s/\s+$//;
    next unless $rank_name;
    $ranks{$rank_name} = ++$last_rank_id;
    print {$taxc_fh} join("\t", 'R', $last_rank_id, '', $rank_name, '') . "\n";
}
# close() on a pipe reports a non-zero exit status of the command.
close($rank_fh)
    or die "Error running $rank_cmd pipe (exit status $?): $!\n";

# --- merged ids ------------------------------------------------------------
# Insert "merged" entries too, even though their data is duplicate.
# Keyed by the second column of merged.dmp; each value lists the first-column
# ids that map to it (old id => new id, per the NCBI taxdump readme).
my %merged;
open(my $mrg_fh, '<', 'merged.dmp') or die("Error opening merged.dmp!\n");
while (my $line = <$mrg_fh>) {
    chomp $line;
    next unless $line;
    $line =~ s/\t\|$//;
    my ($old_id, $new_id) = split(/\t\|\t/, $line);
    push @{ $merged{$new_id} }, $old_id;
}
close($mrg_fh);

# --- divisions -------------------------------------------------------------
open(my $div_fh, '<', 'division.dmp') or die("Error opening division.dmp!\n");
while (my $line = <$div_fh>) {
    chomp $line;
    next unless $line;
    $line =~ s/\t\|$//;
    my @fields = split(/\t\|\t/, $line);
    # split drops trailing empty fields, so pad missing columns with ''
    # explicitly; the output always has exactly four data columns.
    my @cols = map { defined $_ ? $_ : '' } @fields[0 .. 3];
    print {$taxc_fh} join("\t", 'D', @cols) . "\n";
}
close($div_fh);
# Buffered write errors only surface at close time, so check it.
close($taxc_fh) or die "Error closing taxon_cat.bcp: $!\n";

# --- names -----------------------------------------------------------------
open(my $names_fh, '<', 'names.dmp') or die("Error opening names.dmp!\n");
open(my $taxn_fh, '>', 'taxon_names.bcp')
    or die("Error creating taxon_names.bcp!\n");
my %names;              # tax_id => [sci_name, com_name]
while (my $line = <$names_fh>) {
    chomp $line;
    next unless $line;
    $line =~ s/\t\|$//;
    my @fields = split(/\t\|\t/, $line);
    print {$taxn_fh} join("\t", @fields) . "\n";

    my ($tax_id, $name) = @fields[0, 1];
    my $name_class = defined $fields[3] ? $fields[3] : '';
    # Later records of the same class overwrite earlier ones; any class
    # containing the words "common name" counts as a common name.
    if ($name_class =~ /\bcommon name/) {
        $names{$tax_id}[1] = $name;
    }
    elsif ($name_class =~ /\bscientific name/) {
        $names{$tax_id}[0] = $name;
    }
}
close($names_fh);
close($taxn_fh) or die "Error closing taxon_names.bcp: $!\n";

# --- taxon nodes -----------------------------------------------------------
open(my $tax_fh, '>', 'taxon.bcp') or die("Error creating taxon.bcp!\n");
open(my $nodes_fh, '<', 'nodes.dmp') or die("Error opening nodes.dmp!\n");
while (my $line = <$nodes_fh>) {
    chomp $line;
    next unless $line;
    $line =~ s/\t\|$//;
    my @fields = split(/\t\|\t/, $line);

    # Columns 0, 1, 2, 4 and 5 are used; column 3 is skipped. Columns 4 and 5
    # are written through verbatim (division id and inherited-division flag
    # according to the NCBI taxdump readme -- confirm if the format changes).
    my ($tax_id, $parent_id, $rank_name, $col4, $col5) =
        map { defined $_ ? $_ : '' } @fields[0, 1, 2, 4, 5];

    my $rank_id = $ranks{$rank_name};
    die("Error: invalid rank $rank_name from nodes.dmp line:\n$line\n")
        unless defined($rank_id);

    my $name_pair = $names{$tax_id};
    die("Error: names not found for taxon $tax_id at nodes.dmp line:\n$line\n")
        unless $name_pair && $$name_pair[0];
    my $sci_name = $$name_pair[0];
    my $com_name = $$name_pair[1] || '';

    # The node's own row first, then an identical row under every id that
    # was merged into this node.
    my @shared_cols = ($parent_id, $rank_id, $col4, $col5, $sci_name, $com_name);
    foreach my $row_id ($tax_id, @{ $merged{$tax_id} || [] }) {
        print {$tax_fh} join("\t", $row_id, @shared_cols) . "\n";
    }
}
close($nodes_fh);
close($tax_fh) or die "Error closing taxon.bcp: $!\n";