ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/ncbitax2bcp.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 2762 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 #use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
# Usage text, printed for -h/--help and prepended to the "missing input" error.
# qq{} is used (rather than q{}) so that $0 expands to the script's own name.
my $usage = qq{Usage:

Download ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
And unpack it in the current directory

Then run $0 which will create
the bcp files for loading into the database.
};
umask 0002; # mask only the other-write bit for the files created below
my %merged; #insert "merged" entries too, even though their data is duplicate
#getopts('o:') || die($usage."\n");
#my $outfile=$Getopt::Std::opt_o;
# Show the usage only for an explicit help request: -h, --h, -help or --help.
die("$usage\n") if @ARGV && $ARGV[0] =~ m/^--?h(?:elp)?$/;
die("$usage Error: cannot find nodes.dmp!\n") unless -f 'nodes.dmp';
# Shell pipeline listing the distinct values of the 5th tab-delimited field of
# nodes.dmp (the rank column); the trailing '|' makes open() read from it.
my $cmd='cut -f5 nodes.dmp | sort -u |';
# TAXC receives the rank ('R') and division ('D') rows written further down.
open(TAXC, '>', 'taxon_cat.bcp') || die "Error creating taxon_cat.bcp file!\n";
22
# Build the rank lookup table and write one 'R' row per rank to taxon_cat.bcp.
# Rank ids are assigned sequentially (starting at 1) in the order the pipe
# delivers the rank names.
my %ranks; #rank_name => rank_id
open(my $rank_fh, $cmd) || die "Error opening $cmd pipe!\n";
my $rank_id = 0;
while (my $rank_name = <$rank_fh>) {
    chomp $rank_name;
    $rank_name =~ s/^\s+//;
    $rank_name =~ s/\s+$//;
    next unless $rank_name;
    $rank_id++;
    $ranks{$rank_name} = $rank_id;
    print TAXC join("\t", 'R', $rank_id, '', $rank_name, '')."\n";
}
# Closing a pipe handle waits for the child and returns false when it exited
# with a non-zero status, so a broken pipeline is reported here instead of
# surfacing later as an "invalid rank" error.
close($rank_fh)
    || die "Error: '$cmd' failed (exit status ".($? >> 8).")!\n";
# The shell reports only the status of the last pipeline stage, so also make
# sure that something was actually read.
die "Error: no ranks could be extracted from nodes.dmp!\n" unless $rank_id;
36
# Read merged.dmp: each record's first-column id is collected under its
# second-column id, so that the nodes pass can emit a duplicate taxon row
# for every collected id.
open(my $merged_fh, '<', 'merged.dmp') || die("Error opening merged.dmp!\n");
while (my $record = <$merged_fh>) {
    chomp $record;
    next unless $record;
    $record =~ s/\t\|$//;    # drop the trailing field terminator
    my ($alias_id, $target_id) = split(/\t\|\t/, $record);
    push @{ $merged{$target_id} }, $alias_id;
}
close($merged_fh);
46
# Write one 'D' row per division.dmp record (its first four columns) to
# taxon_cat.bcp, then finish that file.
open(my $div_fh, '<', 'division.dmp') || die("Error opening division.dmp!\n");
while (my $record = <$div_fh>) {
    chomp $record;
    next unless $record;
    $record =~ s/\t\|$//;    # drop the trailing field terminator
    my @fields = split(/\t\|\t/, $record);
    print TAXC join("\t", 'D', @fields[0..3])."\n";
}
close($div_fh);
# Buffered write errors (e.g. a full disk) only show up when the handle is
# closed, so a failure here means taxon_cat.bcp is incomplete.
close(TAXC) || die "Error closing taxon_cat.bcp: $!\n";
# Copy every names.dmp record to taxon_names.bcp (tab-delimited) and remember,
# per taxon, its scientific name and a common name for the taxon.bcp pass.
open(my $names_fh, '<', 'names.dmp') || die("Error opening names.dmp!\n");
open(my $taxn_fh, '>', 'taxon_names.bcp') || die("Error creating taxon_names.bcp!\n");
my %names; # tax_id => [sci_name, com_name]
while (my $record = <$names_fh>) {
    chomp $record;
    next unless $record;
    $record =~ s/\t\|$//;    # drop the trailing field terminator
    my @t = split(/\t\|\t/, $record);
    print {$taxn_fh} join("\t", @t)."\n";
    my ($tax_id, $name, $name_class) = @t[0, 1, 3];
    if ($name_class =~ /\bcommon name/) {
        # Any class containing the words "common name" matches; when a taxon
        # has several such records the last one read wins.
        my $entry = $names{$tax_id} ||= [''];
        $entry->[1] = $name;
    }
    elsif ($name_class =~ /\bscientific name/) {
        my $entry = $names{$tax_id} ||= [];
        $entry->[0] = $name;
    }
}
close($names_fh);
# Buffered write errors only show up at close time; do not let a truncated
# taxon_names.bcp pass silently.
close($taxn_fh) || die "Error closing taxon_names.bcp: $!\n";
# Write taxon.bcp: one row per nodes.dmp record, plus one duplicate row for
# every id that merged.dmp maps onto that record's taxon id.
# Row layout: id, column 2 of nodes.dmp, rank_id, columns 5 and 6 of
# nodes.dmp, scientific name, common name.
# NOTE(review): columns 2, 5 and 6 are presumably the parent id, division id
# and inherited-division flag -- confirm against the NCBI taxdump readme.
open(my $tax_fh, '>', 'taxon.bcp') || die("Error creating taxon.bcp!\n");
open(my $nodes_fh, '<', 'nodes.dmp') || die("Error opening nodes.dmp!\n");
while (my $record = <$nodes_fh>) {
    chomp $record;
    next unless $record;
    $record =~ s/\t\|$//;    # drop the trailing field terminator
    my @t = split(/\t\|\t/, $record);
    my $rank_id = $ranks{$t[2]};
    die("Error: invalid rank $t[2] from nodes.dmp line:\n$record\n")
        unless defined($rank_id);
    # A taxon without a scientific name is treated as a fatal input error.
    my $d = $names{$t[0]};
    die("Error: names not found for taxon $t[0] at nodes.dmp line:\n$record\n")
        unless $d && $$d[0];
    my $sci_name = $$d[0];
    my $com_name = $$d[1] || '';
    # The taxon's own id first, then any ids merged into it; all share the
    # same data columns.
    foreach my $tid ($t[0], @{ $merged{$t[0]} || [] }) {
        print {$tax_fh} join("\t", $tid, $t[1], $rank_id, $t[4], $t[5],
                             $sci_name, $com_name)."\n";
    }
}
close($nodes_fh);
# Buffered write errors only show up at close time; do not let a truncated
# taxon.bcp pass silently.
close($tax_fh) || die "Error closing taxon.bcp: $!\n";

Properties

Name Value
svn:executable *