1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use LWP::Simple; |
5 |
use FindBin;use lib $FindBin::Bin; |
6 |
#use dbSession; |
7 |
# Location the input is downloaded from when no -f file is given.
my $url='http://www.expasy.org/cgi-bin/lists?dbxref.txt';
my $usage = qq/Usage:
dbxref2bcp.pl [-f <input_dbxref.txt>]

Parses the UniProt's dbxref.txt info preparing it for loading into geanno db,
table xrefdbs. The required data file is either given explicitely or
downloaded from the embedded url:
$url
/;

umask 0002;
getopts('f:') || die($usage."\n");
# getopts() stores the -f argument in $Getopt::Std::opt_f (and exports it).
my $file=$Getopt::Std::opt_f;
unless ($file) {
    # No input file given: fetch a fresh copy into the current directory.
    print STDERR "downloading file..\n";
    $file='dbxref_uniprot.txt';
    unlink($file);
    die("Error: file $file already exists (couldn't remove!)\n")
        if -f $file;
    die "Error downloading with getstore()!\n"
        if is_error(getstore($url, $file));
    die "Error: file $file is non-existent or zero size after retrieval!\n"
        unless -s $file;
    print STDERR "Download OK.\n";
}

# Tags written to the output file, in column order (see putValues()).
my @knowntags=('Abbrev', 'Name', 'Cat', 'LinkTp', 'Ref', 'Server', 'Db_URL', 'Note');
# xrefdb name cat linktp linktp_info server db_url note
# NOTE(review): the column list above mentions linktp_info, but @knowntags
# holds 'Ref' in that position and the 'linfo' value split off from LinkTp
# below is not in @knowntags, so it is never written -- confirm intent.
my %known;
# Accepted tags; storeCurVal() later overwrites each value with the longest
# field length seen for that tag.
@known{@knowntags}=(1) x scalar(@knowntags);

# Three-argument open so that characters in a user-supplied file name are
# never interpreted as an open mode or a pipe. '-' keeps meaning STDIN, as it
# did with the two-argument form.
if ($file eq '-') {
    open(INF, '<&', \*STDIN) || die("Error opening file $file: $!\n");
}
else {
    open(INF, '<', $file) || die("Error opening file $file: $!\n");
}

# Tag and (possibly multi-line) value of the field currently being read.
my ($curtag, $curval);
open(OUTF, '>', 'xrefdbs.bcp') || die("Error creating file xrefdbs.bcp: $!\n");

# Fields collected so far for the current record, keyed by tag.
my %val;
while (<INF>) {
    if (m/^$/) {
        # Empty line: end of record.
        if ($curtag) {
            storeCurVal();
            putValues();
            ($curtag,$curval)=(undef,undef);
        }
        next;
    }
    if (m/^Abbrev:\s*(.+)/) {
        # An Abbrev line starts a new record.
        my $abbrev=$1;
        # If the previous record was not closed by an empty line, its last
        # field is still pending: store it before flushing that record.
        storeCurVal() if $curtag;
        putValues();
        ($curtag, $curval)=('Abbrev',$abbrev);
        next;
    }
    # Everything seen before a record's Abbrev line is skipped.
    next unless $curtag;
    chomp;
    if (m/^(\S+)\s*:(.+)/) { #new tag
        my ($t, $v)=($1,$2);
        die("Error: unrecognized tag $t at '$_'!\n") unless exists $known{$t} || $t eq 'AC';
        # Always store the field that just ended, also when the new tag is
        # AC; otherwise the pending value would be silently dropped. An AC
        # value itself ends up under $val{AC}, which is not in @knowntags
        # and is therefore never written out.
        storeCurVal();
        $curtag=$t;
        $curval=$v;
        if ($curtag eq 'LinkTp') {
            # "LinkTp: <type>; <info>" is split: <type> is stored right away
            # under LinkTp, <info> continues as the pseudo-tag 'linfo'.
            if ($v=~m/^\s*(\w[\w ]+)\;\s+(\S.+)/) {
                $curval=$1;
                $val{$curtag}=$curval;
                $curtag='linfo';
                $curval=$2;
            }
        }
    }
    elsif (m/^\s+(.+)/) {
        # Indented line: continuation of the current value, joined with a
        # single space.
        my $vx=$1;
        $vx=~s/^\s+//;$vx=~s/\s+$//;
        $curval.=' '.$vx;
    }
    # Any other line (not indented, no "tag:" prefix) is ignored.
}

# Flush the last record when the input does not end with an empty line.
storeCurVal() if ($curtag);
putValues();

close(INF);
# Buffered write errors only surface when the handle is closed.
close(OUTF) || die("Error writing file xrefdbs.bcp: $!\n");
89 |
|
90 |
# map { print STDERR $_."\t".$known{$_}."\n" } @knowntags; |
91 |
|
92 |
# Writes the record collected in %val to OUTF as one tab-separated line,
# with one column per entry of @knowntags (in that order), then empties
# %val. Does nothing when no field has been collected.
sub putValues {
    return unless %val;
    print OUTF join("\t", @val{@knowntags}), "\n";
    %val = ();
    return;
}
98 |
|
99 |
# Trims surrounding whitespace from $curval, records it in %val under
# $curtag, and keeps the largest value length seen so far for that tag
# in %known.
sub storeCurVal {
    for ($curval) {
        s/^\s+//;
        s/\s+$//;
    }
    $val{$curtag} = $curval;
    my $len = length($curval);
    $known{$curtag} = $len if $len > $known{$curtag};
}