ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/go2bcp.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 4025 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use LWP::Simple;
5 use FindBin;use lib $FindBin::Bin;
6 use dbSession;
# Where the ontology is fetched from when no local file is given with -f.
my $url=
 'http://www.geneontology.org/ontology/gene_ontology_edit.obo';
my $usage = qq{Usage:
go2bcp.pl [-f <input_obo_file>]

Parses and loads the GO data in geanno db.
The GO data are either downloaded from the embedded url:
$url
..or, if -f option is given, from a local .obo file downloaded previously.
};
# Keep group write permission on every file this script creates.
umask 0002;

# Collect options into a hash instead of reaching into Getopt::Std's
# package variables.
my %opt;
getopts('f:', \%opt) or die($usage."\n");

# Path of the .obo file to parse; filled in by the download step below
# when -f was not given (or was given an empty/false value).
my $file=$opt{f};
unless ($file) {
    print STDERR "downloading file..\n";
    $file='gene_ontology_data.obo';

    # Never parse a stale copy left over from a previous run: if the old
    # file cannot be removed, stop before anything is downloaded.
    if (-e $file && !unlink($file)) {
        die("Error: file $file already exists (couldn't remove: $!)\n");
    }

    # getstore() returns the HTTP status code; keep it so that a failure
    # can say what the server actually answered.
    my $rc=getstore($url, $file);
    die "Error downloading with getstore()! (HTTP status $rc for $url)\n"
        if is_error($rc);
    die "Error: file $file is non-existent or zero size after retrieval!\n"
        unless -s $file;
    print STDERR "Download OK.\n";
}
32
# One-letter codes stored in the link rows, keyed by the kind of
# term-to-term link found in the .obo file.
my %relcode = (
    part_of => 'P',
    is_a    => 'I',
    alt_id  => 'A',
    upd_ann => 'U',
);

# One-letter codes for the three GO namespaces, keyed by the value of the
# 'namespace:' tag.
my %namespace = (
    biological_process => 'P',
    cellular_component => 'C',
    molecular_function => 'F',
);
#create the bcp files for loading
# The handle names are used by the parsing code and by goTerm(), so they
# stay as package filehandles. Every open uses the three-argument form and
# reports the system error on failure.

open(GOTERM, '>', 'go_term.bcp')
    or die("Error creating file go_term.bcp! ($!)\n");
open(GOLINK, '>', 'go_link.bcp')
    or die("Error creating file go_link.bcp! ($!)\n");
open(GOSYN, '>', 'go_synonyms.bcp')
    or die("Error creating file go_synonyms.bcp! ($!)\n");
open(GOXREF, '>', 'go_xref.bcp')
    or die("Error creating file go_xref.bcp! ($!)\n");
open(GOSET, '>', 'go_subset.bcp')
    or die("Error creating file go_subset.bcp! ($!)\n");

#my $ds=dbSession->new('geanno@NEOSYBASE');

# $file may come straight from the command line (-f). The explicit '<' mode
# keeps a name that starts with '>' or ends with '|' from being treated as
# an open mode or a command.
open(FGO, '<', $file) or die("Error opening $file ($!)\n");
54
# State of the [Term] stanza being parsed. goTerm() writes these fields out
# as one row whenever a stanza ends.
my $c_id; #current go term being parsed (numeric part of GO:nnnnnnn)
my $c_type; # 'P', 'F', or 'C'
my $c_name;
my $c_def;
my $c_obsolete=0;
my $c_comment;

# Read the .obo file line by line. Link, synonym, xref and subset rows are
# written as soon as their tag line is seen; the term row itself is written
# by goTerm() when the stanza is complete.
while (<FGO>) {
    if (my ($stanza)=m/^\s*\[([^\]]+)\]/) {
        # ANY stanza header ends the term in progress. Without this, the
        # tag lines of a following [Typedef] or [Instance] stanza would be
        # attributed to the last GO term and overwrite its name and type.
        goTerm() if $c_id;
        $c_id=undef;
        next unless $stanza eq 'Term';

        # Skip ahead to the stanza's 'id:' line, stopping at end of file.
        my $id;
        while (!$id && defined(my $line=<FGO>)) {
            ($id)=($line=~/^id:\s*GO\:(\d+)/);
        }
        die("Couldn't parse current term ID!\n") unless $id;
        $c_id=int($id); # drops the leading zeros
        ($c_name, $c_type, $c_def, $c_comment)=(undef,undef,undef,undef);
        $c_obsolete=0;
        next;
    }
    next unless $c_id; # file header, or inside a non-Term stanza
    next if m/^\s*$/;
    chomp;
    # Tabs separate the output columns, so none may survive inside a value.
    s/\s+$//;tr/\t/ /s;
    if (/^name:\s*(.+)/) {
        $c_name=$1;
    }
    elsif (/^namespace:\s*(\S+)/) {
        # stays undef for a namespace that is not in %namespace
        $c_type=$namespace{lc($1)};
    }
    elsif (/^def:\s*(.+)/) {
        $c_def=$1;
        $c_def =~ tr/"//d; # strip the double quotes around the definition
    }
    elsif (/^is_obsolete:\s*true/) {
        $c_obsolete=1;
    }
    elsif (/^relationship:\s*(\S+)\s*GO:(\d+)/) {
        my ($rel, $id2)=($1, int($2));
        # NOTE(review): any relationship type missing from %relcode aborts
        # the whole run -- confirm this is still wanted for current GO files.
        $rel=$relcode{lc($rel)} ||
            die("Error: unrecognized relationship code '$rel' for term $c_id ($_)\n");
        print GOLINK join("\t",$c_id, $rel, $id2)."\n";
    }
    elsif (/^alt_id:\s*GO:(\d+)/) {
        my $id2=int($1);
        print GOLINK join("\t",$c_id, $relcode{'alt_id'}, $id2)."\n";
    }
    elsif (/^is_a:\s*GO:(\d+)/) {
        my $id2=int($1);
        print GOLINK join("\t",$c_id, $relcode{'is_a'}, $id2)."\n";
    }
    elsif (/^subset:\s*(\S+)/) {
        my $subset=$1;
        print GOSET join("\t",$c_id, $subset)."\n";
    }
    elsif (/^synonym:/) {
        # quoted synonym text followed by its type word
        my ($s,$t)=(m/\"(.+?)\"\s*(\w+)/);
        print STDERR "WARNING: parsing synonym & its type at $c_id, $_\n" unless $t;
        print GOSYN join("\t",$c_id, $s, $t)."\n" if $t;
    }
    elsif (m/^xref:\s*(\S+):(.+)$/) {
        my ($xdb, $xref)=($1,$2);
        $xref=~s/^\s+//;$xref=~s/\s+$//;
        print GOXREF join("\t",$c_id, $xdb, $xref)."\n";
    }
    elsif (/^comment:\s*(.+)/) {
        my $comment=$1;$comment=~tr/\t\n/ /s;
        $c_comment=$comment;
        # Each "to update annotation ... GO:nnn" mention in the comment
        # becomes an 'upd_ann' link from this term to the mentioned term.
        my @upds=($comment=~m/to\s*update\s*annotation.+?GO:(\d+)/g);
        foreach my $u_id (@upds) {
            print GOLINK join("\t",$c_id, $relcode{'upd_ann'}, int($u_id))."\n";
        }
    }

} #while
# Write out the term that was still being parsed when the input ended.
goTerm() if $c_id;
close(FGO);

# Buffered write errors (disk full, quota) only show up at close time.
# Check every output file so that a truncated file is never loaded.
close(GOTERM) or die("Error closing file go_term.bcp! ($!)\n");
close(GOLINK) or die("Error closing file go_link.bcp! ($!)\n");
close(GOSYN)  or die("Error closing file go_synonyms.bcp! ($!)\n");
close(GOXREF) or die("Error closing file go_xref.bcp! ($!)\n");
close(GOSET)  or die("Error closing file go_subset.bcp! ($!)\n");


#-- now run bcpin

my $bcpcmd='bcpin -TI -b geanno@NEOSYBASE';
if (system($bcpcmd. ' go_term.bcp go_link.bcp go_synonyms.bcp go_xref.bcp go_subset.bcp') != 0) {
    # $? is -1 when the command could not be started at all; otherwise the
    # exit code is in its high byte.
    my $why = ($? == -1) ? "could not run bcpin: $!"
                         : 'exit status '.($? >> 8);
    die("Error at bcpin! ($why)\n");
}
144
# Writes the term currently held in the $c_* variables to GOTERM as one
# tab-separated line: id, type, name, definition, obsolete flag, comment.
sub goTerm {
    my @columns = ($c_id, $c_type, $c_name, $c_def, $c_obsolete, $c_comment);
    print GOTERM join("\t", @columns), "\n";
}

Properties

Name Value
svn:executable *