gclib/scripts/uniprot2bcp.pl

#!/usr/bin/perl
use strict;
use Getopt::Std;
#use LWP::Simple;
use FindBin;use lib $FindBin::Bin;
use dbSession;
my $usage = qq/Usage:
 uniprot2bcp.pl [-t <taxons.tab>] [-F] uniref100_clusters.pfa [<uniprot_data_file_or_stream>]
 
 Parses the UNIPROT data into bcp files for loading into geanno db.

 Also creates a uniprot fasta file (uniprot.fa) with all the
 uniprot entries found in the input data\/stream

 If no database connection is possible, use -t option to provide a file
 with taxon information; <taxons.tab> can be built with 
 pullsql -b common\@CBCBMYSQL -o taxons.tab 
 .. with the query:
  select tax_id, name, c_name from taxon
  
 If -F option is given, ONLY the fasta file will be created. 
/;
umask 0002;
getopts('t:F') || die($usage."\n");
my $fastaOnly=$Getopt::Std::opt_F;
my $ftaxon=$Getopt::Std::opt_t;
my @tax;
if ($ftaxon) {
 open(TAX, $ftaxon)||die("Error opening $ftaxon\n");
 while (<TAX>) {
  chomp;
  my @t=split(/\t/);
  $tax[int($t[0])]=[$t[1],$t[2]];
  }
 close(TAX);
 }

#create the bcp files for loading

my $funiref=shift(@ARGV) || die("$usage\n");

my $ds;
unless ($ftaxon) {
 $ds=dbSession->new('common@CBCBMYSQL');
 }

my %uniref; # uniprot_id => uniref_cluster_id
open(UNIREF, $funiref) || die("Error opening $funiref file!\n");
while (<UNIREF>) {
 chomp;
 s/^>//;
 my @ids=split;
 unless (@ids>1) {
   print STDERR "WARNING: cluster: '$_' has no members!\n";
   next;
   }
 my $clname=shift(@ids);
 foreach my $upid (@ids) {
   $uniref{$upid}=$clname;
   }
 }
close(UNIREF);
print STDERR "UniRef clusters loaded, now parsing the main uniprot KB data..\n";

open(UPFA, '>uniprot.fa') || die ("Error creating file uniprot.fa!\n");
unless ($fastaOnly) {
 open(UNIPROT, '>uniprot.bcp') || die ("Error creating file uniprot.bcp!\n");
 open(UNAMES, '>uniprot_names.bcp') || die ("Error creating file uniprot_names.bcp!\n");
 open(UXREF,  '>uniprot_xref.bcp') || die ("Error creating file uniprot_xref.bcp!\n");
}

#open(FGO, $file) || die ("Error opening $file\n");
my $sth;
unless ($ftaxon) {
 $sth=$ds->prep('select name, c_name from taxon where tax_id=?');
 }
my $c_id; #current uniprot ID
my $c_taxon;
my $c_rev=0;
my $c_len;
my $c_priacc;
my $c_descr;
my $c_gene;
my $c_uniref;
my $c_htaxon;
my $seq;
my $inSeq=0;
my $xcode=0;
my ($c_orgname, $c_comname);
while (<>) {
 if (m{^//}) {
   writeUP() if $c_id;
   next;
   }
if ($inSeq) {
  #within SQ record
  chomp;
  tr/\t //d;
  $seq.=$_;
  next;
  }
if (m/^SQ\s+SEQUENCE\s+(\d+)/) {
  my $seqlen=$1;
  die("Error: invalid length for $c_id ($c_len vs $seqlen)!\n") if ($seqlen != $c_len);
  $inSeq=1;
  next;
  }
if (m/^ID\s+(\S+)\s+(\S+)\s+(\d+)/) {
  my ($id, $rev, $len)=($1,$2,$3);
  writeUP() if $c_id;
  ($c_id, $c_len)=($id, $len);
  $c_rev=($rev=~/^Reviewed/)?'1':'0';
  next;
  }# ID
if (m/^AC\s+(\S.+)/) {
  my $a=$1;
  chomp;
  my @accs=split(/;\s*/, $a);
  $c_priacc=$accs[0] unless $c_priacc;
  unless ($fastaOnly) {
    foreach my $acc (@accs) {
     next unless $acc;
     print UXREF join("\t",$c_id, 'UP_ACC', '', $acc, '')."\n";
     }
    }
  next;
  }# AC
if (m/^DE\s+(\S.+)/) {
  my $d=$1;
  $c_descr.=' '.$d;
  next;
  }
if (m/^PE\s+(\d+)\:/) {
  $xcode=$1;
  next;
  }
if (m/^GN\s+(\S.+)/) {
  my $d=$1;
  my ($gname)=($d=~m/\bName\s*=\s*([\w\-\.]+)/);
  if ($gname) {
    $c_gene=$gname unless $c_gene;
    unless ($fastaOnly) {
     print UXREF join("\t", $c_id, 'UP_GENE', '', $gname, '')."\n";
     my ($gsyns)=($d=~m/\bSynonyms\s*=\s*(\S[\w\-\,\.\s]+)\;/);
     if ($gsyns) {
       my @gsyn=split(/\,\s*/,$gsyns);
       foreach my $gs (@gsyn) {
         print UNAMES join("\t",$gname, 'GSYN', $gs)."\n";
         }
       } #synonyms found
     }
    }#a gname found
  next;
  }#GN
if (s/^OS\s+//) {
  ($c_orgname)=(m/^([\w\. \-]+)/);
  $c_orgname=~s/\s+$//;
  $c_orgname=~s/\.$//;
  ($c_comname)=(m/\(([\w\. \-]+)/);
  next;
  }
if (m/^OX\s+NCBI_TaxID\s*=\s*(\d+)/) {
 my $t=$1;
 $c_taxon=$t unless $c_taxon;
 print UXREF join("\t", $c_id, 'UP_Taxon', $t, '', '')."\n"
    unless $fastaOnly; #taxon
 next;
 }#OX (Taxon)

if (m/^OH\s+NCBI_TaxID\s*=\s*(\d+)/) {
 my $t=$1;
 $c_htaxon=$t unless $c_htaxon;
 print UXREF join("\t", $c_id, 'UP_Host', $t, '', '')."\n"
  unless $fastaOnly; #host taxon
 next;
 } #OH (Host taxon)
 
if (m/^DR\s+(\S.+)/) { # db xref 
 my $d=$1;
 $d=~s/\-?\.\s*$//;
 $d=~s/\;\s*$//;
 my @xd=split(/\;\s*/,$d);
 @xd=grep { !/^\-/ } @xd;
 my $xdb=shift(@xd);
 if ($xdb eq 'GO') {
   my $goid=shift(@xd);
   my ($gonum)=($goid=~/GO:(\d+)/);
   $gonum=int($gonum);
   die("Error parsing GO ID from DR line: $_\n") unless $gonum>0;
   my $gobranch=shift(@xd);
   my $go_ann_type=shift(@xd);
   print UXREF join("\t", $c_id, $xdb, $gonum, $goid, $go_ann_type.' '.$gobranch)."\n"
     unless $fastaOnly;
   }
 else {
   my $acc=shift(@xd);
   my $data=join(' ', @xd);
   print UXREF join("\t", $c_id, $xdb, '', $acc, $data)."\n" unless $fastaOnly;
   }
 next;
 } #DR 
} #while

writeUP() if $c_id;
unless ($fastaOnly) {
 close(UNIPROT);
 close(UNAMES);
 close(UXREF);
 }
close(UPFA);
unless ($ftaxon) {
  $ds->logout();
  }
#-- now run bcpin

#my $bcpcmd='bcpin -TI -b geanno@NEOSYBASE';
#system($bcpcmd. ' uniprot.bcp uniprot_names.bcp uniprot_xref') 
#  && die("Error at bcpin!\n");

sub writeUP {
 # c_descr now can contain strange RecName: Full=...; and AltName: Full=...;
 # keep only the RecName Full, remove anything else
 if ($c_descr=~m/Name:\s+Full=\s*([^\;]+)/) { # RecName or SubName
   $c_descr=$1;
   }
  # elsif ($c_descr=~m/SubName:\s+Full=\s*([^\;]+)/) {
  # $c_descr=$1;
  # }
   
 prepFields($c_descr);
 # -- c_descr can probably be insanely long.. truncate it here to the max field length
 $c_uniref=$uniref{$c_id} || $uniref{$c_priacc};
 unless ($fastaOnly) {
  print UNIPROT join("\t",
   $c_id, $c_taxon, $c_rev, $c_len, $c_priacc, $c_descr, $c_gene, $xcode, $c_uniref, $c_htaxon)."\n";
  }
 my $upre=($c_rev==1)?'UPr|':'UP|';
 #$upre.='e' if $xcode<3;
 my $defline='';
 $defline.='|gid|'.$c_gene if $c_gene;
 $defline.=' '.$c_uniref if $c_uniref;
 $defline.=' '.$c_descr;
 my ($oname, $coname);
 if ($ftaxon) {
    my $td=$tax[$c_taxon];
    if ($td) {
       ($oname, $coname)=@$td;
       }
     #else {
     #  print STDERR "Warning: no data for taxon $c_taxon".
     #        " was loaded ($c_id|acc|$c_priacc $defline)!\n";
     #  }
    }
  else {
   $ds->exec($sth, $c_taxon);
   while (my $rr=$ds->fetch($sth)) { ($oname, $coname)=@$rr; };
   }
 unless ($oname) {
   print STDERR "Warning: no data for taxon $c_taxon".
           " ($c_id|acc|$c_priacc $defline) - using '$c_orgname' instead.\n";
   $oname=$c_orgname;
   }
 $defline.=' {'.$oname.'}';
 $defline=~tr/;"\t/.' /; #"
 print UPFA '>'.$upre.$c_id.'|acc|'.$c_priacc.$defline."\n";
 print UPFA fastafmt(\$seq);
 $inSeq=0;
 ($c_id, $c_taxon, $c_rev, $c_len, $c_priacc, $c_descr, $c_gene, $xcode, $c_uniref, $c_htaxon)=
 (undef,    undef,    0,        undef,  undef,   undef,   undef,   0,      undef,     undef  );
 ($c_orgname, $c_comname)=(undef,undef);
 $seq=''; 
}

sub fastafmt {
 my $s=shift;
 my $slen=length($$s);
 my @lines=unpack("A60" x (int(($slen-1)/60)+1),$$s);
 return join("\n",@lines)."\n";
 }
 
sub prepFields {
 foreach (@_) {
  s/^\s+//;
  s/\s+$//;
  s/\.$//;
  tr/\t \n/   /s;
  }
 }
Revision:	23
Committed:	Tue Jul 26 21:44:38 2011 UTC (13 years, 1 month ago) by gpertea
Original Path:	*ann_bin/uniprot2bcp.pl*
File size:	7236 byte(s)
Log Message:	adding misc scripts
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	#use LWP::Simple;
5	use FindBin;use lib $FindBin::Bin;
6	use dbSession;
7	my $usage = qq/Usage:
8	uniprot2bcp.pl [-t <taxons.tab>] [-F] uniref100_clusters.pfa [<uniprot_data_file_or_stream>]
9
10	Parses the UNIPROT data into bcp files for loading into geanno db.
11
12	Also creates a uniprot fasta file (uniprot.fa) with all the
13	uniprot entries found in the input data\/stream
14
15	If no database connection is possible, use -t option to provide a file
16	with taxon information; <taxons.tab> can be built with
17	pullsql -b common\@CBCBMYSQL -o taxons.tab
18	.. with the query:
19	select tax_id, name, c_name from taxon
20
21	If -F option is given, ONLY the fasta file will be created.
22	/;
23	umask 0002;
24	getopts('t:F') \|\| die($usage."\n");
25	my $fastaOnly=$Getopt::Std::opt_F;
26	my $ftaxon=$Getopt::Std::opt_t;
27	my @tax;
28	if ($ftaxon) {
29	open(TAX, $ftaxon)\|\|die("Error opening $ftaxon\n");
30	while (<TAX>) {
31	chomp;
32	my @t=split(/\t/);
33	$tax[int($t[0])]=[$t[1],$t[2]];
34	}
35	close(TAX);
36	}
37
38	#create the bcp files for loading
39
40	my $funiref=shift(@ARGV) \|\| die("$usage\n");
41
42	my $ds;
43	unless ($ftaxon) {
44	$ds=dbSession->new('common@CBCBMYSQL');
45	}
46
47	my %uniref; # uniprot_id => uniref_cluster_id
48	open(UNIREF, $funiref) \|\| die("Error opening $funiref file!\n");
49	while (<UNIREF>) {
50	chomp;
51	s/^>//;
52	my @ids=split;
53	unless (@ids>1) {
54	print STDERR "WARNING: cluster: '$_' has no members!\n";
55	next;
56	}
57	my $clname=shift(@ids);
58	foreach my $upid (@ids) {
59	$uniref{$upid}=$clname;
60	}
61	}
62	close(UNIREF);
63	print STDERR "UniRef clusters loaded, now parsing the main uniprot KB data..\n";
64
65	open(UPFA, '>uniprot.fa') \|\| die ("Error creating file uniprot.fa!\n");
66	unless ($fastaOnly) {
67	open(UNIPROT, '>uniprot.bcp') \|\| die ("Error creating file uniprot.bcp!\n");
68	open(UNAMES, '>uniprot_names.bcp') \|\| die ("Error creating file uniprot_names.bcp!\n");
69	open(UXREF, '>uniprot_xref.bcp') \|\| die ("Error creating file uniprot_xref.bcp!\n");
70	}
71
72	#open(FGO, $file) \|\| die ("Error opening $file\n");
73	my $sth;
74	unless ($ftaxon) {
75	$sth=$ds->prep('select name, c_name from taxon where tax_id=?');
76	}
77	my $c_id; #current uniprot ID
78	my $c_taxon;
79	my $c_rev=0;
80	my $c_len;
81	my $c_priacc;
82	my $c_descr;
83	my $c_gene;
84	my $c_uniref;
85	my $c_htaxon;
86	my $seq;
87	my $inSeq=0;
88	my $xcode=0;
89	my ($c_orgname, $c_comname);
90	while (<>) {
91	if (m{^//}) {
92	writeUP() if $c_id;
93	next;
94	}
95	if ($inSeq) {
96	#within SQ record
97	chomp;
98	tr/\t //d;
99	$seq.=$_;
100	next;
101	}
102	if (m/^SQ\s+SEQUENCE\s+(\d+)/) {
103	my $seqlen=$1;
104	die("Error: invalid length for $c_id ($c_len vs $seqlen)!\n") if ($seqlen != $c_len);
105	$inSeq=1;
106	next;
107	}
108	if (m/^ID\s+(\S+)\s+(\S+)\s+(\d+)/) {
109	my ($id, $rev, $len)=($1,$2,$3);
110	writeUP() if $c_id;
111	($c_id, $c_len)=($id, $len);
112	$c_rev=($rev=~/^Reviewed/)?'1':'0';
113	next;
114	}# ID
115	if (m/^AC\s+(\S.+)/) {
116	my $a=$1;
117	chomp;
118	my @accs=split(/;\s*/, $a);
119	$c_priacc=$accs[0] unless $c_priacc;
120	unless ($fastaOnly) {
121	foreach my $acc (@accs) {
122	next unless $acc;
123	print UXREF join("\t",$c_id, 'UP_ACC', '', $acc, '')."\n";
124	}
125	}
126	next;
127	}# AC
128	if (m/^DE\s+(\S.+)/) {
129	my $d=$1;
130	$c_descr.=' '.$d;
131	next;
132	}
133	if (m/^PE\s+(\d+)\:/) {
134	$xcode=$1;
135	next;
136	}
137	if (m/^GN\s+(\S.+)/) {
138	my $d=$1;
139	my ($gname)=($d=~m/\bName\s=\s([\w\-\.]+)/);
140	if ($gname) {
141	$c_gene=$gname unless $c_gene;
142	unless ($fastaOnly) {
143	print UXREF join("\t", $c_id, 'UP_GENE', '', $gname, '')."\n";
144	my ($gsyns)=($d=~m/\bSynonyms\s=\s(\S[\w\-\,\.\s]+)\;/);
145	if ($gsyns) {
146	my @gsyn=split(/\,\s*/,$gsyns);
147	foreach my $gs (@gsyn) {
148	print UNAMES join("\t",$gname, 'GSYN', $gs)."\n";
149	}
150	} #synonyms found
151	}
152	}#a gname found
153	next;
154	}#GN
155	if (s/^OS\s+//) {
156	($c_orgname)=(m/^([\w\. \-]+)/);
157	$c_orgname=~s/\s+$//;
158	$c_orgname=~s/\.$//;
159	($c_comname)=(m/\(([\w\. \-]+)/);
160	next;
161	}
162	if (m/^OX\s+NCBI_TaxID\s=\s(\d+)/) {
163	my $t=$1;
164	$c_taxon=$t unless $c_taxon;
165	print UXREF join("\t", $c_id, 'UP_Taxon', $t, '', '')."\n"
166	unless $fastaOnly; #taxon
167	next;
168	}#OX (Taxon)
169
170	if (m/^OH\s+NCBI_TaxID\s=\s(\d+)/) {
171	my $t=$1;
172	$c_htaxon=$t unless $c_htaxon;
173	print UXREF join("\t", $c_id, 'UP_Host', $t, '', '')."\n"
174	unless $fastaOnly; #host taxon
175	next;
176	} #OH (Host taxon)
177
178	if (m/^DR\s+(\S.+)/) { # db xref
179	my $d=$1;
180	$d=~s/\-?\.\s*$//;
181	$d=~s/\;\s*$//;
182	my @xd=split(/\;\s*/,$d);
183	@xd=grep { !/^\-/ } @xd;
184	my $xdb=shift(@xd);
185	if ($xdb eq 'GO') {
186	my $goid=shift(@xd);
187	my ($gonum)=($goid=~/GO:(\d+)/);
188	$gonum=int($gonum);
189	die("Error parsing GO ID from DR line: $_\n") unless $gonum>0;
190	my $gobranch=shift(@xd);
191	my $go_ann_type=shift(@xd);
192	print UXREF join("\t", $c_id, $xdb, $gonum, $goid, $go_ann_type.' '.$gobranch)."\n"
193	unless $fastaOnly;
194	}
195	else {
196	my $acc=shift(@xd);
197	my $data=join(' ', @xd);
198	print UXREF join("\t", $c_id, $xdb, '', $acc, $data)."\n" unless $fastaOnly;
199	}
200	next;
201	} #DR
202	} #while
203
204	writeUP() if $c_id;
205	unless ($fastaOnly) {
206	close(UNIPROT);
207	close(UNAMES);
208	close(UXREF);
209	}
210	close(UPFA);
211	unless ($ftaxon) {
212	$ds->logout();
213	}
214	#-- now run bcpin
215
216	#my $bcpcmd='bcpin -TI -b geanno@NEOSYBASE';
217	#system($bcpcmd. ' uniprot.bcp uniprot_names.bcp uniprot_xref')
218	# && die("Error at bcpin!\n");
219
220	sub writeUP {
221	# c_descr now can contain strange RecName: Full=...; and AltName: Full=...;
222	# keep only the RecName Full, remove anything else
223	if ($c_descr=~m/Name:\s+Full=\s*([^\;]+)/) { # RecName or SubName
224	$c_descr=$1;
225	}
226	# elsif ($c_descr=~m/SubName:\s+Full=\s*([^\;]+)/) {
227	# $c_descr=$1;
228	# }
229
230	prepFields($c_descr);
231	# -- c_descr can probably be insanely long.. truncate it here to the max field length
232	$c_uniref=$uniref{$c_id} \|\| $uniref{$c_priacc};
233	unless ($fastaOnly) {
234	print UNIPROT join("\t",
235	$c_id, $c_taxon, $c_rev, $c_len, $c_priacc, $c_descr, $c_gene, $xcode, $c_uniref, $c_htaxon)."\n";
236	}
237	my $upre=($c_rev==1)?'UPr\|':'UP\|';
238	#$upre.='e' if $xcode<3;
239	my $defline='';
240	$defline.='\|gid\|'.$c_gene if $c_gene;
241	$defline.=' '.$c_uniref if $c_uniref;
242	$defline.=' '.$c_descr;
243	my ($oname, $coname);
244	if ($ftaxon) {
245	my $td=$tax[$c_taxon];
246	if ($td) {
247	($oname, $coname)=@$td;
248	}
249	#else {
250	# print STDERR "Warning: no data for taxon $c_taxon".
251	# " was loaded ($c_id\|acc\|$c_priacc $defline)!\n";
252	# }
253	}
254	else {
255	$ds->exec($sth, $c_taxon);
256	while (my $rr=$ds->fetch($sth)) { ($oname, $coname)=@$rr; };
257	}
258	unless ($oname) {
259	print STDERR "Warning: no data for taxon $c_taxon".
260	" ($c_id\|acc\|$c_priacc $defline) - using '$c_orgname' instead.\n";
261	$oname=$c_orgname;
262	}
263	$defline.=' {'.$oname.'}';
264	$defline=~tr/;"\t/.' /; #"
265	print UPFA '>'.$upre.$c_id.'\|acc\|'.$c_priacc.$defline."\n";
266	print UPFA fastafmt(\$seq);
267	$inSeq=0;
268	($c_id, $c_taxon, $c_rev, $c_len, $c_priacc, $c_descr, $c_gene, $xcode, $c_uniref, $c_htaxon)=
269	(undef, undef, 0, undef, undef, undef, undef, 0, undef, undef );
270	($c_orgname, $c_comname)=(undef,undef);
271	$seq='';
272	}
273
274	sub fastafmt {
275	my $s=shift;
276	my $slen=length($$s);
277	my @lines=unpack("A60" x (int(($slen-1)/60)+1),$$s);
278	return join("\n",@lines)."\n";
279	}
280
281	sub prepFields {
282	foreach (@_) {
283	s/^\s+//;
284	s/\s+$//;
285	s/\.$//;
286	tr/\t \n/ /s;
287	}
288	}