ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/hgnc_db_update.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 3139 byte(s)
Log Message:
Line File contents
#!/usr/bin/perl
# hgnc_db_update.pl
#
# Reads a tab-delimited HGNC download and writes one "bcp" (tab-delimited)
# row per input record to hgnc.bcp in the current directory.  The files
# hgnc_alias.bcp and hgnc_xref.bcp are created as well, but this script
# does not write any rows to them yet (see the TODO at the bottom).
#
# Options:
#   -f <file>  input file (default: hgnc_download.txt)
#   -o <arg>   accepted by the option parser but not used by the script
use strict;
use warnings;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

my $usage = q{Usage:
hgnc_db_update.pl [-f <path/to/hgnc_download.txt>]

Prepares data from input hgnc data into "bcp" tab delimited
format for the following tables in database "common":

hgnc
hgnc_alias
hgnc_xref

Expected input columns:

0. HGNC ID
1. Approved Symbol
2. Approved Name
3. Status
4. Previous Symbols
5. Aliases
6. Name Aliases
7. Chromosome
8. Accession Numbers
9. Entrez Gene ID
10. Ensembl Gene ID
11. RefSeq IDs
12. Primary IDs
13. Secondary IDs
14. CCDS IDs
15. Entrez Gene ID (mapped data supplied by NCBI)
16. RefSeq (mapped data supplied by NCBI)
17. UniProt ID (mapped data supplied by UniProt)
18. Ensembl ID (mapped data supplied by Ensembl)
19. UCSC ID (mapped data supplied by UCSC)

Download url:

http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_primary_ids&col=gd_secondary_ids&col=gd_ccds_ids&col=md_eg_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status=Entry+Withdrawn&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag
};

# Number of columns listed in the usage text above (indexes 0..19).
my $NUM_COLUMNS = 20;

umask 0002;

# Collect options into a hash instead of relying on the $opt_* globals.
my %opts;
getopts('f:o:', \%opts) || die($usage."\n");
my $infile = $opts{f} || 'hgnc_download.txt';
die "$usage Error: cannot locate input file $infile\n" unless -f $infile;

# Same URL as in the usage text; kept for reference, not used below.
my $wgeturl='http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?'.
 'title=HGNC+output+data&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&'.
 'col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_name_aliases&'.
 'col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_eg_id&'.
 'col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_primary_ids&'.
 'col=gd_secondary_ids&col=gd_ccds_ids&col=md_eg_id&col=md_refseq_id&'.
 'col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&'.
 'status=Entry+Withdrawn&status_opt=2&level=pri&=on&where=&'.
 'order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&'.
 '.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag';

# Three-argument open so that a -f value can never be read as an open mode.
open(my $inf,  '<', $infile)          || die("Error opening $infile\n");
open(my $bcp,  '>', 'hgnc.bcp')       || die "Error creating file hgnc.bcp\n";
open(my $bcpa, '>', 'hgnc_alias.bcp') || die "Error creating file hgnc_alias.bcp\n";
open(my $bcpx, '>', 'hgnc_xref.bcp')  || die "Error creating file hgnc_xref.bcp\n";

while (my $line = <$inf>) {
    next if $line =~ m/^HGNC ID\t/;    # column header row
    chomp $line;
    next unless $line =~ m/\S/;        # ignore blank lines

    my @t = split(/\t/, $line);
    # split() drops trailing empty columns; pad so every index is defined.
    push(@t, '') while @t < $NUM_COLUMNS;

    # A record counts as withdrawn when its symbol carries a "withdrawn"
    # suffix (which is stripped here) or its status ends in "withdrawn".
    my $withdrawn = ($t[1] =~ s/[\~\-_ ]withdrawn$//i);
    $withdrawn = 1 if ($t[3] =~ m/withdrawn$/i);

    # For withdrawn records, a trailing "see SYMBOL" in the approved name
    # is captured as the replacement symbol.
    my $replacedby = '';
    if ($withdrawn && $t[2] =~ m/see ([\w\.\-]+)$/) {
        $replacedby = $1;
    }

    my $current = $withdrawn ? 0 : 1;
    # $current=1 if ($t[3]=~m/^approved$/i);

    # Use column 9 when it is set, otherwise fall back to column 15.
    my $entrez_geneid = $t[9] || $t[15];

    print $bcp join("\t", @t[0..2], $t[7], $entrez_geneid, $current, $replacedby)."\n";
}
close($inf);

# Buffered write errors only show up at close time, so check them.
close($bcp)  || die "Error closing file hgnc.bcp\n";
close($bcpa) || die "Error closing file hgnc_alias.bcp\n";
close($bcpx) || die "Error closing file hgnc_xref.bcp\n";

# TODO: hgnc_alias.bcp and hgnc_xref.bcp are created above but no rows are
# written to them yet.
# --
#************ Subroutines **************

Properties

Name Value
svn:executable *