#!/usr/bin/perl
# hgnc_db_update.pl - read an HGNC download (tab-delimited text) and write
# "bcp" tab-delimited load files.  The expected input columns and the
# download URL are described in $usage below.
use strict;
use Getopt::Std;
use FindBin;
use lib $FindBin::Bin;

my $usage = q{Usage:
hgnc_db_update.pl [-f <path/to/hgnc_download.txt>]

Prepares data from input hgnc data into "bcp" tab delimited
format for the following tables in database "common":

hgnc
hgnc_alias
hgnc_xref

Expected input columns:

0. HGNC ID
1. Approved Symbol
2. Approved Name
3. Status
4. Previous Symbols
5. Aliases
6. Name Aliases
7. Chromosome
8. Accession Numbers
9. Entrez Gene ID
10. Ensembl Gene ID
11. RefSeq IDs
12. Primary IDs
13. Secondary IDs
14. CCDS IDs
15. Entrez Gene ID (mapped data supplied by NCBI)
16. RefSeq (mapped data supplied by NCBI)
17. UniProt ID (mapped data supplied by UniProt)
18. Ensembl ID (mapped data supplied by Ensembl)
19. UCSC ID (mapped data supplied by UCSC)

Download url:

http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_primary_ids&col=gd_secondary_ids&col=gd_ccds_ids&col=md_eg_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status=Entry+Withdrawn&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag
};

# Mask only the "other write" permission bit on the files created below.
umask 0002;

# Called without a hash reference, getopts() stores each option value in a
# package variable named opt_<letter>; -f is read back from there.
# -o also takes a value, but nothing in this part of the script reads it.
getopts('f:o:') || die($usage."\n");
my $infile = $Getopt::Std::opt_f || 'hgnc_download.txt';
die "$usage Error: cannot locate input file $infile\n" unless -f $infile;

# Appears to be the same download URL as the one shown in the usage text.
# It is not referenced in this part of the script.
my $wgeturl = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?'.
    'title=HGNC+output+data&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&'.
    'col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_name_aliases&'.
    'col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_eg_id&'.
    'col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_primary_ids&'.
    'col=gd_secondary_ids&col=gd_ccds_ids&col=md_eg_id&col=md_refseq_id&'.
    'col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&'.
    'status=Entry+Withdrawn&status_opt=2&level=pri&=on&where=&'.
    'order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&'.
    '.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag';

# Three-argument open keeps a user-supplied file name from being parsed for
# mode characters.  The bareword handle names are kept unchanged so that any
# code further down the file that prints to BCPA / BCPX keeps working.
open(INF,  '<', $infile)          || die("Error opening $infile: $!\n");
open(BCP,  '>', 'hgnc.bcp')       || die "Error creating file hgnc.bcp: $!\n";
open(BCPA, '>', 'hgnc_alias.bcp') || die "Error creating file hgnc_alias.bcp: $!\n";
open(BCPX, '>', 'hgnc_xref.bcp')  || die "Error creating file hgnc_xref.bcp: $!\n";

# One hgnc.bcp row per input record:
#   HGNC ID, symbol, name, chromosome, Entrez gene id, current flag,
#   replacement symbol (empty unless the record is withdrawn and names one).
while (my $line = <INF>) {
    next if $line =~ m/^HGNC ID\t/;      # column header row
    chomp $line;
    next unless $line =~ m/\S/;          # a blank line carries no record

    my @t = split(/\t/, $line);

    # A record is withdrawn when its symbol carries a "withdrawn" suffix
    # (which is stripped from the symbol) or its status column says so.
    my $withdrawn = ($t[1] =~ s/[\~\-_ ]withdrawn$//i);
    $withdrawn = 1 if $t[3] =~ m/withdrawn$/i;

    # Withdrawn names may end in "see <symbol>"; capture that symbol.
    # $1 is only read after the match is known to have succeeded.
    my $replacedby = '';
    if ($withdrawn && $t[2] =~ m/see ([\w\.\-]+)$/) {
        $replacedby = $1;
    }

    my $current = $withdrawn ? 0 : 1;
    # $current=1 if ($t[3]=~m/^approved$/i);

    # Use column 9 (Entrez Gene ID) and fall back to column 15 (the
    # NCBI-supplied mapping) when column 9 is empty.
    my $entrez_geneid = $t[9] || $t[15];

    print BCP join("\t", @t[0..2], $t[7], $entrez_geneid, $current, $replacedby)."\n";
}
close(INF);
# write go_term table
# --
#************ Subroutines **************