User Tools


EVS Data Derived Haplotype Pool

This tutorial explains how the EVS dataset for SEQPower was generated from resource on the Exome Variant Server.

wget http://evs.gs.washington.edu/evs_bulk_data/ESP6500SI-V2.snps_indels.vcf.tar.gz


Click to display ⇲

Click to hide ⇱

evs.fmt
# Copyright (C) 2011 Bo Peng (bpeng@mdanderson.org)
# Distributed under GPL. see <http://www.gnu.org/licenses/>
#
# Please refer to http://varianttools.sourceforge.net/Format/New for
# a description of the format of this file.
 
[format description]
description=Import vcf
variant=chr,%(pos)s,%(ref)s,%(alt)s
genotype=%(geno)s
variant_info=%(var_info)s
genotype_info=%(geno_info)s
# variants with identical chr,pos,ref will be collapsed.
export_by=chr,%(pos)s,%(ref)s
 
[DEFAULT]
format_string=
format_string_comment=FORMAT string that is outputted in the 9th column of
    exported vcf file. Please specified an appropriate value corresponding to the
    parameter --geno_info because this column cannot be automatically determined.
 
geno_info=
geno_info_comment=Genotype information fields. No genotype field is imported by
    default. You may edit it into, for example, "geno_info=GD,GQ,PL", if the .vcf
    format field looks like "GT:GD:GQ:PL". Please check the FORMAT string of your .vcf
    file to determine available fields to be imported.
 
geno=GT
geno_comment=Field to extract genotype from .vcf file. You can set it to
    safe_GT if genotype is not the first field in the genotype columns of your .vcf file.
 
var_info=dbSNPVersion,EuropeanAmericanAltCount,EuropeanAmericanRefCount,AfricanAmericanAltCount,AfricanAmericanRefCount,AllAltCount,AllRefCount,EuropeanAmericanMaf,AfricanAmericanMaf,AllMaf,AvgSampleReadDepth,Genes,GeneAccession,FunctionGvs,AminoAcidChange,ProteinPos,cDNAPos,ConservationScorePhastCons,ConservationScoreGERP,GranthamScore,PolyPhenPrediction,ChimpAllele,ClinicalLink,ExomeChip
var_info_comment=Variant information fields to be imported. Please check the
    INFO column of your vcf file for available fields.
 
phase_sep='/'
phase_sep_comment=Seperator used to output genotype, / for unphased and | for
    phased. This parameter is needed because 'vtools import' does not save phase
    information for each genotype.
 
wildtype_code=('0',)
wildtype_code_comment=How wildtype homozygotes are imported. These genotypes are
    by default imported as GT=0. They will be discarded if you set this parameter
    to None.
 
id=
id_comment=The field to output to the third (ID) column of the vcf file. You
    can use a id field (if you imported or updated it from a vcf file), another ID field
    (e.g. rsname) if available, or '' to output all missing values.
 
pos=pos
pos_comment=Field for position. To export indel, set it to 'pos-length(upstream)'
 
ref=ref
ref_comment=Field for reference allele
 
alt=alt
alt_comment=Field for alternative allele
 
qual=
qual_comment=Field for quality score
 
filter=
filter_comment=Field for filter status
 
info=
info_comment=Field for info status
 
[field formatter]
#
# NOTE: if multiple records are collapsed, they are passed as tuples.
# If no formatter is defined, the first value will be used in output.
#
fmt_GT=GenoFormatter(style='vcf')
fmt_DP=IfMulti(Formatter('DP={[0]}'), Formatter('DP={}'))
# if there are multiple alternative alleles, join them by ','
#
# not sure if we need to keep both JoinRecords and JoinFields
fmt_alt=JoinRecords(',')
fmt_raw_alt=JoinRecords(',')
 
[col_1]
field=chr
# remove fields with empty chromosome
adj=ValueOfNull(None)
comment=chromosome
 
[col_2]
field=pos
comment=position
 
[col_3]
field=%(id)s
adj=ValueOfNull('.')
comment=id
 
[col_4]
field=%(ref)s
comment=reference allele
 
[col_5]
field=%(alt)s
comment=alternative alleles
 
[col_6]
field=%(qual)s
adj=ValueOfNull('0')
comment=quality
 
[col_7]
field=%(filter)s
adj=ValueOfNull('PASS')
comment=filter
 
[col_8]
field=%(info)s
adj=ValueOfNull('.'), JoinFields(';')
comment=variant info
 
[col_9]
field=
adj=Constant("%(format_string)s")
comment=genotype format
 
[col_10]
field=%(geno)s,%(geno_info)s
adj=JoinFields(':')
comment=genotype
 
 
[chr]
index=1
type=VARCHAR(20)
adj=RemoveLeading('chr')
comment=Chromosome
 
[pos]
index=2
type=INTEGER NOT NULL
comment=1-based position
 
[raw_pos]
index=2
type=INTEGER
comment=1-based position
 
[id]
index=3
type=VARCHAR(48)
adj=Nullify('.')
comment=variant id (rs number or other IDs)
 
[ref]
index=4
type=VARCHAR(255)
comment=Reference allele, '-' for insertion.
 
[alt]
index=5
adj=CheckSplit()
type=VARCHAR(255)
comment=Alternative allele, '-' for deletion.
 
[raw_ref]
index=4
type=VARCHAR(255)
comment=Reference allele, without remove common leading and ending nucleotide.
 
[raw_alt]
index=5
adj=CheckSplit()
type=VARCHAR(255)
comment=Alternative allele, without remove common leading and ending nucleotide.
 
[upstream]
index=4,5
adj=CommonLeading()
type=VARCHAR(255)
comment=Common leading alleles of ref and alt alleles stored in .vcf
    file. This field is only available for indels.
 
[downstream]
index=4,5
adj=CommonEnding()
type=VARCHAR(255)
comment=Common ending alleles of ref and alt alleles stored in .vcf
    file, common leading is extracted before common ending. This field
    is only available for indels.
 
[qual]
index=6
type=FLOAT
comment=phred-scaled quality score for the assertion made in ALT. High QUAL scores indicate high confidence calls.
 
[filter]
index=7
type=VARCHAR(255)
comment=PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail.
 
[GT]
index=10:
type=INTEGER
adj=VcfGenotype(default=%(wildtype_code)s)
comment=Gentoype coded as 0 (ref ref), 1 (ref alt), 2 (alt alt) or -1 (alt1, alt2), assuming GT is the first FORMAT field in the .vcf file. Missing genotype will be dropped.
 
[safe_GT]
# This vcf genotype extractor uses format string and genotype to
# extract genotype. Although the format string should not be needed
# because the genotype field should be the first one, one of my dataset
# does not following this rule. The performance penalty is significant
# 4.36 -> 3.03 for 50k records.
index=9, 10:
adj=VcfGenoFromFormat(default=%(wildtype_code)s)
type=INTEGER
comment=Gentoype coded as 0 (ref ref), 1 (ref alt), 2 (alt alt) or -1 (alt1, alt2). This field checks the FORMAT string and extract genotype accordingly. Missing genotype will be dropped.
 
[info]
index=8
type=VARCHAR(255)
comment=Raw INFO column in the vcf file. This may be further splitted into various specified info fields, such as DP, etc.
[dbSNPVersion]
index=8
type=VARCHAR(255)
adj=ExtractValue('DBSNP=', ';'), Nullify('.')
comment=dbSNP version which established the rs_id
 
[EuropeanAmericanAltCount]
index=8
type=INTEGER
adj=ExtractValue('EA_AC=', ';'), ExtractField(1, ','), Nullify('.')
comment=The observed ref allele counts for the European American population. Allele counts only include genotypes with quality >= 30 and read depth >= 10.
 
[EuropeanAmericanRefCount]
index=8
type=INTEGER
adj=ExtractValue('EA_AC=', ';'), ExtractField(2, ','), Nullify('.')
comment=The observed ref allele counts for the European American population. Allele counts only include genotypes with quality >= 30 and read depth >= 10.
 
[AfricanAmericanAltCount]
index=8
type=INTEGER
adj=ExtractValue('AA_AC=', ';'), ExtractField(1, ','), Nullify('.')
comment=The observed alt allele counts for the African American population. Allele counts only include genotypes with quality >= 30 and read depth >= 10.
 
[AfricanAmericanRefCount]
index=8
type=INTEGER
adj=ExtractValue('AA_AC=', ';'), ExtractField(2, ','), Nullify('.')
comment=The observed ref allele counts for the African American population. Allele counts only include genotypes with quality >= 30 and read depth >= 10.
 
[AllAltCount]
index=8
type=INTEGER
adj=ExtractValue('TAC=', ';'), ExtractField(1, ','), Nullify('.')
comment=The observed alt allele counts for all populations. Allele counts only include genotypes with quality >= 30 and read depth >= 10.
 
[AllRefCount]
index=8
type=INTEGER
adj=ExtractValue('TAC=', ';'), ExtractField(2, ','), Nullify('.')
comment=The observed ref allele counts for all populations. Allele counts only include genotypes with quality >= 30 and read depth >= 10.
 
[EuropeanAmericanMaf]
index=8
type=FLOAT
adj=ExtractValue('MAF=', ';'), ExtractField(1, ','), lambda x: float(x)/100.0, Nullify('.')
comment=The European American minor-allele frequency in percent.
 
[AfricanAmericanMaf]
index=8
type=FLOAT
adj=ExtractValue('MAF=', ';'), ExtractField(2, ','), lambda x: float(x)/100.0, Nullify('.')
comment=The African American minor-allele frequency in percent.
 
[AllMaf]
index=8
type=FLOAT
adj=ExtractValue('MAF=', ';'), ExtractField(3, ','), lambda x: float(x)/100.0, Nullify('.')
comment=The minor-allele frequency in percent for all populations.
 
[AvgSampleReadDepth]
index=8
type=INTEGER
adj=ExtractValue('DP=', ';'), Nullify('.')
comment=The average sample read depth.
 
[Genes]
index=8
type=VARCHAR(255)
adj=ExtractValue('GL=', ';'), Nullify('none')
comment=One or more genes for which the SNP is in the coding region (CCDS).
 
[GeneAccession]
index=8
type=VARCHAR(255)
adj=ExtractValue('GM=', ';'), Nullify('none')
comment=NCBI mRNA transcripts accession number.
 
[FunctionGvs]
index=8
type=VARCHAR(255)
adj=ExtractValue('FG=', ';'), Nullify('.')
comment=The GVS functions are calculated by the Exome Variant Server; they are based on the alleles for all populations and individuals; the bases in the coding region are divided into codons (if a multiple of 3), and the resulting amino acids are examined.
 
[AminoAcidChange]
index=8
type=VARCHAR(20)
adj=ExtractValue('AAC=', ';'), Nullify('none'), Nullify('none, none'), Nullify('.')
comment=The corresponding amino acid change for a SNP.
 
[ProteinPos]
index=8
type=VARCHAR(20)
adj=ExtractValue('PP=', ';'), Nullify('NA,NA'), Nullify('NA'), Nullify('.')
comment=The coresponding amino acid postion in a protein relative to the whole protein length.
 
[cDNAPos]
index=8
type=INTEGER
adj=ExtractValue('CDP=', ';'), Nullify('NA,NA'), Nullify('NA'), Nullify('.')
comment=The coresponding cDNA postion for a SNP.
 
[ConservationScorePhastCons]
index=8
type=FLOAT
adj=ExtractValue('CP=', ';'), Nullify('NA'), Nullify('.')
comment=A number between 0 and 1 that describes the degree of sequence conservation among 17 vertebrate species; these numbers are downloaded from the UCSC Genome site and are defined as the "posterior probability that the corresponding alignment column was generated by the conserved state of the phylo-HMM, given the model parameters and the multiple alignment" (see UCSC description).
 
[ConservationScoreGERP]
index=8
type=FLOAT
adj=ExtractValue('CG=', ';'), Nullify('NA'), Nullify('.')
comment=The rejected-substitution score from the program GERP, a number between -11.6 and 5.82 that describes the degree of sequence conservation among 34 mammalian species, with 5.82 being the most conserved; these scores were provided by Gregory M. Cooper of the University of Washington Department of Genome Sciences to the EVS project.
 
[GranthamScore]
index=8
type=INTEGER
adj=ExtractValue('GS=', ';'), Nullify('NA,NA'), Nullify('NA'), Nullify('.')
comment=Grantham Scores categorize codon replacements into classes of increasing chemical dissimilarity based on the publication by Granthan R.in 1974, Amino acid difference formula to help explain protein evolution. Science 1974 185:862-864.
 
[PolyPhenPrediction]
index=8
type=VARCHAR(20)
adj=ExtractValue('PH=', ';'), Nullify('.')
comment=Prediction of possible impact of an amino acid substitution on protein structure and function based on Polymorphism Phenotyping (PolyPhen) program.
 
[ChimpAllele]
index=8
type=CHAR(1)
adj=ExtractValue('AA=', ';'), Nullify('.')
comment=Chimp alleles are acquired from UCSC human/chimp alignment files. If the variation does not fall within an alignment block, or if it is an indel, the chimp allele is listed as "unknown". If the variation falls within a gap in the alignment, it is listed as "-".
 
[ClinicalLink]
index=8
type=VARCHAR(100)
adj=ExtractValue('CA=', ';'), Nullify('.')
comment=The potential clinical implications associated with a SNP (limited).
 
[ExomeChip]
index=8
type=VARCHAR(20)
adj=ExtractValue('EXOME_CHIP=', ';'), Nullify('.')
comment=Whether a SNP is on the Illumina HumanExome Chip
 
[DP_geno]
# Passing 'GT:DP\t0/0:64'
index=9, 10:
adj=FieldFromFormat('DP', ':')
type=INTEGER
comment=DP fields in INFO. Depth for each sample in a vcf file.


vtools init evs
vtools import ESP6500SI-V2.snps_indels.vcf.tar.gz  --format evs.fmt  -j8 --build hg19


Click to display ⇲

Click to hide ⇱

INFO: Importing variants from ESP6500SI-V2.snps_indels.vcf.tar.gz (1/1)
ESP6500SI-V2.snps_indels.vcf.tar.gz: 100% [===================] 1,467,290 13.1K/s in 00:01:51
INFO: 1,998,173 new variants (1,872,893 SNVs, 40,527 insertions, 84,784 deletions, 773 invalid) from 1,987,119 lines are imported.


vtools show fields
vtools output variant Genes EuropeanAmericanMaf chr pos FunctionGvs | gzip > EVSEA.gz
vtools output variant Genes AfricanAmericanMaf chr pos FunctionGvs | gzip > EVSAA.gz
vtools output variant Genes EuropeanAmericanRefCount EuropeanAmericanAltCount | gzip > EVSEACounts.gz
vtools output variant Genes AfricanAmericanRefCount AfricanAmericanAltCount | gzip > EVSAACounts.gz