Skip to content

Commit 130b06f

Browse files
committed
fix: handle AN=0
1 parent 1fd5c4b commit 130b06f

3 files changed

Lines changed: 75 additions & 1 deletion

File tree

src/anyvlm/functions/ingest_vcf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,11 +106,15 @@ def ingest_vcf(
106106
for variant_id, af in zip(variant_ids, afs, strict=True):
107107
if variant_id is None:
108108
continue
109+
try:
110+
allele_frequency = af.ac / af.an
111+
except ZeroDivisionError:
112+
continue
109113
caf = AnyVlmCohortAlleleFrequencyResult(
110114
focusAllele=iriReference(variant_id),
111115
focusAlleleCount=af.ac,
112116
locusAlleleCount=af.an,
113-
focusAlleleFrequency=af.ac / af.an,
117+
focusAlleleFrequency=allele_frequency,
114118
qualityMeasures=QualityMeasures(qcFilters=af.filters),
115119
ancillaryResults=AncillaryResults(
116120
heterozygotes=af.ac_het,

tests/data/vcf/vcf_an_0.vcf

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##5UTR_annotation=Variant annotation from UTRAnnotator
4+
##5UTR_consequence=Variant consequence from UTRAnnotator
5+
##CADD_PHRED=PHRED-like scaled CADD score. CADD is only available here for non-commercial use. See CADD website for more information.
6+
##CADD_RAW=Raw CADD score. CADD is only available here for non-commercial use. See CADD website for more information.
7+
##Existing_InFrame_oORFs=The number of existing inFrame overlapping ORFs (inFrame oORF) at the 5 prime UTR
8+
##Existing_OutOfFrame_oORFs=The number of existing out-of-frame overlapping ORFs (OutOfFrame oORF) at the 5 prime UTR
9+
##Existing_uORFs=The number of existing uORFs with a stop codon within the 5 prime UTR
10+
##FILTER=<ID=EXCESS_ALLELES,Description="Site has an excess of alternate alleles based on the input threshold">
11+
##FILTER=<ID=ExcessHet,Description="Site has excess het value larger than the threshold">
12+
##FILTER=<ID=LowQual,Description="Low quality">
13+
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
14+
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
15+
##FORMAT=<ID=FT,Number=.,Type=String,Description="Genotype-level filter">
16+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
17+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
18+
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
19+
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
20+
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phasing set (typically the position of the first variant in the set)">
21+
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
22+
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
23+
##INFO=<ID=AC_Hemi,Number=A,Type=Integer,Description="Allele counts in hemizygous genotypes">
24+
##INFO=<ID=AC_Het,Number=A,Type=Integer,Description="Allele counts in heterozygous genotypes">
25+
##INFO=<ID=AC_Hom,Number=A,Type=Integer,Description="Allele counts in homozygous genotypes">
26+
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
27+
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
28+
##INFO=<ID=AS_QUALapprox,Number=1,Type=String,Description="Allele-specific QUAL approximations">
29+
##INFO=<ID=CALIBRATION_SENSITIVITY,Number=A,Type=String,Description="Calibration sensitivity corresponding to the value of SCORE">
30+
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|GENE_PHENO|NEAREST|HGVS_OFFSET|AF|CLIN_SIG|SOMATIC|PHENO|REVEL|SpliceRegion|CADD_PHRED|CADD_RAW|5UTR_annotation|5UTR_consequence|Existing_InFrame_oORFs|Existing_OutOfFrame_oORFs|Existing_uORFs|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL">
31+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
32+
##INFO=<ID=F_MISSING,Number=.,Type=Float,Description="Added by +fill-tags expression F_MISSING=F_MISSING">
33+
##INFO=<ID=OLD_MULTIALLELIC,Number=1,Type=String,Description="Original chr:pos:ref:alt encoding">
34+
##INFO=<ID=OLD_VARIANT,Number=.,Type=String,Description="Original chr:pos:ref:alt encoding">
35+
##INFO=<ID=QUALapprox,Number=1,Type=Integer,Description="Sum of PL[0] values; used to approximate the QUAL score">
36+
##INFO=<ID=SCORE,Number=A,Type=String,Description="Score according to the model applied by ScoreVariantAnnotations">
37+
##INFO=<ID=TYPE,Number=.,Type=String,Description="Variant type">
38+
##REVEL=Rare Exome Variant Ensemble Learner
39+
##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain
40+
##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss
41+
##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain
42+
##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss
43+
##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain
44+
##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss
45+
##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain
46+
##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss
47+
##SpliceAI_pred_SYMBOL=SpliceAI gene symbol
48+
##SpliceRegion=SpliceRegion predictions
49+
##contig=<ID=chr14,length=107043718>
50+
##high_CALIBRATION_SENSITIVITY_INDEL=Sample Genotype FT filter value indicating that the genotyped allele failed INDEL model calibration sensitivity cutoff (0.99)
51+
##high_CALIBRATION_SENSITIVITY_SNP=Sample Genotype FT filter value indicating that the genotyped allele failed SNP model calibration sensitivity cutoff (0.997)
52+
##source=SelectVariants
53+
##INFO=<ID=VRS_Allele_IDs,Number=R,Type=String,Description="The computed identifiers for the GA4GH VRS Alleles corresponding to the GT indexes of the REF and ALT alleles">
54+
##INFO=<ID=VRS_Error,Number=.,Type=String,Description="If an error occurred computing a VRS Identifier, the error message">
55+
##INFO=<ID=VRS_Starts,Number=R,Type=String,Description="Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the GT indexes of the REF and ALT alleles">
56+
##INFO=<ID=VRS_Ends,Number=R,Type=String,Description="Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the GT indexes of the REF and ALT alleles">
57+
##INFO=<ID=VRS_States,Number=R,Type=String,Description="The literal sequence states used for the GA4GH VRS Alleles corresponding to the GT indexes of the REF and ALT alleles">
58+
#CHROM POS ID REF ALT QUAL FILTER INFO
59+
chr14 18223529 . C A . LowQual;NO_HQ_GENOTYPES AC=0;AC_Hemi=0;AC_Het=0;AC_Hom=0;AF=0.00;AN=0;AS_QUALapprox=0|55;CALIBRATION_SENSITIVITY=.;CSQ=A|intergenic_variant|MODIFIER|||||||||||||||rs1294420531||||||||OR11H12||||||||3.503|0.321010||||||||||||||;F_MISSING=0.23692;QUALapprox=55;SCORE=.;TYPE=SNP;VRS_Allele_IDs=ga4gh:VA.8OSPHYmhyg9hJTpFQ8aNcmLgYMR77ZyJ,ga4gh:VA.slgr2fnRKaUnQrJZvYNDGMrfZHw6QCr6;VRS_Starts=18223528,18223528;VRS_Ends=18223529,18223529;VRS_States=C,A

tests/unit/functions/test_ingest_vcf.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,14 @@ def test_ingest_vcf_infocol_missing(
115115
stub_anyvar_client,
116116
postgres_storage,
117117
)
118+
119+
120+
def test_ingest_vcf_an_zero(
121+
stub_anyvar_client: BaseAnyVarClient, test_data_dir: Path, postgres_storage: Storage
122+
):
123+
"""Test smooth handling of a VCF record whose AN (total allele number) is 0"""
124+
ingest_vcf(
125+
test_data_dir / "vcf" / "vcf_an_0.vcf",
126+
stub_anyvar_client,
127+
postgres_storage,
128+
)

0 commit comments

Comments
 (0)