Skip to content

Commit 7ab1e1e

Browse files
committed
implement _to_gnomad translator
1 parent 596b07c commit 7ab1e1e

1 file changed

Lines changed: 82 additions & 5 deletions

File tree

src/ga4gh/vrs/extras/translator.py

Lines changed: 82 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -188,6 +188,7 @@ def __init__(
188188
}
189189

190190
self.to_translators = {
191+
"gnomad": self._to_gnomad,
191192
"hgvs": self._to_hgvs,
192193
"spdi": self._to_spdi,
193194
}
@@ -416,6 +417,57 @@ def _from_spdi(self, spdi_expr: str, **kwargs) -> models.Allele | None:
416417

417418
return self._create_allele(values, **kwargs)
418419

420+
def _to_gnomad(
    self, vo: models.Allele, namespace: str | None = "refseq", **kwargs
) -> list[str]:
    """Generate a *list* of location expressions for a VRS Allele.

    One expression is built per alias that the data proxy returns for the
    Allele's reference sequence in ``namespace``. If no alias translations
    are available, an empty list is returned.

    NOTE(review): another ``_to_gnomad`` is defined later in this file. If
    both live in the same class, the later definition replaces this one and
    this body is dead code -- confirm and delete.

    NOTE(review): the strings built here are colon-delimited
    (``alias:start:ref:alt``) and use the 0-based ``vo.location.start``
    unchanged; this looks like a copy of the SPDI translator rather than a
    gnomAD-style (``chrom-pos-ref-alt``) formatter -- confirm.

    NOTE(review): nothing in this body raises ``ValueError`` explicitly;
    any exception would come from the helpers it calls.

    :param vo: Allele to translate; ``vo.location`` and ``vo.state`` are read.
    :param namespace: namespace passed to
        ``data_proxy.translate_sequence_identifier`` (default ``"refseq"``).
    :param kwargs: ``ref_seq_limit`` (default ``0``). When it is not ``None``
        and the reference span is longer than the limit, the reference
        component is emitted as its length instead of the sequence itself.
    :return: list of expression strings, one per alias.
    """
    sequence = f"ga4gh:{vo.location.get_refget_accession()}"
    aliases = self.data_proxy.translate_sequence_identifier(sequence, namespace)
    # Keep only the part after the first ":" of each alias (drops the
    # namespace prefix).
    aliases = [a.split(":")[1] for a in aliases]
    seq_proxies = {a: SequenceProxy(self.data_proxy, a) for a in aliases}
    start, end = vo.location.start, vo.location.end
    spdi_exprs = []

    for alias in aliases:
        # Get the reference sequence
        seq_proxy = seq_proxies[alias]
        ref_seq = seq_proxy[start:end]

        if vo.state.type == models.VrsType.REF_LEN_EXPR.value:
            # Derived from reference. sequence included if under limit, but
            # we can derive it again from the reference.
            alt_seq = denormalize_reference_length_expression(
                ref_seq=ref_seq,
                repeat_subunit_length=vo.state.repeatSubunitLength,
                alt_length=vo.state.length,
            )
            # Warn if the derived sequence is different from the one in the object
            if vo.state.sequence and vo.state.sequence.root != alt_seq:
                _logger.warning(
                    "Derived sequence '%s' is different from provided state.sequence '%s'",
                    alt_seq,
                    vo.state.sequence.root,
                )
        else:
            # Any other state type: use the sequence stored on the state.
            alt_seq = vo.state.sequence.root

        # Optionally allow using the length of the reference sequence
        # instead of the sequence itself.
        # With the default limit of 0, any non-empty reference span is
        # replaced by its integer length.
        ref_seq_limit = kwargs.get("ref_seq_limit", 0)
        if ref_seq_limit is not None and len(ref_seq) > int(ref_seq_limit):
            ref_seq = len(ref_seq)

        spdi_expr = f"{alias}:{start}:{ref_seq}:{alt_seq}"
        spdi_exprs.append(spdi_expr)

    return spdi_exprs
470+
419471
def _to_hgvs(
420472
self,
421473
vo: models.Allele,
@@ -424,6 +476,25 @@ def _to_hgvs(
424476
) -> list[str]:
425477
return self.hgvs_tools.from_allele(vo, namespace)
426478

479+
def _to_gnomad(
480+
self,
481+
vo: models.Allele,
482+
namespace: str | None = None,
483+
**kwargs, # noqa: ARG002
484+
) -> list[str]:
485+
"""Generate a *list* of gnomAD-style identifiers for VRS Allele.
486+
487+
If no alias translations are available, an empty list is returned.
488+
489+
If the VRS object cannot be expressed in gnomAD-style, raises ValueError.
490+
"""
491+
namespace = namespace or self.default_assembly_name
492+
if not namespace.startswith('GRCh'):
493+
raise ValueError(f"gnomAD-style identifiers require a GRCh reference sequence namespace, but got '{namespace}'")
494+
return self._to_location_expression(
495+
"{alias}-{start}-{ref_seq}-{alt_seq}", vo, namespace,
496+
)
497+
427498
def _to_spdi(
428499
self, vo: models.Allele, namespace: str | None = "refseq", **kwargs
429500
) -> list[str]:
@@ -450,12 +521,20 @@ def _to_spdi(
450521
SPDI and VRS use identical normalization. The incoming Allele
451522
is expected to be normalized per VRS spec.
452523
"""
524+
ref_seq_limit = kwargs.get("ref_seq_limit", 0)
525+
return self._to_location_expression(
526+
"{alias}:{start}:{ref_seq}:{alt_seq}", vo, namespace, ref_seq_limi=ref_seq_limit,
527+
)
528+
529+
def _to_location_expression(
530+
self, id_template: str, vo: models.Allele, namespace: str | None , ref_seq_limit: int | None = None,
531+
) -> list[str]:
453532
sequence = f"ga4gh:{vo.location.get_refget_accession()}"
454533
aliases = self.data_proxy.translate_sequence_identifier(sequence, namespace)
455534
aliases = [a.split(":")[1] for a in aliases]
456535
seq_proxies = {a: SequenceProxy(self.data_proxy, a) for a in aliases}
457536
start, end = vo.location.start, vo.location.end
458-
spdi_exprs = []
537+
exprs = []
459538

460539
for alias in aliases:
461540
# Get the reference sequence
@@ -482,14 +561,12 @@ def _to_spdi(
482561

483562
# Optionally allow using the length of the reference sequence
484563
# instead of the sequence itself.
485-
ref_seq_limit = kwargs.get("ref_seq_limit", 0)
486564
if ref_seq_limit is not None and len(ref_seq) > int(ref_seq_limit):
487565
ref_seq = len(ref_seq)
488566

489-
spdi_expr = f"{alias}:{start}:{ref_seq}:{alt_seq}"
490-
spdi_exprs.append(spdi_expr)
567+
exprs.append(id_template.format(alias=alias, start=start, ref_seq=ref_seq, alt_seq=alt_seq))
491568

492-
return spdi_exprs
569+
return exprs
493570

494571
def _post_process_imported_allele(
495572
self, allele: models.Allele, **kwargs

0 commit comments

Comments
 (0)