From ae4827965db6b0598b23aa3310a681256ab3b8b2 Mon Sep 17 00:00:00 2001 From: Dima Molodenskiy Date: Tue, 26 May 2026 13:02:02 +0200 Subject: [PATCH] feat: public parse_fold_chains() for pure-syntactic spec parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `parse_fold_chains(spec, protein_delimiter="+") -> List[(name, copies, regions)]`, exported from `alphapulldown_input_parser`. Pure: no filesystem access, no FeatureIndex lookup — returns chain composition and copy numbers *before* features exist on disk. Delegates to the existing internal helpers `_extract_copy_and_regions` + `_parse_regions`, so all parsing rules stay in one place. Use case: downstream tooling (e.g. AlphaPulldownSnakemake) needs to know the chain composition of a fold spec at workflow-parse time to size SLURM resources or to filter folds by total length — both run before `create_features` produces the on-disk feature index that `expand_fold_specification` / `parse_fold` require. The chain name is returned verbatim (no path/extension stripping); the caller can normalise if needed. Region tokens are parsed into `RegionSelection`, honouring the same `name[:copies][:region...]` AlphaPulldown convention. Version: 0.4.0 -> 0.5.0. README updated. 26 tests pass (6 new). Co-Authored-By: Claude Opus 4.7 --- README.md | 6 ++- pyproject.toml | 2 +- src/alphapulldown_input_parser/__init__.py | 2 + src/alphapulldown_input_parser/parser.py | 33 ++++++++++++ test/test_parser.py | 60 ++++++++++++++++++++++ 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e708413..b5469be 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,16 @@ or, for local development: pip install -e /path/to/alphapulldown-input-parser ``` -The package exposes two helpers: +The package exposes three helpers: * `parse_fold(...)` – mirrors the historical AlphaPulldown helper and performs feature existence checks. * `expand_fold_specification(...)` – expands a single fold string without raising if features are missing. +* `parse_fold_chains(spec, protein_delimiter="+")` – **pure-syntactic** parse + of a single spec into `(chain_name, copies, regions)` triples, with no + filesystem access or feature lookup. Useful for resource sizing or input + validation before features exist on disk. The parser is dependency-free and works across AlphaPulldown, the Snakemake pipeline, or any other tooling that consumes the same fold syntax. diff --git a/pyproject.toml b/pyproject.toml index c4460fa..bbbc3b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "alphapulldown-input-parser" -version = "0.4.0" +version = "0.5.0" description = "Fold specification parser for AlphaPulldown" readme = "README.md" license = {text = "MIT"} diff --git a/src/alphapulldown_input_parser/__init__.py b/src/alphapulldown_input_parser/__init__.py index ce3d429..6bed272 100644 --- a/src/alphapulldown_input_parser/__init__.py +++ b/src/alphapulldown_input_parser/__init__.py @@ -8,11 +8,13 @@ expand_fold_specification, generate_fold_specifications, parse_fold, + parse_fold_chains, ) __all__ = [ "expand_fold_specification", "parse_fold", + "parse_fold_chains", "FormatError", "FeatureIndex", "Region", diff --git a/src/alphapulldown_input_parser/parser.py b/src/alphapulldown_input_parser/parser.py index b31e447..ee79ca8 100644 --- a/src/alphapulldown_input_parser/parser.py +++ b/src/alphapulldown_input_parser/parser.py @@ -276,6 +276,39 @@ def _parse_regions(region_tokens: Sequence[str], spec: str) -> RegionSelection: return RegionSelection(regions=tuple(regions)) +def parse_fold_chains( + spec: str, + protein_delimiter: str = "+", +) -> List[Tuple[str, int, RegionSelection]]: + """Parse a single fold spec into ``(chain_name, copies, regions)`` triples. + + Pure-syntactic parse — no filesystem access, no feature-index lookup — + intended for tools that need the chain composition of a spec *before* the + corresponding features exist on disk (e.g. for resource sizing or input + validation). Follows the AlphaPulldown ``name[:copies][:region...]`` + convention; the chain ``name`` is returned **unchanged** (no path or + extension stripping; the caller can normalise if needed). + + Example: + >>> parse_fold_chains("protA:2:1-100+protB", "+") + [('protA', 2, RegionSelection(regions=(Region(start=1, end=100),))), + ('protB', 1, RegionSelection(regions=None))] + """ + chains: List[Tuple[str, int, RegionSelection]] = [] + for raw_pf in str(spec).split(protein_delimiter): + pf = raw_pf.strip() + if not pf: + continue + tokens = [token.strip() for token in pf.split(":") if token.strip()] + if not tokens: + continue + name = tokens[0] + copies, region_tokens = _extract_copy_and_regions(tokens, spec) + regions = _parse_regions(region_tokens, spec) + chains.append((name, copies, regions)) + return chains + + # --------------------------------------------------------------------------- # Expansion logic # --------------------------------------------------------------------------- diff --git a/test/test_parser.py b/test/test_parser.py index a0ebda6..e84589e 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -18,6 +18,7 @@ RegionSelection, generate_fold_specifications, parse_fold, + parse_fold_chains, ) @@ -339,3 +340,62 @@ def test_generate_fold_specifications_writes_to_disk(tmp_path: Path) -> None: assert result == ["p1+p2"] assert output_path.read_text(encoding="utf-8") == "p1+p2\n" + + +# --------------------------------------------------------------------------- +# parse_fold_chains +# --------------------------------------------------------------------------- + + +def test_parse_fold_chains_basic_heteromer() -> None: + assert parse_fold_chains("A+B") == [ + ("A", 1, RegionSelection.all()), + ("B", 1, RegionSelection.all()), + ] + + +def test_parse_fold_chains_copies() -> None: + # copy number as the second token (canonical form) + assert parse_fold_chains("A:2") == [("A", 2, RegionSelection.all())] + # copy + region: name:copies:region + assert parse_fold_chains("A:2:1-100") == [ + ("A", 2, RegionSelection(regions=(Region(start=1, end=100),))), + ] + + +def test_parse_fold_chains_region_without_copies() -> None: + # A region alone (not a bare integer) implies a single copy + assert parse_fold_chains("A:1-100") == [ + ("A", 1, RegionSelection(regions=(Region(start=1, end=100),))), + ] + + +def test_parse_fold_chains_multiple_regions_and_copies() -> None: + chains = parse_fold_chains("A:2:1-100:200-300+B") + assert chains[0][0] == "A" + assert chains[0][1] == 2 + assert chains[0][2] == RegionSelection( + regions=(Region(start=1, end=100), Region(start=200, end=300)) + ) + assert chains[1] == ("B", 1, RegionSelection.all()) + + +def test_parse_fold_chains_preserves_paths_and_json_names() -> None: + """Names are returned verbatim — no extension or path stripping.""" + chains = parse_fold_chains("/path/to/protA_af3_input.json:2+protB.fasta") + assert chains[0][0] == "/path/to/protA_af3_input.json" + assert chains[0][1] == 2 + assert chains[1][0] == "protB.fasta" + assert chains[1][1] == 1 + + +def test_parse_fold_chains_custom_delimiter_and_whitespace() -> None: + assert parse_fold_chains(" A , B ", protein_delimiter=",") == [ + ("A", 1, RegionSelection.all()), + ("B", 1, RegionSelection.all()), + ] + # empty tokens are skipped + assert parse_fold_chains("A++B") == [ + ("A", 1, RegionSelection.all()), + ("B", 1, RegionSelection.all()), + ]