Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ or, for local development:
pip install -e /path/to/alphapulldown-input-parser
```

The package exposes two helpers:
The package exposes three helpers:

* `parse_fold(...)` – mirrors the historical AlphaPulldown helper and performs
feature existence checks.
* `expand_fold_specification(...)` – expands a single fold string without
raising if features are missing.
* `parse_fold_chains(spec, protein_delimiter="+")` – **purely syntactic** parse
of a single spec into `(chain_name, copies, regions)` triples, with no
filesystem access or feature lookup. Useful for resource sizing or input
validation before features exist on disk.

The parser is dependency-free and works across AlphaPulldown, the Snakemake
pipeline, or any other tooling that consumes the same fold syntax.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "alphapulldown-input-parser"
version = "0.4.0"
version = "0.5.0"
description = "Fold specification parser for AlphaPulldown"
readme = "README.md"
license = {text = "MIT"}
Expand Down
2 changes: 2 additions & 0 deletions src/alphapulldown_input_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
expand_fold_specification,
generate_fold_specifications,
parse_fold,
parse_fold_chains,
)

__all__ = [
"expand_fold_specification",
"parse_fold",
"parse_fold_chains",
"FormatError",
"FeatureIndex",
"Region",
Expand Down
33 changes: 33 additions & 0 deletions src/alphapulldown_input_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,39 @@ def _parse_regions(region_tokens: Sequence[str], spec: str) -> RegionSelection:
return RegionSelection(regions=tuple(regions))


def parse_fold_chains(
    spec: str,
    protein_delimiter: str = "+",
) -> List[Tuple[str, int, RegionSelection]]:
    """Parse a single fold spec into ``(chain_name, copies, regions)`` triples.

    Purely syntactic parse — no filesystem access, no feature-index lookup —
    intended for tools that need the chain composition of a spec *before* the
    corresponding features exist on disk (e.g. for resource sizing or input
    validation). Follows the AlphaPulldown ``name[:copies][:region...]``
    convention; the chain ``name`` is returned **unchanged** (no path or
    extension stripping; the caller can normalise if needed).

    Args:
        spec: The fold specification. It is coerced with ``str()`` before
            being split, and is also handed to the copy/region helpers.
        protein_delimiter: Separator between chains inside ``spec``.

    Returns:
        One ``(name, copies, regions)`` tuple per non-empty chain, in the
        order the chains appear in ``spec``. Chains that are blank, or that
        consist only of ``:`` separators and whitespace, are skipped.

    Raises:
        ValueError: If ``protein_delimiter`` is the empty string.

    Errors raised by ``_extract_copy_and_regions`` or ``_parse_regions`` for
    malformed copy/region tokens propagate unchanged.

    Example:
        >>> parse_fold_chains("protA:2:1-100+protB", "+")
        [('protA', 2, RegionSelection(regions=(Region(start=1, end=100),))),
        ('protB', 1, RegionSelection(regions=None))]
    """
    # ``str.split("")`` would fail with a bare "empty separator" message that
    # does not tell the caller which argument is at fault; fail early instead.
    if protein_delimiter == "":
        raise ValueError("protein_delimiter must be a non-empty string")

    chains: List[Tuple[str, int, RegionSelection]] = []
    for raw_chain in str(spec).split(protein_delimiter):
        # Strip every ':'-separated token and drop the blank ones. A chain
        # that is empty or whitespace-only yields no tokens and is skipped.
        stripped = (part.strip() for part in raw_chain.split(":"))
        tokens = [token for token in stripped if token]
        if not tokens:
            continue
        # Read the name before delegating so it is taken from the token list
        # exactly as produced above.
        name = tokens[0]
        copies, region_tokens = _extract_copy_and_regions(tokens, spec)
        regions = _parse_regions(region_tokens, spec)
        chains.append((name, copies, regions))
    return chains


# ---------------------------------------------------------------------------
# Expansion logic
# ---------------------------------------------------------------------------
Expand Down
60 changes: 60 additions & 0 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
RegionSelection,
generate_fold_specifications,
parse_fold,
parse_fold_chains,
)


Expand Down Expand Up @@ -339,3 +340,62 @@ def test_generate_fold_specifications_writes_to_disk(tmp_path: Path) -> None:

assert result == ["p1+p2"]
assert output_path.read_text(encoding="utf-8") == "p1+p2\n"


# ---------------------------------------------------------------------------
# parse_fold_chains
# ---------------------------------------------------------------------------


def test_parse_fold_chains_basic_heteromer() -> None:
    """Two bare names give one chain each, with one copy and ``RegionSelection.all()``."""
    parsed = parse_fold_chains("A+B")
    expected = [(name, 1, RegionSelection.all()) for name in ("A", "B")]
    assert parsed == expected


def test_parse_fold_chains_copies() -> None:
    """A bare integer after the name is read as the copy number."""
    # Canonical form: the copy number is the second token.
    copies_only = parse_fold_chains("A:2")
    assert copies_only == [("A", 2, RegionSelection.all())]

    # Copy number followed by a region: name:copies:region.
    copies_and_region = parse_fold_chains("A:2:1-100")
    first_hundred = RegionSelection(regions=(Region(start=1, end=100),))
    assert copies_and_region == [("A", 2, first_hundred)]


def test_parse_fold_chains_region_without_copies() -> None:
    """A region token with no copy number yields a single copy."""
    # "1-100" is not a bare integer, so it is a region and copies stays 1.
    parsed = parse_fold_chains("A:1-100")
    first_hundred = RegionSelection(regions=(Region(start=1, end=100),))
    assert parsed == [("A", 1, first_hundred)]


def test_parse_fold_chains_multiple_regions_and_copies() -> None:
    """Copies plus several regions on one chain, followed by a plain chain."""
    chains = parse_fold_chains("A:2:1-100:200-300+B")

    # First chain: two copies restricted to two regions.
    head = chains[0]
    assert head[0] == "A"
    assert head[1] == 2
    both_regions = (Region(start=1, end=100), Region(start=200, end=300))
    assert head[2] == RegionSelection(regions=both_regions)

    # Second chain: bare name, single copy, no region restriction.
    assert chains[1] == ("B", 1, RegionSelection.all())


def test_parse_fold_chains_preserves_paths_and_json_names() -> None:
    """Chain names come back verbatim: directories and extensions are kept."""
    chains = parse_fold_chains("/path/to/protA_af3_input.json:2+protB.fasta")

    json_chain = chains[0]
    assert json_chain[0] == "/path/to/protA_af3_input.json"
    assert json_chain[1] == 2

    fasta_chain = chains[1]
    assert fasta_chain[0] == "protB.fasta"
    assert fasta_chain[1] == 1


def test_parse_fold_chains_custom_delimiter_and_whitespace() -> None:
    """Custom delimiters work, padding is trimmed, and empty chains are dropped."""
    padded = parse_fold_chains(" A , B ", protein_delimiter=",")
    assert padded == [
        ("A", 1, RegionSelection.all()),
        ("B", 1, RegionSelection.all()),
    ]

    # Empty tokens are skipped: the doubled delimiter adds no chain.
    doubled = parse_fold_chains("A++B")
    assert doubled == [
        ("A", 1, RegionSelection.all()),
        ("B", 1, RegionSelection.all()),
    ]