From a4cb963d4b60b377aa934e4ab184df90618e9085 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 28 Apr 2026 23:21:09 +0300 Subject: [PATCH] =?UTF-8?q?feat:=20Milestone=209=20=E2=80=94=20exposure=20?= =?UTF-8?q?filtering=20layer=20(student=5Fpublic=20/=20research=5Finstruct?= =?UTF-8?q?or)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .agent-plan.md | 23 ++-- leadforge/api/bundle.py | 12 +- leadforge/exposure/filters.py | 45 ++++++++ leadforge/exposure/modes.py | 36 ++++++ leadforge/exposure/redaction.py | 79 +++++++++++++ tests/exposure/__init__.py | 0 tests/exposure/test_exposure.py | 193 ++++++++++++++++++++++++++++++++ 7 files changed, 378 insertions(+), 10 deletions(-) create mode 100644 leadforge/exposure/filters.py create mode 100644 leadforge/exposure/modes.py create mode 100644 leadforge/exposure/redaction.py create mode 100644 tests/exposure/__init__.py create mode 100644 tests/exposure/test_exposure.py diff --git a/.agent-plan.md b/.agent-plan.md index 04dc996..ffbc469 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -6,19 +6,19 @@ ## Current System State -**v0.4.0 in progress — Milestones 7–8 complete (PRs open).** Full simulation engine + render/bundle -layer implemented. 521 tests passing. +**v0.4.0 in progress — Milestones 7–9 complete (PR open).** Full simulation engine + render/bundle +layer + exposure filtering implemented. 545 tests passing. --- -## Next Up — Milestone 9: Exposure Filtering (v0.4.0) +## Next Up — Milestone 10: CLI `generate` command + `inspect` / `validate` stubs (v0.4.0) -Goal: Apply `student_public` / `research_instructor` exposure-mode filtering during bundle write. +Goal: Wire `leadforge generate` CLI command end-to-end; implement `inspect` and `validate` output. -- [ ] `exposure/modes.py` — `ExposureMode`-aware filter dispatch -- [ ] `exposure/filters.py` — column/table redaction rules per mode -- [ ] `exposure/redaction.py` — latent-column scrubbing for `student_public` -- [ ] Wire into `api/bundle.py` write pipeline +- [ ] `cli/commands/generate.py` — parse flags, call `Generator.from_recipe().generate()`, call `.save()` +- [ ] `cli/commands/inspect.py` — print manifest summary for a written bundle +- [ ] `cli/commands/validate.py` — basic schema / FK / leakage checks on a written bundle +- [ ] Tests for each command --- @@ -32,6 +32,13 @@ Goal: Apply `student_public` / `research_instructor` exposure-mode filtering dur ## Completed Phases +### Milestone 9 — Exposure Filtering ✓ (v0.4.0 in PR) +- `exposure/filters.py`: `BundleFilter` frozen dataclass; `FILTERS` dict keyed by `ExposureMode`; `get_filter()` +- `exposure/redaction.py`: `write_metadata_dir()` — writes `metadata/` with `graph.json`, `graph.graphml`, `world_spec.json`, `latent_registry.json`, `mechanism_summary.json` +- `exposure/modes.py`: `apply_exposure(bundle, root, mode)` — dispatch; skips `metadata/` for `student_public` +- Wired into `api/bundle.py` between dataset card and manifest steps +- 24 new tests; total 545 passing + ### Milestone 8 — Render / Bundle Layer ✓ (v0.4.0 in PR) - `render/relational.py`: `to_dataframes()` — 9-table dict of typed DataFrames from SimulationResult + PopulationResult - `render/snapshots.py`: `build_snapshot()` — 30-column leakage-free lead snapshot with touch/session/activity aggregates, account/contact field joins diff --git a/leadforge/api/bundle.py b/leadforge/api/bundle.py index 2cfdb4c..5874596 100644 --- a/leadforge/api/bundle.py +++ b/leadforge/api/bundle.py @@ -6,7 +6,9 @@ 1. Write relational Parquet tables (``tables/``). 2. Build the lead snapshot and write task splits (``tasks/``). 3. Write ``dataset_card.md`` and ``feature_dictionary.csv``. -4. Build and write ``manifest.json``. +4. Apply exposure filtering — write ``metadata/`` for ``research_instructor`` + mode; skip it for ``student_public``. +5. Build and write ``manifest.json``. """ from __future__ import annotations @@ -14,6 +16,7 @@ from pathlib import Path from typing import TYPE_CHECKING +from leadforge.exposure.modes import apply_exposure from leadforge.narrative.dataset_card import render_dataset_card from leadforge.render.manifests import build_manifest, write_manifest from leadforge.render.relational import to_dataframes @@ -74,7 +77,12 @@ def write_bundle(bundle: WorldBundle, path: str) -> None: write_feature_dictionary(root / "feature_dictionary.csv") # ------------------------------------------------------------------ - # 4. Manifest + # 4. Exposure metadata (research_instructor only) + # ------------------------------------------------------------------ + apply_exposure(bundle, root, config.exposure_mode) + + # ------------------------------------------------------------------ + # 5. Manifest # ------------------------------------------------------------------ manifest = build_manifest( config=config, diff --git a/leadforge/exposure/filters.py b/leadforge/exposure/filters.py new file mode 100644 index 0000000..cbcab5c --- /dev/null +++ b/leadforge/exposure/filters.py @@ -0,0 +1,45 @@ +"""Per-mode bundle filter rules. + +:data:`FILTERS` maps every :class:`~leadforge.core.enums.ExposureMode` to a +:class:`BundleFilter` that governs which artefacts are written when +:func:`~leadforge.api.bundle.write_bundle` produces an output bundle. + +Adding a new mode: define its ``BundleFilter`` entry in ``FILTERS``. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from leadforge.core.enums import ExposureMode + + +@dataclass(frozen=True) +class BundleFilter: + """Rules that govern bundle publication for one :class:`ExposureMode`. + + Attributes: + write_metadata: Whether to create ``metadata/`` with hidden-truth + files (``graph.json``, ``graph.graphml``, ``world_spec.json``, + ``latent_registry.json``, ``mechanism_summary.json``). + """ + + write_metadata: bool + + +#: Canonical filter rules for every supported exposure mode. +FILTERS: dict[ExposureMode, BundleFilter] = { + ExposureMode.student_public: BundleFilter(write_metadata=False), + ExposureMode.research_instructor: BundleFilter(write_metadata=True), +} + + +def get_filter(mode: ExposureMode) -> BundleFilter: + """Return the :class:`BundleFilter` for *mode*. + + Raises: + KeyError: if *mode* has no registered filter (should never happen + with well-typed callers, but guards against future enum additions + that forget to update ``FILTERS``). + """ + return FILTERS[mode] diff --git a/leadforge/exposure/modes.py b/leadforge/exposure/modes.py new file mode 100644 index 0000000..97d5ce6 --- /dev/null +++ b/leadforge/exposure/modes.py @@ -0,0 +1,36 @@ +"""Exposure-mode dispatch for bundle publication. + +:func:`apply_exposure` is the single entry point called by +:func:`~leadforge.api.bundle.write_bundle`. It reads the resolved +:class:`~leadforge.exposure.filters.BundleFilter` for the requested mode +and performs the corresponding writes (or skips them). +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +from leadforge.core.enums import ExposureMode +from leadforge.exposure.filters import get_filter +from leadforge.exposure.redaction import write_metadata_dir + +if TYPE_CHECKING: + from leadforge.core.models import WorldBundle + + +def apply_exposure(bundle: WorldBundle, bundle_root: Path, mode: ExposureMode) -> None: + """Apply exposure filtering for *mode* to the bundle at *bundle_root*. + + For ``research_instructor`` mode this writes the ``metadata/`` + directory with all hidden-truth files. For ``student_public`` mode the + directory is not created and no hidden truth is published. + + Args: + bundle: Fully populated :class:`~leadforge.core.models.WorldBundle`. + bundle_root: Root directory of the written bundle (must already exist). + mode: Exposure mode that controls which artefacts are published. + """ + filt = get_filter(mode) + if filt.write_metadata: + write_metadata_dir(bundle, bundle_root) diff --git a/leadforge/exposure/redaction.py b/leadforge/exposure/redaction.py new file mode 100644 index 0000000..13bde34 --- /dev/null +++ b/leadforge/exposure/redaction.py @@ -0,0 +1,79 @@ +"""Write hidden-truth metadata files for ``research_instructor`` mode. + +:func:`write_metadata_dir` creates ``bundle_root/metadata/`` and populates +it with five files that expose the full hidden world: + +- ``graph.json`` — world graph as JSON (nodes, edges, motif family) +- ``graph.graphml`` — world graph as GraphML for graph tools +- ``world_spec.json`` — generation config + narrative spec +- ``latent_registry.json`` — per-entity latent trait values +- ``mechanism_summary.json`` — mechanism assignment summary +""" + +from __future__ import annotations + +import dataclasses +import json +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from leadforge.core.models import WorldBundle + + +def write_metadata_dir(bundle: WorldBundle, bundle_root: Path) -> None: + """Populate ``bundle_root/metadata/`` with all hidden-truth files. + + Args: + bundle: Fully populated :class:`~leadforge.core.models.WorldBundle`. + bundle_root: Root directory of the written bundle. + """ + from leadforge.core.rng import RNGRoot + from leadforge.mechanisms.policies import assign_mechanisms + + # Callers must only invoke this after full bundle assembly; world_graph + # and population are guaranteed non-None at that point. + assert bundle.world_graph is not None # noqa: S101 + assert bundle.population is not None # noqa: S101 + + meta_dir = bundle_root / "metadata" + meta_dir.mkdir(exist_ok=True) + + # ------------------------------------------------------------------ + # graph.json + graph.graphml + # ------------------------------------------------------------------ + (meta_dir / "graph.json").write_text(bundle.world_graph.to_json()) + (meta_dir / "graph.graphml").write_text(bundle.world_graph.to_graphml()) + + # ------------------------------------------------------------------ + # latent_registry.json + # ------------------------------------------------------------------ + ls = bundle.population.latent_state + latent_registry: dict[str, object] = { + "account_latents": ls.account_latents, + "contact_latents": ls.contact_latents, + "lead_latents": ls.lead_latents, + } + (meta_dir / "latent_registry.json").write_text(json.dumps(latent_registry, indent=2)) + + # ------------------------------------------------------------------ + # world_spec.json — config + narrative (if present) + # ------------------------------------------------------------------ + config_dict = dataclasses.asdict(bundle.spec.config) + narrative_dict = ( + dataclasses.asdict(bundle.spec.narrative) if bundle.spec.narrative is not None else None + ) + world_spec_dict = {"config": config_dict, "narrative": narrative_dict} + (meta_dir / "world_spec.json").write_text(json.dumps(world_spec_dict, indent=2)) + + # ------------------------------------------------------------------ + # mechanism_summary.json + # ------------------------------------------------------------------ + # Reconstruct the mechanism assignment with the same RNG substream that + # was used during simulation — produces the identical parameter values. + motif_family = bundle.world_graph.motif_family + mech_rng = RNGRoot(bundle.spec.config.seed).child("mechanisms") + assignment = assign_mechanisms(motif_family, mech_rng) + (meta_dir / "mechanism_summary.json").write_text( + json.dumps(assignment.summary().to_dict(), indent=2) + ) diff --git a/tests/exposure/__init__.py b/tests/exposure/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/exposure/test_exposure.py b/tests/exposure/test_exposure.py new file mode 100644 index 0000000..0c55bb2 --- /dev/null +++ b/tests/exposure/test_exposure.py @@ -0,0 +1,193 @@ +"""Tests for leadforge.exposure — ExposureMode filtering and metadata writes.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from leadforge.api.generator import Generator +from leadforge.core.enums import ExposureMode +from leadforge.exposure.filters import FILTERS, BundleFilter, get_filter + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_SMALL_GENERATE_KWARGS: dict[str, int] = {"n_leads": 30, "n_accounts": 15, "n_contacts": 45} + + +def _make_bundle(mode: str, seed: int = 42): + gen = Generator.from_recipe("b2b_saas_procurement_v1", seed=seed, exposure_mode=mode) + return gen.generate(**_SMALL_GENERATE_KWARGS) + + +# --------------------------------------------------------------------------- +# Unit tests — BundleFilter / FILTERS +# --------------------------------------------------------------------------- + + +class TestFilters: + def test_all_modes_have_filter(self) -> None: + for mode in ExposureMode: + assert mode in FILTERS, f"{mode!r} has no entry in FILTERS" + + def test_student_public_no_metadata(self) -> None: + f = get_filter(ExposureMode.student_public) + assert isinstance(f, BundleFilter) + assert f.write_metadata is False + + def test_research_instructor_writes_metadata(self) -> None: + f = get_filter(ExposureMode.research_instructor) + assert f.write_metadata is True + + def test_unknown_mode_raises(self) -> None: + """get_filter must raise KeyError for an unregistered mode string.""" + with pytest.raises(KeyError): + get_filter("totally_fake_mode") # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# Integration tests — write_bundle via WorldBundle.save +# --------------------------------------------------------------------------- + + +class TestStudentPublicMode: + def test_no_metadata_dir(self, tmp_path: Path) -> None: + bundle = _make_bundle("student_public") + bundle.save(str(tmp_path)) + assert not (tmp_path / "metadata").exists() + + def test_core_files_present(self, tmp_path: Path) -> None: + bundle = _make_bundle("student_public") + bundle.save(str(tmp_path)) + assert (tmp_path / "manifest.json").exists() + assert (tmp_path / "dataset_card.md").exists() + assert (tmp_path / "feature_dictionary.csv").exists() + assert (tmp_path / "tables").is_dir() + assert (tmp_path / "tasks").is_dir() + + +class TestResearchInstructorMode: + def test_metadata_dir_created(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + assert (tmp_path / "metadata").is_dir() + + def test_all_metadata_files_present(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + meta = tmp_path / "metadata" + for fname in ( + "graph.json", + "graph.graphml", + "world_spec.json", + "latent_registry.json", + "mechanism_summary.json", + ): + assert (meta / fname).exists(), f"Missing metadata file: {fname}" + + def test_graph_json_valid(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + data = json.loads((tmp_path / "metadata" / "graph.json").read_text()) + assert "nodes" in data + assert "edges" in data + assert "motif_family" in data + + def test_graph_graphml_valid_xml(self, tmp_path: Path) -> None: + import xml.etree.ElementTree as ET # stdlib + + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + text = (tmp_path / "metadata" / "graph.graphml").read_text() + # Must parse without error. + ET.fromstring(text) # noqa: S314 — bundle data we generated, not external input + + def test_latent_registry_keys(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + data = json.loads((tmp_path / "metadata" / "latent_registry.json").read_text()) + assert set(data.keys()) == {"account_latents", "contact_latents", "lead_latents"} + + def test_latent_registry_populated(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + data = json.loads((tmp_path / "metadata" / "latent_registry.json").read_text()) + # Each registry should be non-empty. + assert len(data["account_latents"]) > 0 + assert len(data["contact_latents"]) > 0 + assert len(data["lead_latents"]) > 0 + + def test_latent_registry_values_in_unit_interval(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + data = json.loads((tmp_path / "metadata" / "latent_registry.json").read_text()) + for registry_key in ("account_latents", "contact_latents", "lead_latents"): + for entity_id, traits in data[registry_key].items(): + for trait_name, value in traits.items(): + assert 0.0 <= value <= 1.0, ( + f"{registry_key}[{entity_id!r}][{trait_name!r}] = {value} out of [0, 1]" + ) + + def test_world_spec_json_keys(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + data = json.loads((tmp_path / "metadata" / "world_spec.json").read_text()) + assert "config" in data + assert "narrative" in data + + def test_world_spec_config_matches_bundle(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor", seed=77) + bundle.save(str(tmp_path)) + data = json.loads((tmp_path / "metadata" / "world_spec.json").read_text()) + assert data["config"]["seed"] == 77 + assert data["config"]["recipe_id"] == "b2b_saas_procurement_v1" + + def test_mechanism_summary_keys(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + data = json.loads((tmp_path / "metadata" / "mechanism_summary.json").read_text()) + assert "motif_family" in data + assert "conversion_hazard" in data + assert "stage_transition" in data + assert "touch_intensity" in data + assert "measurement" in data + + def test_mechanism_summary_motif_matches_graph(self, tmp_path: Path) -> None: + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + graph_data = json.loads((tmp_path / "metadata" / "graph.json").read_text()) + mech_data = json.loads((tmp_path / "metadata" / "mechanism_summary.json").read_text()) + assert graph_data["motif_family"] == mech_data["motif_family"] + + def test_core_files_still_present(self, tmp_path: Path) -> None: + """Metadata write must not replace or skip the standard bundle files.""" + bundle = _make_bundle("research_instructor") + bundle.save(str(tmp_path)) + assert (tmp_path / "manifest.json").exists() + assert (tmp_path / "dataset_card.md").exists() + assert (tmp_path / "feature_dictionary.csv").exists() + assert (tmp_path / "tables").is_dir() + assert (tmp_path / "tasks").is_dir() + + +class TestModeDeterminism: + def test_same_seed_same_latent_registry(self, tmp_path: Path) -> None: + p1 = tmp_path / "run1" + p2 = tmp_path / "run2" + _make_bundle("research_instructor", seed=42).save(str(p1)) + _make_bundle("research_instructor", seed=42).save(str(p2)) + d1 = json.loads((p1 / "metadata" / "latent_registry.json").read_text()) + d2 = json.loads((p2 / "metadata" / "latent_registry.json").read_text()) + assert d1 == d2 + + def test_different_seeds_different_latent_registries(self, tmp_path: Path) -> None: + p1 = tmp_path / "run1" + p2 = tmp_path / "run2" + _make_bundle("research_instructor", seed=1).save(str(p1)) + _make_bundle("research_instructor", seed=2).save(str(p2)) + d1 = json.loads((p1 / "metadata" / "latent_registry.json").read_text()) + d2 = json.loads((p2 / "metadata" / "latent_registry.json").read_text()) + assert d1 != d2