From a4cb963d4b60b377aa934e4ab184df90618e9085 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Tue, 28 Apr 2026 23:21:09 +0300
Subject: [PATCH] =?UTF-8?q?feat:=20Milestone=209=20=E2=80=94=20exposure=20?=
 =?UTF-8?q?filtering=20layer=20(student=5Fpublic=20/=20research=5Finstruct?=
 =?UTF-8?q?or)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .agent-plan.md                  |  23 ++--
 leadforge/api/bundle.py         |  12 +-
 leadforge/exposure/filters.py   |  45 ++++++++
 leadforge/exposure/modes.py     |  36 ++++++
 leadforge/exposure/redaction.py |  79 +++++++++++++
 tests/exposure/__init__.py      |   0
 tests/exposure/test_exposure.py | 193 ++++++++++++++++++++++++++++++++
 7 files changed, 378 insertions(+), 10 deletions(-)
 create mode 100644 leadforge/exposure/filters.py
 create mode 100644 leadforge/exposure/modes.py
 create mode 100644 leadforge/exposure/redaction.py
 create mode 100644 tests/exposure/__init__.py
 create mode 100644 tests/exposure/test_exposure.py

diff --git a/.agent-plan.md b/.agent-plan.md
index 04dc996..ffbc469 100644
--- a/.agent-plan.md
+++ b/.agent-plan.md
@@ -6,19 +6,19 @@
 
 ## Current System State
 
-**v0.4.0 in progress — Milestones 7–8 complete (PRs open).** Full simulation engine + render/bundle
-layer implemented. 521 tests passing.
+**v0.4.0 in progress — Milestones 7–9 complete (PR open).** Full simulation engine + render/bundle
+layer + exposure filtering implemented. 545 tests passing.
 
 ---
 
-## Next Up — Milestone 9: Exposure Filtering (v0.4.0)
+## Next Up — Milestone 10: CLI `generate` command + `inspect` / `validate` stubs (v0.4.0)
 
-Goal: Apply `student_public` / `research_instructor` exposure-mode filtering during bundle write.
+Goal: Wire `leadforge generate` CLI command end-to-end; implement `inspect` and `validate` output.
 
-- [ ] `exposure/modes.py` — `ExposureMode`-aware filter dispatch
-- [ ] `exposure/filters.py` — column/table redaction rules per mode
-- [ ] `exposure/redaction.py` — latent-column scrubbing for `student_public`
-- [ ] Wire into `api/bundle.py` write pipeline
+- [ ] `cli/commands/generate.py` — parse flags, call `Generator.from_recipe().generate()`, call `.save()`
+- [ ] `cli/commands/inspect.py` — print manifest summary for a written bundle
+- [ ] `cli/commands/validate.py` — basic schema / FK / leakage checks on a written bundle
+- [ ] Tests for each command
 
 ---
 
@@ -32,6 +32,13 @@ Goal: Apply `student_public` / `research_instructor` exposure-mode filtering dur
 
 ## Completed Phases
 
+### Milestone 9 — Exposure Filtering ✓ (v0.4.0 in PR)
+- `exposure/filters.py`: `BundleFilter` frozen dataclass; `FILTERS` dict keyed by `ExposureMode`; `get_filter()`
+- `exposure/redaction.py`: `write_metadata_dir()` — writes `metadata/` with `graph.json`, `graph.graphml`, `world_spec.json`, `latent_registry.json`, `mechanism_summary.json`
+- `exposure/modes.py`: `apply_exposure(bundle, bundle_root, mode)` — dispatch; skips `metadata/` for `student_public`
+- Wired into `api/bundle.py` between dataset card and manifest steps
+- 24 new tests; total 545 passing
+
 ### Milestone 8 — Render / Bundle Layer ✓ (v0.4.0 in PR)
 - `render/relational.py`: `to_dataframes()` — 9-table dict of typed DataFrames from SimulationResult + PopulationResult
 - `render/snapshots.py`: `build_snapshot()` — 30-column leakage-free lead snapshot with touch/session/activity aggregates, account/contact field joins
diff --git a/leadforge/api/bundle.py b/leadforge/api/bundle.py
index 2cfdb4c..5874596 100644
--- a/leadforge/api/bundle.py
+++ b/leadforge/api/bundle.py
@@ -6,7 +6,9 @@
 1. Write relational Parquet tables (``tables/``).
 2. Build the lead snapshot and write task splits (``tasks/``).
 3. Write ``dataset_card.md`` and ``feature_dictionary.csv``.
-4. Build and write ``manifest.json``.
+4. Apply exposure filtering — write ``metadata/`` for ``research_instructor``
+   mode; skip it for ``student_public``.
+5. Build and write ``manifest.json``.
 """
 
 from __future__ import annotations
@@ -14,6 +16,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+from leadforge.exposure.modes import apply_exposure
 from leadforge.narrative.dataset_card import render_dataset_card
 from leadforge.render.manifests import build_manifest, write_manifest
 from leadforge.render.relational import to_dataframes
@@ -74,7 +77,12 @@ def write_bundle(bundle: WorldBundle, path: str) -> None:
     write_feature_dictionary(root / "feature_dictionary.csv")
 
     # ------------------------------------------------------------------
-    # 4. Manifest
+    # 4. Exposure metadata (research_instructor only)
+    # ------------------------------------------------------------------
+    apply_exposure(bundle, root, config.exposure_mode)
+
+    # ------------------------------------------------------------------
+    # 5. Manifest
     # ------------------------------------------------------------------
     manifest = build_manifest(
         config=config,
diff --git a/leadforge/exposure/filters.py b/leadforge/exposure/filters.py
new file mode 100644
index 0000000..cbcab5c
--- /dev/null
+++ b/leadforge/exposure/filters.py
@@ -0,0 +1,45 @@
+"""Per-mode bundle filter rules.
+
+:data:`FILTERS` maps every :class:`~leadforge.core.enums.ExposureMode` to a
+:class:`BundleFilter` that governs which artefacts are written when
+:func:`~leadforge.api.bundle.write_bundle` produces an output bundle.
+
+Adding a new mode: define its ``BundleFilter`` entry in ``FILTERS``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from leadforge.core.enums import ExposureMode
+
+
+@dataclass(frozen=True)
+class BundleFilter:
+    """Immutable publication rules for one :class:`ExposureMode`.
+
+    Attributes:
+        write_metadata: ``True`` to publish a ``metadata/`` directory holding
+            the hidden-truth files (``graph.json``, ``graph.graphml``,
+            ``world_spec.json``, ``latent_registry.json``, ``mechanism_summary.json``).
+    """
+
+    write_metadata: bool
+
+
+#: Canonical filter rules; every ``ExposureMode`` member must have an entry here.
+FILTERS: dict[ExposureMode, BundleFilter] = {
+    ExposureMode.student_public: BundleFilter(write_metadata=False),
+    ExposureMode.research_instructor: BundleFilter(write_metadata=True),
+}
+
+
+def get_filter(mode: ExposureMode) -> BundleFilter:
+    """Look up the :class:`BundleFilter` registered for *mode* in ``FILTERS``.
+
+    Raises:
+        KeyError: If *mode* has no entry in ``FILTERS`` — not expected from
+            well-typed callers, but it surfaces a new ``ExposureMode`` member
+            that was added without a matching filter.
+    """
+    return FILTERS[mode]
diff --git a/leadforge/exposure/modes.py b/leadforge/exposure/modes.py
new file mode 100644
index 0000000..97d5ce6
--- /dev/null
+++ b/leadforge/exposure/modes.py
@@ -0,0 +1,36 @@
+"""Exposure-mode dispatch for bundle publication.
+
+:func:`apply_exposure` is the single entry point called by
+:func:`~leadforge.api.bundle.write_bundle`.  It reads the resolved
+:class:`~leadforge.exposure.filters.BundleFilter` for the requested mode
+and performs the corresponding writes (or skips them).
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from leadforge.core.enums import ExposureMode
+from leadforge.exposure.filters import get_filter
+from leadforge.exposure.redaction import write_metadata_dir
+
+if TYPE_CHECKING:
+    from leadforge.core.models import WorldBundle
+
+
+def apply_exposure(bundle: WorldBundle, bundle_root: Path, mode: ExposureMode) -> None:
+    """Publish the mode-dependent artefacts of *bundle* under *bundle_root*.
+
+    The filter registered for *mode* decides what is written: ``metadata/``
+    with the hidden-truth files when its ``write_metadata`` flag is set
+    (``research_instructor``), nothing at all otherwise (``student_public``).
+
+    Args:
+        bundle: Fully populated :class:`~leadforge.core.models.WorldBundle`.
+        bundle_root: Root directory of the written bundle (must already exist).
+        mode: Exposure mode that controls which artefacts are published.
+    """
+    if not get_filter(mode).write_metadata:
+        return
+    write_metadata_dir(bundle, bundle_root)
diff --git a/leadforge/exposure/redaction.py b/leadforge/exposure/redaction.py
new file mode 100644
index 0000000..13bde34
--- /dev/null
+++ b/leadforge/exposure/redaction.py
@@ -0,0 +1,79 @@
+"""Write hidden-truth metadata files for ``research_instructor`` mode.
+
+:func:`write_metadata_dir` creates ``bundle_root/metadata/`` and populates
+it with five files that expose the full hidden world:
+
+- ``graph.json`` — world graph as JSON (nodes, edges, motif family)
+- ``graph.graphml`` — world graph as GraphML for graph tools
+- ``world_spec.json`` — generation config + narrative spec
+- ``latent_registry.json`` — per-entity latent trait values
+- ``mechanism_summary.json`` — mechanism assignment summary
+"""
+
+from __future__ import annotations
+
+import dataclasses
+import json
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from leadforge.core.models import WorldBundle
+
+
+def write_metadata_dir(bundle: WorldBundle, bundle_root: Path) -> None:
+    """Populate ``bundle_root/metadata/`` with all hidden-truth files.
+
+    Every file is written as UTF-8 so that the bundle contents do not depend
+    on the locale encoding of the machine that generated it.  The directory
+    is created if missing; files already present in it are overwritten.
+
+    Args:
+        bundle: Fully populated :class:`~leadforge.core.models.WorldBundle`.
+        bundle_root: Root directory of the written bundle.
+    """
+    from leadforge.core.rng import RNGRoot
+    from leadforge.mechanisms.policies import assign_mechanisms
+
+    # Callers must only invoke this after full bundle assembly; world_graph
+    # and population are guaranteed non-None at that point.
+    assert bundle.world_graph is not None  # noqa: S101
+    assert bundle.population is not None  # noqa: S101
+
+    meta_dir = bundle_root / "metadata"
+    # No ``parents=True``: the bundle root itself has to exist already.
+    meta_dir.mkdir(exist_ok=True)
+
+    def _write(name: str, text: str) -> None:
+        # Pin the encoding: ``Path.write_text`` otherwise falls back to the
+        # platform locale, which would make the output bytes host-dependent.
+        (meta_dir / name).write_text(text, encoding="utf-8")
+
+    # graph.json + graph.graphml
+    _write("graph.json", bundle.world_graph.to_json())
+    _write("graph.graphml", bundle.world_graph.to_graphml())
+
+    # latent_registry.json
+    ls = bundle.population.latent_state
+    latent_registry: dict[str, object] = {
+        "account_latents": ls.account_latents,
+        "contact_latents": ls.contact_latents,
+        "lead_latents": ls.lead_latents,
+    }
+    _write("latent_registry.json", json.dumps(latent_registry, indent=2))
+
+    # world_spec.json — config + narrative (if present)
+    config_dict = dataclasses.asdict(bundle.spec.config)
+    narrative_dict = (
+        dataclasses.asdict(bundle.spec.narrative) if bundle.spec.narrative is not None else None
+    )
+    world_spec_dict = {"config": config_dict, "narrative": narrative_dict}
+    _write("world_spec.json", json.dumps(world_spec_dict, indent=2))
+
+    # mechanism_summary.json
+    # Reconstruct the mechanism assignment with the same RNG substream that
+    # was used during simulation — produces the identical parameter values.
+    motif_family = bundle.world_graph.motif_family
+    mech_rng = RNGRoot(bundle.spec.config.seed).child("mechanisms")
+    assignment = assign_mechanisms(motif_family, mech_rng)
+    _write("mechanism_summary.json", json.dumps(assignment.summary().to_dict(), indent=2))
diff --git a/tests/exposure/__init__.py b/tests/exposure/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/exposure/test_exposure.py b/tests/exposure/test_exposure.py
new file mode 100644
index 0000000..0c55bb2
--- /dev/null
+++ b/tests/exposure/test_exposure.py
@@ -0,0 +1,193 @@
+"""Tests for leadforge.exposure — ExposureMode filtering and metadata writes."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from leadforge.api.generator import Generator
+from leadforge.core.enums import ExposureMode
+from leadforge.exposure.filters import FILTERS, BundleFilter, get_filter
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+_SMALL_GENERATE_KWARGS: dict[str, int] = {"n_leads": 30, "n_accounts": 15, "n_contacts": 45}
+
+
+def _make_bundle(mode: str, seed: int = 42):
+    generator = Generator.from_recipe("b2b_saas_procurement_v1", seed=seed, exposure_mode=mode)
+    return generator.generate(**_SMALL_GENERATE_KWARGS)
+
+
+# ---------------------------------------------------------------------------
+# Unit tests — BundleFilter / FILTERS
+# ---------------------------------------------------------------------------
+
+
+class TestFilters:
+    def test_all_modes_have_filter(self) -> None:
+        for mode in ExposureMode:
+            assert mode in FILTERS, f"{mode!r} has no entry in FILTERS"
+
+    def test_student_public_no_metadata(self) -> None:
+        rule = get_filter(ExposureMode.student_public)
+        assert isinstance(rule, BundleFilter)
+        assert rule.write_metadata is False
+
+    def test_research_instructor_writes_metadata(self) -> None:
+        rule = get_filter(ExposureMode.research_instructor)
+        assert rule.write_metadata is True
+
+    def test_unknown_mode_raises(self) -> None:
+        """A mode string that is not a key of ``FILTERS`` must raise ``KeyError``."""
+        with pytest.raises(KeyError):
+            get_filter("totally_fake_mode")  # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# Integration tests — write_bundle via WorldBundle.save
+# ---------------------------------------------------------------------------
+
+
+class TestStudentPublicMode:
+    """``student_public`` bundles: core artefacts only, no ``metadata/``."""
+
+    def test_no_metadata_dir(self, tmp_path: Path) -> None:
+        _make_bundle("student_public").save(str(tmp_path))
+        assert not (tmp_path / "metadata").exists()
+
+    def test_core_files_present(self, tmp_path: Path) -> None:
+        _make_bundle("student_public").save(str(tmp_path))
+        # Regular files first, then the two data directories.
+        for file_name in ("manifest.json", "dataset_card.md", "feature_dictionary.csv"):
+            assert (tmp_path / file_name).exists()
+        for dir_name in ("tables", "tasks"):
+            assert (tmp_path / dir_name).is_dir()
+
+
+class TestResearchInstructorMode:
+    """``research_instructor`` bundles: core artefacts plus hidden-truth ``metadata/``."""
+
+    def test_metadata_dir_created(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        assert meta_dir.is_dir()
+
+    def test_all_metadata_files_present(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        for fname in (
+            "graph.json",
+            "graph.graphml",
+            "world_spec.json",
+            "latent_registry.json",
+            "mechanism_summary.json",
+        ):
+            assert (meta_dir / fname).exists(), f"Missing metadata file: {fname}"
+
+    def test_graph_json_valid(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        graph = json.loads((meta_dir / "graph.json").read_text())
+        assert "nodes" in graph
+        assert "edges" in graph
+        assert "motif_family" in graph
+
+    def test_graph_graphml_valid_xml(self, tmp_path: Path) -> None:
+        import xml.etree.ElementTree as ET  # stdlib
+
+        _make_bundle("research_instructor").save(str(tmp_path))
+        graphml_path = tmp_path / "metadata" / "graph.graphml"
+        xml_text = graphml_path.read_text()
+        # Parsing is the assertion: malformed XML raises ``ParseError``.
+        ET.fromstring(xml_text)  # noqa: S314 — bundle data we generated, not external input
+
+    def test_latent_registry_keys(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        registry = json.loads((meta_dir / "latent_registry.json").read_text())
+        assert set(registry.keys()) == {"account_latents", "contact_latents", "lead_latents"}
+
+    def test_latent_registry_populated(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        registry = json.loads((meta_dir / "latent_registry.json").read_text())
+        # None of the three per-entity registries may be empty.
+        assert len(registry["account_latents"]) > 0
+        assert len(registry["contact_latents"]) > 0
+        assert len(registry["lead_latents"]) > 0
+
+    def test_latent_registry_values_in_unit_interval(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        registry = json.loads((meta_dir / "latent_registry.json").read_text())
+        for registry_key in ("account_latents", "contact_latents", "lead_latents"):
+            for entity_id, traits in registry[registry_key].items():
+                for trait_name, value in traits.items():
+                    assert 0.0 <= value <= 1.0, (
+                        f"{registry_key}[{entity_id!r}][{trait_name!r}] = {value} out of [0, 1]"
+                    )
+
+    def test_world_spec_json_keys(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        world_spec = json.loads((meta_dir / "world_spec.json").read_text())
+        assert "config" in world_spec
+        assert "narrative" in world_spec
+
+    def test_world_spec_config_matches_bundle(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor", seed=77).save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        world_spec = json.loads((meta_dir / "world_spec.json").read_text())
+        assert world_spec["config"]["seed"] == 77
+        assert world_spec["config"]["recipe_id"] == "b2b_saas_procurement_v1"
+
+    def test_mechanism_summary_keys(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        summary = json.loads((meta_dir / "mechanism_summary.json").read_text())
+        assert "motif_family" in summary
+        assert "conversion_hazard" in summary
+        assert "stage_transition" in summary
+        assert "touch_intensity" in summary
+        assert "measurement" in summary
+
+    def test_mechanism_summary_motif_matches_graph(self, tmp_path: Path) -> None:
+        _make_bundle("research_instructor").save(str(tmp_path))
+        meta_dir = tmp_path / "metadata"
+        graph = json.loads((meta_dir / "graph.json").read_text())
+        summary = json.loads((meta_dir / "mechanism_summary.json").read_text())
+        assert graph["motif_family"] == summary["motif_family"]
+
+    def test_core_files_still_present(self, tmp_path: Path) -> None:
+        """The extra ``metadata/`` write must leave the standard bundle files in place."""
+        _make_bundle("research_instructor").save(str(tmp_path))
+        assert (tmp_path / "manifest.json").exists()
+        assert (tmp_path / "dataset_card.md").exists()
+        assert (tmp_path / "feature_dictionary.csv").exists()
+        assert (tmp_path / "tables").is_dir()
+        assert (tmp_path / "tasks").is_dir()
+
+
+class TestModeDeterminism:
+    """The seed alone determines the published latent registry."""
+
+    def test_same_seed_same_latent_registry(self, tmp_path: Path) -> None:
+        first_root, second_root = tmp_path / "run1", tmp_path / "run2"
+        for root in (first_root, second_root):
+            _make_bundle("research_instructor", seed=42).save(str(root))
+        first = json.loads((first_root / "metadata" / "latent_registry.json").read_text())
+        second = json.loads((second_root / "metadata" / "latent_registry.json").read_text())
+        assert first == second
+
+    def test_different_seeds_different_latent_registries(self, tmp_path: Path) -> None:
+        registries = []
+        for seed in (1, 2):
+            root = tmp_path / f"run{seed}"
+            _make_bundle("research_instructor", seed=seed).save(str(root))
+            registries.append(json.loads((root / "metadata" / "latent_registry.json").read_text()))
+        assert registries[0] != registries[1]