leadforge-dev · shaypal5 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/.agent-plan.md b/.agent-plan.md
@@ -6,35 +6,40 @@
 
 ## Current System State
 
-**v0.4.0 complete — Milestones 7–10 done.** Full simulation engine + render/bundle layer + exposure filtering + CLI commands implemented. 562 tests passing.
+**v0.5.0 in progress — Milestones 7–11 complete.** Full simulation engine + render/bundle + exposure filtering + CLI commands + validation harness implemented. 581 tests passing.
 
 ---
 
-## Next Up — Milestone 11: Validation harness (v0.5.0)
+## Next Up — Milestone 12: CLI polish + JSON output (v0.5.0)
 
-Goal: Implement comprehensive bundle validation — invariant checks, realism heuristics, difficulty drift detection.
+Goal: Polish CLI commands with JSON output mode, richer help text, and progress feedback.
 
-- [ ] `validation/invariants.py` — DAG acyclicity, FK integrity, determinism, exposure monotonicity
-- [ ] `validation/artifact_checks.py` — file presence, hash verification, schema conformance
-- [ ] `validation/realism.py` — distributional sanity checks (conversion rates, feature ranges)
-- [ ] `validation/difficulty.py` — difficulty profile adherence checks
-- [ ] `validation/drift.py` — cross-seed stability / drift detection
-- [ ] Wire into `cli/commands/validate.py` with richer output
-- [ ] Tests for each validation module
+- [ ] Add `--json` flag to `inspect` and `validate` for machine-readable output
+- [ ] Add `--strict` flag to `validate` to control whether realism checks are reported as errors or warnings
+- [ ] Improve CLI help text and error messages
+- [ ] Tests for JSON output mode
 
 ---
 
 ## Context Pointers
 
-- Milestone 11 scope: `docs/leadforge_implementation_plan.md` §10 "Milestone 11"
-- Current validate CLI: `leadforge/cli/commands/validate.py` (basic checks implemented in M10)
-- FK constraints: `leadforge/schema/relationships.py`
-- Feature spec: `leadforge/schema/features.py`
+- Milestone 12 scope: `docs/leadforge_implementation_plan.md` §10 "Milestone 12"
+- CLI commands: `leadforge/cli/commands/`
+- Validation modules: `leadforge/validation/`
 
 ---
 
 ## Completed Phases
 
+### Milestone 11 — Validation Harness ✓ (v0.5.0)
+- `validation/bundle_checks.py`: orchestrator — artifact, FK, leakage checks + wires realism/difficulty
+- `validation/invariants.py`: determinism (same seed → identical hashes), exposure monotonicity (student ⊂ instructor)
+- `validation/realism.py`: conversion rate bounds, non-empty core tables, feature value ranges (non-negative counts, valid booleans), stage distribution diversity
+- `validation/difficulty.py`: known-difficulty validation, ordering check (no-op until engine modulates by difficulty)
+- `validation/drift.py`: cross-seed stability — conversion rate spread, degenerate seed detection
+- All wired into `validate_bundle()` via `include_realism` flag
+- 18 new validation tests; total 581 passing
+
 ### Milestone 10 — CLI Commands ✓ (v0.4.0)
 - `cli/commands/generate.py`: fully wired — parses all flags, calls `Generator.from_recipe().generate()`, writes bundle via `.save()`
 - `cli/commands/inspect.py`: reads `manifest.json` and prints summary (recipe, seed, mode, tables with row counts, task splits, metadata presence)

diff --git a/leadforge/validation/bundle_checks.py b/leadforge/validation/bundle_checks.py
@@ -17,11 +17,18 @@
 from leadforge.core.serialization import load_json
 from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES
 from leadforge.schema.relationships import ALL_CONSTRAINTS
+from leadforge.validation.difficulty import check_difficulty
+from leadforge.validation.realism import check_realism
 
 
-def validate_bundle(bundle_root: Path) -> list[str]:
+def validate_bundle(bundle_root: Path, *, include_realism: bool = True) -> list[str]:
     """Run all validation checks on the bundle at *bundle_root*.
 
+    Args:
+        bundle_root: Path to the bundle directory.
+        include_realism: If True (default), also run distributional sanity
+            and difficulty-adherence checks.
+
     Returns:
         A list of error strings.  An empty list means the bundle is valid.
 
@@ -37,6 +44,11 @@ def validate_bundle(bundle_root: Path) -> list[str]:
     errors.extend(_check_task_splits(bundle_root, manifest))
     errors.extend(_check_fk_integrity(tables))
     errors.extend(_check_leakage(bundle_root, manifest))
+
+    if include_realism:
+        errors.extend(check_realism(bundle_root, manifest))
+        errors.extend(check_difficulty(manifest))
+
     return errors
 
 

diff --git a/leadforge/validation/difficulty.py b/leadforge/validation/difficulty.py
@@ -0,0 +1,51 @@
+"""Difficulty profile adherence checks.
+
+Verifies that a bundle's manifest declares a known difficulty profile.
+
+NOTE: The v1 simulation engine does not yet modulate conversion rates by
+difficulty profile — all profiles currently produce the same rate.  The
+``check_difficulty_ordering`` function is therefore a no-op.  Once the
+engine wires in difficulty-dependent parameters, it can be extended with
+per-profile rate assertions.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+# Difficulty profile names a manifest is allowed to declare.
+_KNOWN_DIFFICULTIES = {"intro", "intermediate", "advanced"}
+
+
+def check_difficulty(manifest: dict[str, Any]) -> list[str]:
+    """Check that the manifest declares a known difficulty profile.
+
+    Args:
+        manifest: Parsed manifest dict.
+
+    Returns:
+        A list of error strings (empty = pass).  A missing ``difficulty``
+        key and an unrecognised value are reported with different messages.
+    """
+    errors: list[str] = []
+    difficulty = manifest.get("difficulty")
+    if difficulty is None:
+        errors.append("Manifest missing 'difficulty' field")
+    # The isinstance guard runs first: a malformed manifest may carry an
+    # unhashable value (list/dict), and testing that for set membership
+    # would raise TypeError instead of yielding a validation error.
+    elif not isinstance(difficulty, str) or difficulty not in _KNOWN_DIFFICULTIES:
+        errors.append(f"Unknown difficulty profile: '{difficulty}'")
+    return errors
+
+
+def check_difficulty_ordering(bundles: dict[str, Path]) -> list[str]:
+    """Placeholder for the cross-difficulty conversion-rate ordering check.
+
+    The intended contract is that conversion rates decrease as difficulty
+    increases.
+
+    Args:
+        bundles: Mapping of difficulty name → bundle path.  Not read by the
+            current implementation.
+
+    Returns:
+        Error strings if the ordering is violated; at present always an
+        empty list.
+
+    NOTE: The simulation engine does not yet modulate conversion rates by
+    difficulty, so every profile produces the same rate and there is
+    nothing to compare.  This function is therefore a no-op for now.
+    """
+    ordering_errors: list[str] = []
+    return ordering_errors
diff --git a/leadforge/validation/drift.py b/leadforge/validation/drift.py
@@ -0,0 +1,68 @@
+"""Cross-seed stability checks.
+
+Verifies that different seeds produce statistically similar distributions,
+catching degenerate parameter regimes where one seed produces reasonable
+output but another collapses.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+
+
+def check_cross_seed_stability(
+    bundles: dict[int, Path], *, max_rate_ratio: float = 5.0
+) -> list[str]:
+    """Compare bundles generated with different seeds.
+
+    Args:
+        bundles: Mapping of seed → bundle path.  Must contain at least 2
+            entries to perform any checks.
+        max_rate_ratio: Largest tolerated ratio between the highest and the
+            lowest per-seed conversion rate before the spread is reported.
+
+    Returns:
+        Error strings for any instabilities detected.
+    """
+    if len(bundles) < 2:
+        return []
+
+    errors: list[str] = []
+    rates: dict[int, float] = {}
+    stage_counts: dict[int, int] = {}
+
+    for seed, bundle_path in bundles.items():
+        train_path = bundle_path / "tasks/converted_within_90_days/train.parquet"
+        if train_path.exists():
+            df = pd.read_parquet(train_path, columns=["converted_within_90_days"])
+            if df.empty:
+                # An empty split has no defined rate; report it rather than
+                # silently dropping the seed from the comparison.
+                errors.append(
+                    f"Seed {seed}: empty train split — cannot compute conversion rate"
+                )
+            else:
+                rates[seed] = float(df["converted_within_90_days"].mean())
+        else:
+            errors.append(f"Seed {seed}: missing tasks/converted_within_90_days/train.parquet")
+
+        # Stage diversity does not depend on the task split, so it is
+        # recorded even when train.parquet is missing for this seed.
+        leads_path = bundle_path / "tables/leads.parquet"
+        if leads_path.exists():
+            leads = pd.read_parquet(leads_path, columns=["current_stage"])
+            stage_counts[seed] = int(leads["current_stage"].nunique())
+
+    # Check conversion rate spread — a large max/min ratio across seeds is suspicious.
+    if len(rates) >= 2:
+        min_rate = min(rates.values())
+        max_rate = max(rates.values())
+        # A zero minimum is reported by the per-seed check below; the guard
+        # also keeps the division well-defined.
+        if min_rate > 0 and max_rate / min_rate > max_rate_ratio:
+            errors.append(
+                f"Conversion rate spread too wide across seeds: "
+                f"min={min_rate:.4f}, max={max_rate:.4f} (ratio {max_rate / min_rate:.1f}x)"
+            )
+
+    # Flag any seed with (numerically) 0% or 100% conversion.  This is a
+    # per-seed property, so it runs even when only one seed has a rate.
+    eps = 1e-9
+    for seed, rate in rates.items():
+        if rate < eps:
+            errors.append(f"Seed {seed}: 0% conversion rate — simulation degenerate")
+        elif rate > 1.0 - eps:
+            errors.append(f"Seed {seed}: 100% conversion rate — simulation degenerate")
+
+    # Check stage diversity — all seeds should produce multiple stages
+    for seed, n_stages in stage_counts.items():
+        if n_stages < 2:
+            errors.append(f"Seed {seed}: only {n_stages} funnel stage(s) — degenerate")
+
+    return errors
diff --git a/leadforge/validation/invariants.py b/leadforge/validation/invariants.py
@@ -0,0 +1,158 @@
+"""Determinism and exposure-monotonicity invariant checks.
+
+These checks verify structural guarantees that must hold for every bundle:
+
+- **Determinism**: same (recipe, seed, config) → identical output.
+- **Exposure monotonicity**: ``student_public`` artefacts are a strict subset
+  of ``research_instructor`` artefacts.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from leadforge.core.hashing import file_sha256
+
+
+def check_determinism(bundle_a: Path, bundle_b: Path) -> list[str]:
+    """Diff two on-disk bundles that should be identical (same seed/config).
+
+    The three top-level core files and every ``*.parquet`` file found under
+    ``tables/`` and ``tasks/`` are compared via ``file_sha256``.
+
+    Args:
+        bundle_a: Root directory of the first bundle.
+        bundle_b: Root directory of the second bundle.
+
+    Returns:
+        One description per discrepancy (empty = deterministic).
+    """
+    mismatches: list[str] = []
+
+    # Top-level artefacts that are not Parquet but must still be reproducible.
+    for core_name in ("manifest.json", "dataset_card.md", "feature_dictionary.csv"):
+        left = bundle_a / core_name
+        right = bundle_b / core_name
+        left_present = left.exists()
+        right_present = right.exists()
+        if left_present and right_present:
+            if file_sha256(left) != file_sha256(right):
+                mismatches.append(f"Hash mismatch: {core_name}")
+        elif left_present or right_present:
+            # Exactly one side has the file.
+            mismatches.append(f"File '{core_name}' exists in one bundle but not the other")
+
+    # Parquet payloads, matched by path relative to each section root.
+    for section in ("tables", "tasks"):
+        root_a = bundle_a / section
+        root_b = bundle_b / section
+        a_present = root_a.exists()
+        b_present = root_b.exists()
+
+        if a_present and b_present:
+            rel_a = {path.relative_to(root_a) for path in root_a.rglob("*.parquet")}
+            rel_b = {path.relative_to(root_b) for path in root_b.rglob("*.parquet")}
+
+            a_exclusive = rel_a - rel_b
+            if a_exclusive:
+                mismatches.append(
+                    f"Files only in bundle A {section}/: {sorted(str(f) for f in a_exclusive)}"
+                )
+            b_exclusive = rel_b - rel_a
+            if b_exclusive:
+                mismatches.append(
+                    f"Files only in bundle B {section}/: {sorted(str(f) for f in b_exclusive)}"
+                )
+
+            # Files present on both sides must hash identically.
+            mismatches.extend(
+                f"Hash mismatch: {section}/{rel}"
+                for rel in sorted(rel_a & rel_b)
+                if file_sha256(root_a / rel) != file_sha256(root_b / rel)
+            )
+        elif a_present or b_present:
+            # The section directory exists in only one of the bundles.
+            mismatches.append(f"Directory '{section}' exists in one bundle but not the other")
+
+    return mismatches
+
+
+def check_exposure_monotonicity(student_bundle: Path, instructor_bundle: Path) -> list[str]:
+    """Verify that student_public is a subset of research_instructor.
+
+    Checks performed, in order:
+
+    - ``metadata/`` is absent from the student bundle and present in the
+      instructor bundle.
+    - ``manifest.json``, ``dataset_card.md`` and ``feature_dictionary.csv``
+      are present in both bundles or in neither.
+    - ``feature_dictionary.csv`` has the same hash in both bundles.
+    - ``tables/*.parquet`` and ``tasks/**/*.parquet`` list the same files in
+      both bundles, and each shared file has the same hash.
+
+    Args:
+        student_bundle: Root directory of the ``student_public`` bundle.
+        instructor_bundle: Root directory of the ``research_instructor`` bundle.
+
+    Returns:
+        Error strings for every violation found (empty = pass).
+    """
+    problems: list[str] = []
+
+    # metadata/ is instructor-only.
+    if (student_bundle / "metadata").exists():
+        problems.append("student_public bundle should not contain metadata/")
+    if not (instructor_bundle / "metadata").exists():
+        problems.append("research_instructor bundle is missing metadata/")
+
+    # Core files: presence must agree.  manifest.json and dataset_card.md
+    # legitimately differ between modes (exposure_mode field, metadata
+    # references), so their content is not compared here.
+    for core_name in ("manifest.json", "dataset_card.md", "feature_dictionary.csv"):
+        in_student = (student_bundle / core_name).exists()
+        in_instructor = (instructor_bundle / core_name).exists()
+        if in_student and not in_instructor:
+            problems.append(f"Student has {core_name} but instructor does not")
+        elif in_instructor and not in_student:
+            problems.append(f"Instructor has {core_name} but student does not")
+
+    # The feature dictionary, unlike the other core files, must match exactly.
+    student_dict = student_bundle / "feature_dictionary.csv"
+    instructor_dict = instructor_bundle / "feature_dictionary.csv"
+    if (
+        student_dict.exists()
+        and instructor_dict.exists()
+        and file_sha256(student_dict) != file_sha256(instructor_dict)
+    ):
+        problems.append("Content mismatch in shared file: feature_dictionary.csv")
+
+    # Tables: same file names (top level only) and identical content.
+    student_tables_dir = student_bundle / "tables"
+    instructor_tables_dir = instructor_bundle / "tables"
+    student_tables: set[str] = set()
+    if student_tables_dir.exists():
+        student_tables = {entry.name for entry in student_tables_dir.glob("*.parquet")}
+    instructor_tables: set[str] = set()
+    if instructor_tables_dir.exists():
+        instructor_tables = {entry.name for entry in instructor_tables_dir.glob("*.parquet")}
+
+    student_only_tables = student_tables - instructor_tables
+    if student_only_tables:
+        problems.append(f"Tables in student but not instructor: {sorted(student_only_tables)}")
+    instructor_only_tables = instructor_tables - student_tables
+    if instructor_only_tables:
+        problems.append(f"Tables in instructor but not student: {sorted(instructor_only_tables)}")
+
+    problems.extend(
+        f"Table content mismatch: {table}"
+        for table in sorted(student_tables & instructor_tables)
+        if file_sha256(student_tables_dir / table) != file_sha256(instructor_tables_dir / table)
+    )
+
+    # Task splits: same relative paths (searched recursively) and identical content.
+    student_tasks_dir = student_bundle / "tasks"
+    instructor_tasks_dir = instructor_bundle / "tasks"
+    student_tasks: set[Path] = set()
+    if student_tasks_dir.exists():
+        student_tasks = {
+            entry.relative_to(student_tasks_dir) for entry in student_tasks_dir.rglob("*.parquet")
+        }
+    instructor_tasks: set[Path] = set()
+    if instructor_tasks_dir.exists():
+        instructor_tasks = {
+            entry.relative_to(instructor_tasks_dir)
+            for entry in instructor_tasks_dir.rglob("*.parquet")
+        }
+
+    student_only_tasks = student_tasks - instructor_tasks
+    if student_only_tasks:
+        problems.append(
+            f"Task files in student but not instructor: {sorted(str(f) for f in student_only_tasks)}"
+        )
+    instructor_only_tasks = instructor_tasks - student_tasks
+    if instructor_only_tasks:
+        problems.append(
+            f"Task files in instructor but not student: {sorted(str(f) for f in instructor_only_tasks)}"
+        )
+
+    problems.extend(
+        f"Task content mismatch: {rel}"
+        for rel in sorted(student_tasks & instructor_tasks)
+        if file_sha256(student_tasks_dir / rel) != file_sha256(instructor_tasks_dir / rel)
+    )
+
+    return problems