diff --git a/.agent-plan.md b/.agent-plan.md index a2779df..d3e309d 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -44,7 +44,7 @@ First public dataset release: `leadforge-b2b-lead-scoring`. Three difficulty tie - [x] Verify three tiers produce different conversion rates (intro 41.5%, intermediate 20.1%, advanced 7.9%) - [x] Update release/README.md — remove stale "Known limitations", add conversion rates to dataset summary - [x] Update release/HF_DATASET_CARD.md — add conversion rates to summary table -- [ ] Verify SHA-256 hash determinism (re-run build, compare hashes) +- [x] Verify SHA-256 hash determinism (re-run build, compare hashes) — `scripts/verify_hash_determinism.py`; 73/73 files identical across two `build_public_release.py` runs (with `manifest.json`'s `generation_timestamp` pinned via `--generation-timestamp`) - [ ] Upload to Kaggle and HuggingFace - [ ] Announce diff --git a/.gitignore b/.gitignore index 8347321..385be89 100644 --- a/.gitignore +++ b/.gitignore @@ -216,3 +216,4 @@ release/intermediate/ release/advanced/ release/intermediate_instructor/ release/LICENSE +release/_determinism/ diff --git a/leadforge/render/manifests.py b/leadforge/render/manifests.py index 03d6201..d43fade 100644 --- a/leadforge/render/manifests.py +++ b/leadforge/render/manifests.py @@ -22,6 +22,11 @@ # Bump this whenever the bundle layout or manifest schema changes. BUNDLE_SCHEMA_VERSION = "2" +# Manifest fields whose value is non-deterministic by design (wall-clock, +# host metadata, etc.). Determinism checks must ignore these fields when +# comparing two bundles produced from the same (recipe, config, seed, version). +NON_DETERMINISTIC_MANIFEST_FIELDS: tuple[str, ...] 
= ("generation_timestamp",) + def build_manifest( config: GenerationConfig, diff --git a/leadforge/validation/invariants.py b/leadforge/validation/invariants.py index d06ac16..cb7caf1 100644 --- a/leadforge/validation/invariants.py +++ b/leadforge/validation/invariants.py @@ -9,9 +9,11 @@ from __future__ import annotations +import json from pathlib import Path from leadforge.core.hashing import file_sha256 +from leadforge.render.manifests import NON_DETERMINISTIC_MANIFEST_FIELDS def check_determinism(bundle_a: Path, bundle_b: Path) -> list[str]: @@ -60,6 +62,69 @@ def check_determinism(bundle_a: Path, bundle_b: Path) -> list[str]: return errors +def _manifest_payloads_match_modulo_non_deterministic(a: Path, b: Path) -> bool: + """Compare two manifest.json files after stripping non-deterministic fields. + + Re-dumps both payloads with ``sort_keys=True`` so a pure key reordering + does not count as a mismatch. + """ + payload_a = json.loads(a.read_text()) + payload_b = json.loads(b.read_text()) + for field in NON_DETERMINISTIC_MANIFEST_FIELDS: + payload_a.pop(field, None) + payload_b.pop(field, None) + return json.dumps(payload_a, sort_keys=True) == json.dumps(payload_b, sort_keys=True) + + +def compare_bundle_trees(bundle_a: Path, bundle_b: Path) -> list[str]: + """Full-tree byte-identical comparison of two bundle directories. + + Walks every file under both roots and reports: + + - files present in only one tree (``only in A:`` / ``only in B:``) + - files whose SHA-256 differs (``hash mismatch:``) + + The bundle ``manifest.json`` is special-cased: it carries + ``generation_timestamp`` (wall-clock UTC, set by ``build_manifest()``), + which is expected to differ across runs unless the caller pinned it. + For that one file, if the raw hashes differ, the function re-compares the + payload with non-deterministic fields stripped (see + :data:`NON_DETERMINISTIC_MANIFEST_FIELDS`). A mismatch *after* stripping + is still reported. 
+ + Use this for release-time integration checks; for the fast in-process + determinism property used in CI, see :func:`check_determinism`. + """ + errors: list[str] = [] + + files_a = {p.relative_to(bundle_a) for p in bundle_a.rglob("*") if p.is_file()} + files_b = {p.relative_to(bundle_b) for p in bundle_b.rglob("*") if p.is_file()} + + for rel in sorted(files_a - files_b): + errors.append(f"only in A: {rel}") + for rel in sorted(files_b - files_a): + errors.append(f"only in B: {rel}") + + for rel in sorted(files_a & files_b): + path_a = bundle_a / rel + path_b = bundle_b / rel + if file_sha256(path_a) == file_sha256(path_b): + continue + if rel.name == "manifest.json" and rel.parent == Path(): + if _manifest_payloads_match_modulo_non_deterministic(path_a, path_b): + continue + errors.append( + f"manifest payload mismatch (after stripping " + f"{list(NON_DETERMINISTIC_MANIFEST_FIELDS)}): {rel}" + ) + continue + size_a = path_a.stat().st_size + size_b = path_b.stat().st_size + errors.append(f"hash mismatch: {rel} (sizes: A={size_a}B, B={size_b}B)") + + return errors + + def check_exposure_monotonicity(student_bundle: Path, instructor_bundle: Path) -> list[str]: """Verify that student_public is a subset of research_instructor. diff --git a/scripts/build_public_release.py b/scripts/build_public_release.py index e348453..c4e72c2 100644 --- a/scripts/build_public_release.py +++ b/scripts/build_public_release.py @@ -2,7 +2,7 @@ """Build the public release bundles for Kaggle/HuggingFace. Usage: - python scripts/build_public_release.py [OUTPUT_DIR] + python scripts/build_public_release.py [OUTPUT_DIR] [--generation-timestamp ISO8601] Generates four bundles: - intro/ (student_public, intro difficulty) @@ -14,10 +14,16 @@ (lead_scoring.csv) merging train/valid/test with a ``split`` column. All bundles are validated with ``leadforge validate`` after generation. 
+ +The ``--generation-timestamp`` flag pins ``manifest.generation_timestamp`` to a +caller-supplied ISO-8601 UTC string. This is the supported way to produce +byte-reproducible bundles (used by ``scripts/verify_hash_determinism.py``); +the released bundles always use the wall-clock default. """ from __future__ import annotations +import argparse import json import shutil import sys @@ -45,6 +51,7 @@ def generate_and_save( exposure_mode: str, difficulty: str, seed: int = SEED, + generation_timestamp: str | None = None, ) -> None: """Generate a bundle and write it to *out_dir*.""" gen = Generator.from_recipe( @@ -54,7 +61,7 @@ def generate_and_save( difficulty=difficulty, ) bundle = gen.generate() - bundle.save(str(out_dir)) + bundle.save(str(out_dir), generation_timestamp=generation_timestamp) # Columns to drop from the flat CSV convenience export. @@ -111,7 +118,24 @@ def print_summary(bundle_dir: Path, name: str) -> None: def main() -> None: - output_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("release") + parser = argparse.ArgumentParser(description=__doc__.split("\n", maxsplit=1)[0]) + parser.add_argument( + "output_dir", + nargs="?", + default="release", + help="Output directory (default: release/)", + ) + parser.add_argument( + "--generation-timestamp", + default=None, + help=( + "ISO-8601 UTC string to pin manifest.generation_timestamp. " + "Default: wall-clock now. Use this for reproducible bundles." 
+ ), + ) + args = parser.parse_args() + + output_root = Path(args.output_dir) output_root.mkdir(parents=True, exist_ok=True) # Copy LICENSE @@ -122,7 +146,12 @@ def main() -> None: for dir_name, exposure_mode, difficulty in BUNDLES: bundle_dir = output_root / dir_name print(f"Generating {dir_name} ({exposure_mode}, {difficulty})...", file=sys.stderr) - generate_and_save(bundle_dir, exposure_mode, difficulty) + generate_and_save( + bundle_dir, + exposure_mode, + difficulty, + generation_timestamp=args.generation_timestamp, + ) # Flat CSV for student_public bundles if exposure_mode == "student_public": diff --git a/scripts/verify_hash_determinism.py b/scripts/verify_hash_determinism.py new file mode 100755 index 0000000..000f8ff --- /dev/null +++ b/scripts/verify_hash_determinism.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +"""Verify SHA-256 hash determinism of the public release build. + +Runs ``scripts/build_public_release.py`` twice into two output directories with +the same seed/config and a *pinned* manifest timestamp, then asserts every +generated file hashes identically across runs. + +Pinning ``--generation-timestamp`` on the build script means the resulting +``manifest.json`` is also byte-identical — no special-cased manifest stripping +needed at compare time. (For defence-in-depth, the underlying +:func:`leadforge.validation.invariants.compare_bundle_trees` still tolerates +a wall-clock-only manifest diff, but pinning is the supported workflow.) + +The architectural invariant being enforced is +"generation is deterministic given (recipe, config, seed, version)". +The corresponding fast in-process check lives in +``tests/validation/test_invariants.py::TestDeterminism`` and runs in CI on +every PR; this script is the slower release-time check that exercises the +full ``build_public_release.py`` pipeline. + +On failure, output directories are preserved (NOT auto-cleaned) so the +mismatching artifacts can be diffed directly. + +Exit code: 0 on PASS, 1 on FAIL. 
+ +Usage: + python scripts/verify_hash_determinism.py [--out DIR] [--keep-on-success] +""" + +from __future__ import annotations + +import argparse +import shutil +import subprocess +import sys +from pathlib import Path + +from leadforge.core.hashing import file_sha256 +from leadforge.validation.invariants import compare_bundle_trees + +REPO_ROOT = Path(__file__).resolve().parent.parent +BUILD_SCRIPT = REPO_ROOT / "scripts" / "build_public_release.py" + +# Pinned timestamp for both runs. Any fixed ISO-8601 UTC string works; using +# the unix epoch makes it obvious that it's a sentinel, not a real run time. +PINNED_TIMESTAMP = "1970-01-01T00:00:00+00:00" + +# Bundle subdirectories produced by build_public_release.py. Hardcoded here +# because the script's BUNDLES list is not exposed as a public API. If the +# build script grows new bundles, add them here. +BUNDLE_DIRS = ("intro", "intermediate", "advanced", "intermediate_instructor") + + +def run_build(out_dir: Path) -> None: + cmd = [ + sys.executable, + str(BUILD_SCRIPT), + str(out_dir), + "--generation-timestamp", + PINNED_TIMESTAMP, + ] + print(f" $ {' '.join(cmd)}") + subprocess.run(cmd, check=True, cwd=REPO_ROOT) # noqa: S603 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__.split("\n", maxsplit=1)[0]) + parser.add_argument( + "--out", + type=Path, + default=REPO_ROOT / "release" / "_determinism", + help="Base directory for both runs (will be wiped at start). " + "Default: release/_determinism/", + ) + parser.add_argument( + "--keep-on-success", + action="store_true", + help="Keep output directories even on PASS (default: clean up on PASS, " + "always preserve on FAIL).", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + + if not BUILD_SCRIPT.exists(): + print(f"FAIL: build script not found at {BUILD_SCRIPT}", file=sys.stderr) + return 1 + + base = args.out + run_a = base / "run_a" + run_b = base / "run_b" + + # Wipe and recreate. 
+ if base.exists(): + shutil.rmtree(base) + base.mkdir(parents=True) + + print(f"Run A → {run_a}") + run_build(run_a) + print(f"Run B → {run_b}") + run_build(run_b) + + # Per-bundle comparison so error messages stay scoped to a single bundle. + all_errors: list[tuple[str, list[str]]] = [] + total_files = 0 + for name in BUNDLE_DIRS: + bundle_a = run_a / name + bundle_b = run_b / name + if not bundle_a.exists() or not bundle_b.exists(): + all_errors.append((name, [f"bundle directory missing: {name}"])) + continue + errors = compare_bundle_trees(bundle_a, bundle_b) + bundle_files = sum(1 for p in bundle_a.rglob("*") if p.is_file()) + total_files += bundle_files + if errors: + all_errors.append((name, errors)) + + # Top-level files (LICENSE, etc.) — compare via hash directly. + top_a = {p.name for p in run_a.iterdir() if p.is_file()} + top_b = {p.name for p in run_b.iterdir() if p.is_file()} + top_errors: list[str] = [] + for name in sorted(top_a - top_b): + top_errors.append(f"top-level file only in A: {name}") + for name in sorted(top_b - top_a): + top_errors.append(f"top-level file only in B: {name}") + for name in sorted(top_a & top_b): + if file_sha256(run_a / name) != file_sha256(run_b / name): + top_errors.append(f"top-level hash mismatch: {name}") + total_files += len(top_a) + if top_errors: + all_errors.append(("", top_errors)) + + if not all_errors: + print(f"\nPASS: all {total_files} files hash identically across runs.") + if not args.keep_on_success: + shutil.rmtree(base) + print(f"(cleaned up {base})") + else: + print(f"(kept artifacts at {base})") + return 0 + + print(f"\nFAIL: mismatches in {len(all_errors)} bundle(s):") + for name, errors in all_errors: + print(f" [{name}]") + for e in errors: + print(f" - {e}") + print(f"\nArtifacts preserved for inspection:\n A: {run_a}\n B: {run_b}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/validation/test_invariants.py b/tests/validation/test_invariants.py index 
3668ccf..078548c 100644 --- a/tests/validation/test_invariants.py +++ b/tests/validation/test_invariants.py @@ -2,12 +2,17 @@ from __future__ import annotations +import json from pathlib import Path import pytest from leadforge.api.generator import Generator -from leadforge.validation.invariants import check_determinism, check_exposure_monotonicity +from leadforge.validation.invariants import ( + check_determinism, + check_exposure_monotonicity, + compare_bundle_trees, +) _SMALL = {"n_leads": 20, "n_accounts": 10, "n_contacts": 30} @@ -80,3 +85,111 @@ def test_instructor_without_metadata_fails(self, exposure_bundles: tuple[Path, P # Student as "instructor" lacks metadata/ errors = check_exposure_monotonicity(student, student) assert any("missing metadata" in e for e in errors) + + +def _make_synthetic_bundle( + root: Path, + files: dict[str, str | bytes], + manifest: dict | None = None, +) -> Path: + """Write a fake bundle layout with the given files and optional manifest.""" + root.mkdir(parents=True, exist_ok=True) + if manifest is not None: + (root / "manifest.json").write_text(json.dumps(manifest, indent=2)) + for rel, content in files.items(): + path = root / rel + path.parent.mkdir(parents=True, exist_ok=True) + if isinstance(content, bytes): + path.write_bytes(content) + else: + path.write_text(content) + return root + + +class TestCompareBundleTrees: + """Synthetic-bundle unit tests for compare_bundle_trees. + + These avoid running the full generator so the verifier's logic is exercised + independently of generation determinism. Real end-to-end determinism is + covered by TestDeterminism above. 
+ """ + + def test_identical_trees_no_errors(self, tmp_path: Path) -> None: + a = _make_synthetic_bundle( + tmp_path / "a", + files={"tables/x.parquet": b"\x01\x02", "dataset_card.md": "hello"}, + ) + b = _make_synthetic_bundle( + tmp_path / "b", + files={"tables/x.parquet": b"\x01\x02", "dataset_card.md": "hello"}, + ) + assert compare_bundle_trees(a, b) == [] + + def test_only_in_a_reported(self, tmp_path: Path) -> None: + a = _make_synthetic_bundle( + tmp_path / "a", + files={"tables/x.parquet": b"x", "tables/extra.parquet": b"y"}, + ) + b = _make_synthetic_bundle(tmp_path / "b", files={"tables/x.parquet": b"x"}) + errors = compare_bundle_trees(a, b) + assert any("only in A" in e and "extra.parquet" in e for e in errors) + + def test_only_in_b_reported(self, tmp_path: Path) -> None: + a = _make_synthetic_bundle(tmp_path / "a", files={"tables/x.parquet": b"x"}) + b = _make_synthetic_bundle( + tmp_path / "b", + files={"tables/x.parquet": b"x", "metadata/world_spec.json": "{}"}, + ) + errors = compare_bundle_trees(a, b) + assert any("only in B" in e and "world_spec.json" in e for e in errors) + + def test_hash_mismatch_reported_with_sizes(self, tmp_path: Path) -> None: + a = _make_synthetic_bundle(tmp_path / "a", files={"tables/x.parquet": b"abc"}) + b = _make_synthetic_bundle(tmp_path / "b", files={"tables/x.parquet": b"abcd"}) + errors = compare_bundle_trees(a, b) + assert len(errors) == 1 + assert "hash mismatch" in errors[0] + assert "x.parquet" in errors[0] + assert "A=3B" in errors[0] + assert "B=4B" in errors[0] + + def test_manifest_only_timestamp_diff_passes(self, tmp_path: Path) -> None: + manifest_a = {"seed": 42, "generation_timestamp": "2026-01-01T00:00:00+00:00"} + manifest_b = {"seed": 42, "generation_timestamp": "2026-12-31T23:59:59+00:00"} + a = _make_synthetic_bundle(tmp_path / "a", files={}, manifest=manifest_a) + b = _make_synthetic_bundle(tmp_path / "b", files={}, manifest=manifest_b) + assert compare_bundle_trees(a, b) == [] + + def 
test_manifest_real_diff_reported(self, tmp_path: Path) -> None: + manifest_a = {"seed": 42, "generation_timestamp": "2026-01-01T00:00:00+00:00"} + manifest_b = {"seed": 43, "generation_timestamp": "2026-01-01T00:00:00+00:00"} + a = _make_synthetic_bundle(tmp_path / "a", files={}, manifest=manifest_a) + b = _make_synthetic_bundle(tmp_path / "b", files={}, manifest=manifest_b) + errors = compare_bundle_trees(a, b) + assert len(errors) == 1 + assert "manifest payload mismatch" in errors[0] + + def test_manifest_key_reorder_only_passes(self, tmp_path: Path) -> None: + # Same logical payload, different on-disk key order — must NOT be flagged + # as a mismatch. (json.dumps with sort_keys=True normalises both sides.) + a_root = tmp_path / "a" + b_root = tmp_path / "b" + a_root.mkdir() + b_root.mkdir() + (a_root / "manifest.json").write_text(json.dumps({"seed": 42, "n_leads": 100}, indent=2)) + (b_root / "manifest.json").write_text(json.dumps({"n_leads": 100, "seed": 42}, indent=2)) + assert compare_bundle_trees(a_root, b_root) == [] + + def test_nested_manifest_not_special_cased(self, tmp_path: Path) -> None: + # Only the top-level bundle manifest.json gets timestamp-stripping. + # A file named manifest.json deeper in the tree is compared byte-for-byte. + a = _make_synthetic_bundle( + tmp_path / "a", + files={"tasks/foo/manifest.json": '{"generation_timestamp": "T1"}'}, + ) + b = _make_synthetic_bundle( + tmp_path / "b", + files={"tasks/foo/manifest.json": '{"generation_timestamp": "T2"}'}, + ) + errors = compare_bundle_trees(a, b) + assert any("hash mismatch" in e for e in errors)