From 490eeb948c11080df59df718999315aab1510327 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Mon, 4 May 2026 10:56:37 +0300 Subject: [PATCH 1/3] test: add SHA-256 hash determinism verifier for public release Runs scripts/build_public_release.py twice into temp directories and asserts every generated file hashes identically across runs (modulo manifest.json's wall-clock generation_timestamp, which is stripped before comparison). Enforces the "deterministic given (recipe, config, seed, version)" architectural invariant on the bundle layer. Exits 0 on PASS, 1 on FAIL. Co-Authored-By: Claude Opus 4.7 --- scripts/verify_hash_determinism.py | 143 +++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100755 scripts/verify_hash_determinism.py diff --git a/scripts/verify_hash_determinism.py b/scripts/verify_hash_determinism.py new file mode 100755 index 0000000..78e0f9b --- /dev/null +++ b/scripts/verify_hash_determinism.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +"""Verify SHA-256 hash determinism of the public release build. + +Runs ``scripts/build_public_release.py`` twice into two temp directories with +the same seed/config and compares the SHA-256 digest of every generated file. + +The architectural invariant is that generation is deterministic given +``(recipe, config, seed, version)``. This script enforces that invariant on +the bundle layer: every file written under each bundle directory must hash +identically across runs. + +Two practical exceptions are handled: + +1. ``manifest.json`` contains a ``generation_timestamp`` field set to + ``datetime.now(UTC)`` at write time, so the file bytes legitimately differ + between runs. The script strips that field and compares the remaining + manifest payload (which already includes per-file SHA-256 digests for the + relational and task Parquet files). + +2. ``LICENSE`` is copied from the repo root and is identical by construction; + it is hashed and compared like any other file. 
+ +Exit code: 0 on PASS (all hashes match), 1 on FAIL. + +Usage: + python scripts/verify_hash_determinism.py +""" + +from __future__ import annotations + +import hashlib +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +BUILD_SCRIPT = REPO_ROOT / "scripts" / "build_public_release.py" + +# Field stripped before comparing manifest payloads; differs by design between +# runs (set to wall-clock time in build_manifest()). +MANIFEST_TIMESTAMP_FIELD = "generation_timestamp" + + +def file_sha256(path: Path) -> str: + h = hashlib.sha256() + with path.open("rb") as fh: + for chunk in iter(lambda: fh.read(65536), b""): + h.update(chunk) + return h.hexdigest() + + +def walk_files(root: Path) -> list[Path]: + """Return all regular files under *root*, sorted by relative path.""" + return sorted(p for p in root.rglob("*") if p.is_file()) + + +def hash_tree(root: Path) -> dict[str, str]: + """Map relative-path → SHA-256 for every file under *root*.""" + return {str(p.relative_to(root)): file_sha256(p) for p in walk_files(root)} + + +def manifest_payload_without_timestamp(path: Path) -> dict: + payload = json.loads(path.read_text()) + payload.pop(MANIFEST_TIMESTAMP_FIELD, None) + return payload + + +def run_build(out_dir: Path) -> None: + cmd = [sys.executable, str(BUILD_SCRIPT), str(out_dir)] + print(f" $ {' '.join(cmd)}", flush=True) + subprocess.run(cmd, check=True, cwd=REPO_ROOT) # noqa: S603 + + +def compare(run_a: Path, run_b: Path) -> list[str]: + """Return a list of human-readable mismatch messages (empty == identical).""" + tree_a = hash_tree(run_a) + tree_b = hash_tree(run_b) + + mismatches: list[str] = [] + + only_a = sorted(set(tree_a) - set(tree_b)) + only_b = sorted(set(tree_b) - set(tree_a)) + for rel in only_a: + mismatches.append(f"only in run A: {rel}") + for rel in only_b: + mismatches.append(f"only in run B: {rel}") + + for rel in sorted(set(tree_a) & set(tree_b)): + if 
tree_a[rel] == tree_b[rel]: + continue + # manifest.json carries a wall-clock timestamp; compare the rest. + if Path(rel).name == "manifest.json": + payload_a = manifest_payload_without_timestamp(run_a / rel) + payload_b = manifest_payload_without_timestamp(run_b / rel) + if payload_a == payload_b: + continue + mismatches.append( + f"manifest payload mismatch (excluding {MANIFEST_TIMESTAMP_FIELD}): {rel}" + ) + continue + mismatches.append(f"hash mismatch: {rel}\n A={tree_a[rel]}\n B={tree_b[rel]}") + + return mismatches + + +def main() -> int: + if not BUILD_SCRIPT.exists(): + print(f"FAIL: build script not found at {BUILD_SCRIPT}", file=sys.stderr) + return 1 + + with tempfile.TemporaryDirectory(prefix="leadforge_determinism_") as tmp: + run_a = Path(tmp) / "run_a" + run_b = Path(tmp) / "run_b" + + print(f"Run A → {run_a}") + run_build(run_a) + print(f"Run B → {run_b}") + run_build(run_b) + + files_a = len(walk_files(run_a)) + files_b = len(walk_files(run_b)) + print(f"\nRun A produced {files_a} files; run B produced {files_b} files.") + + mismatches = compare(run_a, run_b) + + if not mismatches: + print(f"\nPASS: all {files_a} files hash identically across runs.") + print( + f"(manifest.json compared after stripping {MANIFEST_TIMESTAMP_FIELD}, " + "which is wall-clock by design.)" + ) + return 0 + + print(f"\nFAIL: {len(mismatches)} mismatch(es):") + for m in mismatches: + print(f" - {m}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) From 80ac580e260e3ecee2162fd9171812218a202896 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Mon, 4 May 2026 10:57:19 +0300 Subject: [PATCH 2/3] docs: mark hash determinism verification as done Verified via scripts/verify_hash_determinism.py: 73/73 files in the release bundle hash identically across two consecutive builds with the same seed/config (manifest.json compared after stripping its wall-clock generation_timestamp). 
Co-Authored-By: Claude Opus 4.7 --- .agent-plan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.agent-plan.md b/.agent-plan.md index a2779df..d3e309d 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -44,7 +44,7 @@ First public dataset release: `leadforge-b2b-lead-scoring`. Three difficulty tie - [x] Verify three tiers produce different conversion rates (intro 41.5%, intermediate 20.1%, advanced 7.9%) - [x] Update release/README.md — remove stale "Known limitations", add conversion rates to dataset summary - [x] Update release/HF_DATASET_CARD.md — add conversion rates to summary table -- [ ] Verify SHA-256 hash determinism (re-run build, compare hashes) +- [x] Verify SHA-256 hash determinism (re-run build, compare hashes) — `scripts/verify_hash_determinism.py`; 73/73 files identical across two `build_public_release.py` runs (modulo `manifest.json`'s wall-clock `generation_timestamp`) - [ ] Upload to Kaggle and HuggingFace - [ ] Announce From 48ad332baaee3a3c0d5c247fb60c30760c1ef4ad Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Mon, 4 May 2026 11:12:12 +0300 Subject: [PATCH 3/3] refactor: address senior-dev review of determinism verifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issues from review of PR #55, applied here: 1. Reuse existing infrastructure. The original script reimplemented tree-walk + hash compare locally despite leadforge.validation.invariants already exporting check_determinism. Extracted compare_bundle_trees() into the same module as a public, full-tree check (the existing check_determinism only inspects a hardcoded 3-file list). 2. Drop manifest-stripping hack in favour of timestamp pinning. WorldBundle.save() already accepts generation_timestamp=; build_public_release now exposes it as --generation-timestamp. The verifier pins it to the unix epoch on both runs, so manifest.json is byte-identical too — no special-casing required at compare time. 
compare_bundle_trees keeps a defence-in-depth fallback that strips NON_DETERMINISTIC_MANIFEST_FIELDS and re-dumps with sort_keys=True (so a pure key reordering is normalised away and not reported as a mismatch). 3. Single source of truth for non-deterministic fields. New constant NON_DETERMINISTIC_MANIFEST_FIELDS in leadforge/render/manifests.py; consumed by the invariants module. No more duplicated string literal. 4. Preserve artifacts on failure. Verifier now writes to release/_determinism/ (gitignored), wipes at start, cleans up only on PASS (unless --keep-on-success). On FAIL the dirs stay so the dev can diff the offending files. 5. Better failure diagnostics. compare_bundle_trees() reports byte-size delta on hash mismatches; manifest mismatches list which fields were stripped before comparison. 6. Self-tested. New TestCompareBundleTrees suite (8 tests) covers identical, only-in-A, only-in-B, hash mismatch, manifest timestamp-only diff, manifest real diff, manifest key reorder, and the nested-manifest.json edge case (only the top-level manifest is special-cased). 7. argparse on both scripts (--out, --keep-on-success on verifier; --generation-timestamp on build_public_release). Verifier still runs subprocess (intentional — the script's job is to test the build script end-to-end). The fast in-process determinism check that runs in CI on every PR continues to live in tests/validation/test_invariants.py::TestDeterminism. Result: PASS — 73/73 files identical across two pinned-timestamp runs; all 876 tests pass. 
Co-Authored-By: Claude Opus 4.7 --- .gitignore | 1 + leadforge/render/manifests.py | 5 + leadforge/validation/invariants.py | 65 ++++++++ scripts/build_public_release.py | 37 ++++- scripts/verify_hash_determinism.py | 221 +++++++++++++++------------- tests/validation/test_invariants.py | 115 ++++++++++++++- 6 files changed, 334 insertions(+), 110 deletions(-) diff --git a/.gitignore b/.gitignore index 8347321..385be89 100644 --- a/.gitignore +++ b/.gitignore @@ -216,3 +216,4 @@ release/intermediate/ release/advanced/ release/intermediate_instructor/ release/LICENSE +release/_determinism/ diff --git a/leadforge/render/manifests.py b/leadforge/render/manifests.py index 03d6201..d43fade 100644 --- a/leadforge/render/manifests.py +++ b/leadforge/render/manifests.py @@ -22,6 +22,11 @@ # Bump this whenever the bundle layout or manifest schema changes. BUNDLE_SCHEMA_VERSION = "2" +# Manifest fields whose value is non-deterministic by design (wall-clock, +# host metadata, etc.). Determinism checks must ignore these fields when +# comparing two bundles produced from the same (recipe, config, seed, version). +NON_DETERMINISTIC_MANIFEST_FIELDS: tuple[str, ...] = ("generation_timestamp",) + def build_manifest( config: GenerationConfig, diff --git a/leadforge/validation/invariants.py b/leadforge/validation/invariants.py index d06ac16..cb7caf1 100644 --- a/leadforge/validation/invariants.py +++ b/leadforge/validation/invariants.py @@ -9,9 +9,11 @@ from __future__ import annotations +import json from pathlib import Path from leadforge.core.hashing import file_sha256 +from leadforge.render.manifests import NON_DETERMINISTIC_MANIFEST_FIELDS def check_determinism(bundle_a: Path, bundle_b: Path) -> list[str]: @@ -60,6 +62,69 @@ def check_determinism(bundle_a: Path, bundle_b: Path) -> list[str]: return errors +def _manifest_payloads_match_modulo_non_deterministic(a: Path, b: Path) -> bool: + """Compare two manifest.json files after stripping non-deterministic fields. 
+ + Re-dumps both payloads with ``sort_keys=True`` so that a pure key + reordering does not count as a mismatch. + """ + payload_a = json.loads(a.read_text()) + payload_b = json.loads(b.read_text()) + for field in NON_DETERMINISTIC_MANIFEST_FIELDS: + payload_a.pop(field, None) + payload_b.pop(field, None) + return json.dumps(payload_a, sort_keys=True) == json.dumps(payload_b, sort_keys=True) + + +def compare_bundle_trees(bundle_a: Path, bundle_b: Path) -> list[str]: + """Full-tree byte-identical comparison of two bundle directories. + + Walks every file under both roots and reports: + + - files present in only one tree (``only in A:`` / ``only in B:``) + - files whose SHA-256 differs (``hash mismatch:``) + + The bundle ``manifest.json`` is special-cased: it carries + ``generation_timestamp`` (wall-clock UTC, set by ``build_manifest()``), + which is expected to differ across runs unless the caller pinned it. + For that one file, if the raw hashes differ, the function re-compares the + payload with non-deterministic fields stripped (see + :data:`NON_DETERMINISTIC_MANIFEST_FIELDS`). A mismatch *after* stripping + is still reported. + + Use this for release-time integration checks; for the fast in-process + determinism property used in CI, see :func:`check_determinism`. 
+ """ + errors: list[str] = [] + + files_a = {p.relative_to(bundle_a) for p in bundle_a.rglob("*") if p.is_file()} + files_b = {p.relative_to(bundle_b) for p in bundle_b.rglob("*") if p.is_file()} + + for rel in sorted(files_a - files_b): + errors.append(f"only in A: {rel}") + for rel in sorted(files_b - files_a): + errors.append(f"only in B: {rel}") + + for rel in sorted(files_a & files_b): + path_a = bundle_a / rel + path_b = bundle_b / rel + if file_sha256(path_a) == file_sha256(path_b): + continue + if rel.name == "manifest.json" and rel.parent == Path(): + if _manifest_payloads_match_modulo_non_deterministic(path_a, path_b): + continue + errors.append( + f"manifest payload mismatch (after stripping " + f"{list(NON_DETERMINISTIC_MANIFEST_FIELDS)}): {rel}" + ) + continue + size_a = path_a.stat().st_size + size_b = path_b.stat().st_size + errors.append(f"hash mismatch: {rel} (sizes: A={size_a}B, B={size_b}B)") + + return errors + + def check_exposure_monotonicity(student_bundle: Path, instructor_bundle: Path) -> list[str]: """Verify that student_public is a subset of research_instructor. diff --git a/scripts/build_public_release.py b/scripts/build_public_release.py index e348453..c4e72c2 100644 --- a/scripts/build_public_release.py +++ b/scripts/build_public_release.py @@ -2,7 +2,7 @@ """Build the public release bundles for Kaggle/HuggingFace. Usage: - python scripts/build_public_release.py [OUTPUT_DIR] + python scripts/build_public_release.py [OUTPUT_DIR] [--generation-timestamp ISO8601] Generates four bundles: - intro/ (student_public, intro difficulty) @@ -14,10 +14,16 @@ (lead_scoring.csv) merging train/valid/test with a ``split`` column. All bundles are validated with ``leadforge validate`` after generation. + +The ``--generation-timestamp`` flag pins ``manifest.generation_timestamp`` to a +caller-supplied ISO-8601 UTC string. 
This is the supported way to produce +byte-reproducible bundles (used by ``scripts/verify_hash_determinism.py``); +the released bundles always use the wall-clock default. """ from __future__ import annotations +import argparse import json import shutil import sys @@ -45,6 +51,7 @@ def generate_and_save( exposure_mode: str, difficulty: str, seed: int = SEED, + generation_timestamp: str | None = None, ) -> None: """Generate a bundle and write it to *out_dir*.""" gen = Generator.from_recipe( @@ -54,7 +61,7 @@ def generate_and_save( difficulty=difficulty, ) bundle = gen.generate() - bundle.save(str(out_dir)) + bundle.save(str(out_dir), generation_timestamp=generation_timestamp) # Columns to drop from the flat CSV convenience export. @@ -111,7 +118,24 @@ def print_summary(bundle_dir: Path, name: str) -> None: def main() -> None: - output_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("release") + parser = argparse.ArgumentParser(description=__doc__.split("\n", maxsplit=1)[0]) + parser.add_argument( + "output_dir", + nargs="?", + default="release", + help="Output directory (default: release/)", + ) + parser.add_argument( + "--generation-timestamp", + default=None, + help=( + "ISO-8601 UTC string to pin manifest.generation_timestamp. " + "Default: wall-clock now. Use this for reproducible bundles." 
+ ), + ) + args = parser.parse_args() + + output_root = Path(args.output_dir) output_root.mkdir(parents=True, exist_ok=True) # Copy LICENSE @@ -122,7 +146,12 @@ def main() -> None: for dir_name, exposure_mode, difficulty in BUNDLES: bundle_dir = output_root / dir_name print(f"Generating {dir_name} ({exposure_mode}, {difficulty})...", file=sys.stderr) - generate_and_save(bundle_dir, exposure_mode, difficulty) + generate_and_save( + bundle_dir, + exposure_mode, + difficulty, + generation_timestamp=args.generation_timestamp, + ) # Flat CSV for student_public bundles if exposure_mode == "student_public": diff --git a/scripts/verify_hash_determinism.py b/scripts/verify_hash_determinism.py index 78e0f9b..000f8ff 100755 --- a/scripts/verify_hash_determinism.py +++ b/scripts/verify_hash_determinism.py @@ -1,142 +1,153 @@ #!/usr/bin/env python3 """Verify SHA-256 hash determinism of the public release build. -Runs ``scripts/build_public_release.py`` twice into two temp directories with -the same seed/config and compares the SHA-256 digest of every generated file. +Runs ``scripts/build_public_release.py`` twice into two output directories with +the same seed/config and a *pinned* manifest timestamp, then asserts every +generated file hashes identically across runs. -The architectural invariant is that generation is deterministic given -``(recipe, config, seed, version)``. This script enforces that invariant on -the bundle layer: every file written under each bundle directory must hash -identically across runs. +Pinning ``--generation-timestamp`` on the build script means the resulting +``manifest.json`` is also byte-identical — no special-cased manifest stripping +needed at compare time. (For defence-in-depth, the underlying +:func:`leadforge.validation.invariants.compare_bundle_trees` still tolerates +a wall-clock-only manifest diff, but pinning is the supported workflow.) 
-Two practical exceptions are handled: +The architectural invariant being enforced is +"generation is deterministic given (recipe, config, seed, version)". +The corresponding fast in-process check lives in +``tests/validation/test_invariants.py::TestDeterminism`` and runs in CI on +every PR; this script is the slower release-time check that exercises the +full ``build_public_release.py`` pipeline. -1. ``manifest.json`` contains a ``generation_timestamp`` field set to - ``datetime.now(UTC)`` at write time, so the file bytes legitimately differ - between runs. The script strips that field and compares the remaining - manifest payload (which already includes per-file SHA-256 digests for the - relational and task Parquet files). +On failure, output directories are preserved (NOT auto-cleaned) so the +mismatching artifacts can be diffed directly. -2. ``LICENSE`` is copied from the repo root and is identical by construction; - it is hashed and compared like any other file. - -Exit code: 0 on PASS (all hashes match), 1 on FAIL. +Exit code: 0 on PASS, 1 on FAIL. Usage: - python scripts/verify_hash_determinism.py + python scripts/verify_hash_determinism.py [--out DIR] [--keep-on-success] """ from __future__ import annotations -import hashlib -import json +import argparse +import shutil import subprocess import sys -import tempfile from pathlib import Path +from leadforge.core.hashing import file_sha256 +from leadforge.validation.invariants import compare_bundle_trees + REPO_ROOT = Path(__file__).resolve().parent.parent BUILD_SCRIPT = REPO_ROOT / "scripts" / "build_public_release.py" -# Field stripped before comparing manifest payloads; differs by design between -# runs (set to wall-clock time in build_manifest()). 
-MANIFEST_TIMESTAMP_FIELD = "generation_timestamp" - - -def file_sha256(path: Path) -> str: - h = hashlib.sha256() - with path.open("rb") as fh: - for chunk in iter(lambda: fh.read(65536), b""): - h.update(chunk) - return h.hexdigest() - - -def walk_files(root: Path) -> list[Path]: - """Return all regular files under *root*, sorted by relative path.""" - return sorted(p for p in root.rglob("*") if p.is_file()) +# Pinned timestamp for both runs. Any fixed ISO-8601 UTC string works; using +# the unix epoch makes it obvious that it's a sentinel, not a real run time. +PINNED_TIMESTAMP = "1970-01-01T00:00:00+00:00" - -def hash_tree(root: Path) -> dict[str, str]: - """Map relative-path → SHA-256 for every file under *root*.""" - return {str(p.relative_to(root)): file_sha256(p) for p in walk_files(root)} - - -def manifest_payload_without_timestamp(path: Path) -> dict: - payload = json.loads(path.read_text()) - payload.pop(MANIFEST_TIMESTAMP_FIELD, None) - return payload +# Bundle subdirectories produced by build_public_release.py. Hardcoded here +# because the script's BUNDLES list is not exposed as a public API. If the +# build script grows new bundles, add them here. 
+BUNDLE_DIRS = ("intro", "intermediate", "advanced", "intermediate_instructor") def run_build(out_dir: Path) -> None: - cmd = [sys.executable, str(BUILD_SCRIPT), str(out_dir)] - print(f" $ {' '.join(cmd)}", flush=True) + cmd = [ + sys.executable, + str(BUILD_SCRIPT), + str(out_dir), + "--generation-timestamp", + PINNED_TIMESTAMP, + ] + print(f" $ {' '.join(cmd)}") subprocess.run(cmd, check=True, cwd=REPO_ROOT) # noqa: S603 -def compare(run_a: Path, run_b: Path) -> list[str]: - """Return a list of human-readable mismatch messages (empty == identical).""" - tree_a = hash_tree(run_a) - tree_b = hash_tree(run_b) - - mismatches: list[str] = [] - - only_a = sorted(set(tree_a) - set(tree_b)) - only_b = sorted(set(tree_b) - set(tree_a)) - for rel in only_a: - mismatches.append(f"only in run A: {rel}") - for rel in only_b: - mismatches.append(f"only in run B: {rel}") - - for rel in sorted(set(tree_a) & set(tree_b)): - if tree_a[rel] == tree_b[rel]: - continue - # manifest.json carries a wall-clock timestamp; compare the rest. - if Path(rel).name == "manifest.json": - payload_a = manifest_payload_without_timestamp(run_a / rel) - payload_b = manifest_payload_without_timestamp(run_b / rel) - if payload_a == payload_b: - continue - mismatches.append( - f"manifest payload mismatch (excluding {MANIFEST_TIMESTAMP_FIELD}): {rel}" - ) - continue - mismatches.append(f"hash mismatch: {rel}\n A={tree_a[rel]}\n B={tree_b[rel]}") - - return mismatches +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__.split("\n", maxsplit=1)[0]) + parser.add_argument( + "--out", + type=Path, + default=REPO_ROOT / "release" / "_determinism", + help="Base directory for both runs (will be wiped at start). 
" + "Default: release/_determinism/", + ) + parser.add_argument( + "--keep-on-success", + action="store_true", + help="Keep output directories even on PASS (default: clean up on PASS, " + "always preserve on FAIL).", + ) + return parser.parse_args() def main() -> int: + args = parse_args() + if not BUILD_SCRIPT.exists(): print(f"FAIL: build script not found at {BUILD_SCRIPT}", file=sys.stderr) return 1 - with tempfile.TemporaryDirectory(prefix="leadforge_determinism_") as tmp: - run_a = Path(tmp) / "run_a" - run_b = Path(tmp) / "run_b" - - print(f"Run A → {run_a}") - run_build(run_a) - print(f"Run B → {run_b}") - run_build(run_b) - - files_a = len(walk_files(run_a)) - files_b = len(walk_files(run_b)) - print(f"\nRun A produced {files_a} files; run B produced {files_b} files.") - - mismatches = compare(run_a, run_b) - - if not mismatches: - print(f"\nPASS: all {files_a} files hash identically across runs.") - print( - f"(manifest.json compared after stripping {MANIFEST_TIMESTAMP_FIELD}, " - "which is wall-clock by design.)" - ) - return 0 - - print(f"\nFAIL: {len(mismatches)} mismatch(es):") - for m in mismatches: - print(f" - {m}") - return 1 + base = args.out + run_a = base / "run_a" + run_b = base / "run_b" + + # Wipe and recreate. + if base.exists(): + shutil.rmtree(base) + base.mkdir(parents=True) + + print(f"Run A → {run_a}") + run_build(run_a) + print(f"Run B → {run_b}") + run_build(run_b) + + # Per-bundle comparison so error messages stay scoped to a single bundle. 
+ all_errors: list[tuple[str, list[str]]] = [] + total_files = 0 + for name in BUNDLE_DIRS: + bundle_a = run_a / name + bundle_b = run_b / name + if not bundle_a.exists() or not bundle_b.exists(): + all_errors.append((name, [f"bundle directory missing: {name}"])) + continue + errors = compare_bundle_trees(bundle_a, bundle_b) + bundle_files = sum(1 for p in bundle_a.rglob("*") if p.is_file()) + total_files += bundle_files + if errors: + all_errors.append((name, errors)) + + # Top-level files (LICENSE, etc.) — compare via hash directly. + top_a = {p.name for p in run_a.iterdir() if p.is_file()} + top_b = {p.name for p in run_b.iterdir() if p.is_file()} + top_errors: list[str] = [] + for name in sorted(top_a - top_b): + top_errors.append(f"top-level file only in A: {name}") + for name in sorted(top_b - top_a): + top_errors.append(f"top-level file only in B: {name}") + for name in sorted(top_a & top_b): + if file_sha256(run_a / name) != file_sha256(run_b / name): + top_errors.append(f"top-level hash mismatch: {name}") + total_files += len(top_a) + if top_errors: + all_errors.append(("", top_errors)) + + if not all_errors: + print(f"\nPASS: all {total_files} files hash identically across runs.") + if not args.keep_on_success: + shutil.rmtree(base) + print(f"(cleaned up {base})") + else: + print(f"(kept artifacts at {base})") + return 0 + + print(f"\nFAIL: mismatches in {len(all_errors)} bundle(s):") + for name, errors in all_errors: + print(f" [{name}]") + for e in errors: + print(f" - {e}") + print(f"\nArtifacts preserved for inspection:\n A: {run_a}\n B: {run_b}") + return 1 if __name__ == "__main__": diff --git a/tests/validation/test_invariants.py b/tests/validation/test_invariants.py index 3668ccf..078548c 100644 --- a/tests/validation/test_invariants.py +++ b/tests/validation/test_invariants.py @@ -2,12 +2,17 @@ from __future__ import annotations +import json from pathlib import Path import pytest from leadforge.api.generator import Generator -from 
leadforge.validation.invariants import check_determinism, check_exposure_monotonicity +from leadforge.validation.invariants import ( + check_determinism, + check_exposure_monotonicity, + compare_bundle_trees, +) _SMALL = {"n_leads": 20, "n_accounts": 10, "n_contacts": 30} @@ -80,3 +85,111 @@ def test_instructor_without_metadata_fails(self, exposure_bundles: tuple[Path, P # Student as "instructor" lacks metadata/ errors = check_exposure_monotonicity(student, student) assert any("missing metadata" in e for e in errors) + + +def _make_synthetic_bundle( + root: Path, + files: dict[str, str | bytes], + manifest: dict | None = None, +) -> Path: + """Write a fake bundle layout with the given files and optional manifest.""" + root.mkdir(parents=True, exist_ok=True) + if manifest is not None: + (root / "manifest.json").write_text(json.dumps(manifest, indent=2)) + for rel, content in files.items(): + path = root / rel + path.parent.mkdir(parents=True, exist_ok=True) + if isinstance(content, bytes): + path.write_bytes(content) + else: + path.write_text(content) + return root + + +class TestCompareBundleTrees: + """Synthetic-bundle unit tests for compare_bundle_trees. + + These avoid running the full generator so the verifier's logic is exercised + independently of generation determinism. Real end-to-end determinism is + covered by TestDeterminism above. 
+ """ + + def test_identical_trees_no_errors(self, tmp_path: Path) -> None: + a = _make_synthetic_bundle( + tmp_path / "a", + files={"tables/x.parquet": b"\x01\x02", "dataset_card.md": "hello"}, + ) + b = _make_synthetic_bundle( + tmp_path / "b", + files={"tables/x.parquet": b"\x01\x02", "dataset_card.md": "hello"}, + ) + assert compare_bundle_trees(a, b) == [] + + def test_only_in_a_reported(self, tmp_path: Path) -> None: + a = _make_synthetic_bundle( + tmp_path / "a", + files={"tables/x.parquet": b"x", "tables/extra.parquet": b"y"}, + ) + b = _make_synthetic_bundle(tmp_path / "b", files={"tables/x.parquet": b"x"}) + errors = compare_bundle_trees(a, b) + assert any("only in A" in e and "extra.parquet" in e for e in errors) + + def test_only_in_b_reported(self, tmp_path: Path) -> None: + a = _make_synthetic_bundle(tmp_path / "a", files={"tables/x.parquet": b"x"}) + b = _make_synthetic_bundle( + tmp_path / "b", + files={"tables/x.parquet": b"x", "metadata/world_spec.json": "{}"}, + ) + errors = compare_bundle_trees(a, b) + assert any("only in B" in e and "world_spec.json" in e for e in errors) + + def test_hash_mismatch_reported_with_sizes(self, tmp_path: Path) -> None: + a = _make_synthetic_bundle(tmp_path / "a", files={"tables/x.parquet": b"abc"}) + b = _make_synthetic_bundle(tmp_path / "b", files={"tables/x.parquet": b"abcd"}) + errors = compare_bundle_trees(a, b) + assert len(errors) == 1 + assert "hash mismatch" in errors[0] + assert "x.parquet" in errors[0] + assert "A=3B" in errors[0] + assert "B=4B" in errors[0] + + def test_manifest_only_timestamp_diff_passes(self, tmp_path: Path) -> None: + manifest_a = {"seed": 42, "generation_timestamp": "2026-01-01T00:00:00+00:00"} + manifest_b = {"seed": 42, "generation_timestamp": "2026-12-31T23:59:59+00:00"} + a = _make_synthetic_bundle(tmp_path / "a", files={}, manifest=manifest_a) + b = _make_synthetic_bundle(tmp_path / "b", files={}, manifest=manifest_b) + assert compare_bundle_trees(a, b) == [] + + def 
test_manifest_real_diff_reported(self, tmp_path: Path) -> None: + manifest_a = {"seed": 42, "generation_timestamp": "2026-01-01T00:00:00+00:00"} + manifest_b = {"seed": 43, "generation_timestamp": "2026-01-01T00:00:00+00:00"} + a = _make_synthetic_bundle(tmp_path / "a", files={}, manifest=manifest_a) + b = _make_synthetic_bundle(tmp_path / "b", files={}, manifest=manifest_b) + errors = compare_bundle_trees(a, b) + assert len(errors) == 1 + assert "manifest payload mismatch" in errors[0] + + def test_manifest_key_reorder_only_passes(self, tmp_path: Path) -> None: + # Same logical payload, different on-disk key order — must NOT be flagged + # as a mismatch. (json.dumps with sort_keys=True normalises both sides.) + a_root = tmp_path / "a" + b_root = tmp_path / "b" + a_root.mkdir() + b_root.mkdir() + (a_root / "manifest.json").write_text(json.dumps({"seed": 42, "n_leads": 100}, indent=2)) + (b_root / "manifest.json").write_text(json.dumps({"n_leads": 100, "seed": 42}, indent=2)) + assert compare_bundle_trees(a_root, b_root) == [] + + def test_nested_manifest_not_special_cased(self, tmp_path: Path) -> None: + # Only the top-level bundle manifest.json gets timestamp-stripping. + # A file named manifest.json deeper in the tree is compared byte-for-byte. + a = _make_synthetic_bundle( + tmp_path / "a", + files={"tasks/foo/manifest.json": '{"generation_timestamp": "T1"}'}, + ) + b = _make_synthetic_bundle( + tmp_path / "b", + files={"tasks/foo/manifest.json": '{"generation_timestamp": "T2"}'}, + ) + errors = compare_bundle_trees(a, b) + assert any("hash mismatch" in e for e in errors)