Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .agent-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ First public dataset release: `leadforge-b2b-lead-scoring`. Three difficulty tie
- [x] Verify three tiers produce different conversion rates (intro 41.5%, intermediate 20.1%, advanced 7.9%)
- [x] Update release/README.md — remove stale "Known limitations", add conversion rates to dataset summary
- [x] Update release/HF_DATASET_CARD.md — add conversion rates to summary table
- [ ] Verify SHA-256 hash determinism (re-run build, compare hashes)
- [x] Verify SHA-256 hash determinism (re-run build, compare hashes) — `scripts/verify_hash_determinism.py`; 73/73 files identical across two `build_public_release.py` runs (modulo `manifest.json`'s wall-clock `generation_timestamp`)
- [ ] Upload to Kaggle and HuggingFace
- [ ] Announce

Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,4 @@ release/intermediate/
release/advanced/
release/intermediate_instructor/
release/LICENSE
release/_determinism/
5 changes: 5 additions & 0 deletions leadforge/render/manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@
# Bump this whenever the bundle layout or manifest schema changes.
BUNDLE_SCHEMA_VERSION = "2"

# Manifest fields whose value is non-deterministic by design (wall-clock,
# host metadata, etc.). Determinism checks must ignore these fields when
# comparing two bundles produced from the same (recipe, config, seed, version).
NON_DETERMINISTIC_MANIFEST_FIELDS: tuple[str, ...] = ("generation_timestamp",)


def build_manifest(
config: GenerationConfig,
Expand Down
65 changes: 65 additions & 0 deletions leadforge/validation/invariants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@

from __future__ import annotations

import json
from pathlib import Path

from leadforge.core.hashing import file_sha256
from leadforge.render.manifests import NON_DETERMINISTIC_MANIFEST_FIELDS


def check_determinism(bundle_a: Path, bundle_b: Path) -> list[str]:
Expand Down Expand Up @@ -60,6 +62,69 @@ def check_determinism(bundle_a: Path, bundle_b: Path) -> list[str]:
return errors


def _manifest_payloads_match_modulo_non_deterministic(a: Path, b: Path) -> bool:
"""Compare two manifest.json files after stripping non-deterministic fields.

Re-dumps both payloads with ``sort_keys=True`` so a key reordering still
counts as a mismatch.
"""
payload_a = json.loads(a.read_text())
payload_b = json.loads(b.read_text())
for field in NON_DETERMINISTIC_MANIFEST_FIELDS:
payload_a.pop(field, None)
payload_b.pop(field, None)
return json.dumps(payload_a, sort_keys=True) == json.dumps(payload_b, sort_keys=True)


def compare_bundle_trees(bundle_a: Path, bundle_b: Path) -> list[str]:
    """Full-tree byte-identical comparison of two bundle directories.

    Walks every file under both roots and reports:

    - a root that is not an existing directory (``bundle root is not a
      directory``); the comparison stops there, because an absent tree would
      otherwise look like an empty one
    - files present in only one tree (``only in A:`` / ``only in B:``)
    - files whose SHA-256 differs (``hash mismatch:``)

    The bundle ``manifest.json`` is special-cased: it carries
    ``generation_timestamp`` (wall-clock UTC, set by ``build_manifest()``),
    which is expected to differ across runs unless the caller pinned it.
    For that one file, if the raw hashes differ, the function re-compares the
    payload with non-deterministic fields stripped (see
    :data:`NON_DETERMINISTIC_MANIFEST_FIELDS`).  A mismatch *after* stripping
    is still reported, and a manifest that cannot be parsed is reported as
    ``manifest unreadable:`` rather than aborting the whole comparison.

    Use this for release-time integration checks; for the fast in-process
    determinism property used in CI, see :func:`check_determinism`.

    Args:
        bundle_a: Root directory of the first bundle ("A" in messages).
        bundle_b: Root directory of the second bundle ("B" in messages).

    Returns:
        Human-readable error strings; an empty list means the trees match.
    """
    errors: list[str] = []

    # ``Path.rglob`` yields nothing for a missing or non-directory root, so
    # without this guard two absent bundles would compare as "identical".
    for label, root in (("A", bundle_a), ("B", bundle_b)):
        if not root.is_dir():
            errors.append(f"bundle root is not a directory ({label}): {root}")
    if errors:
        return errors

    files_a = {p.relative_to(bundle_a) for p in bundle_a.rglob("*") if p.is_file()}
    files_b = {p.relative_to(bundle_b) for p in bundle_b.rglob("*") if p.is_file()}

    errors.extend(f"only in A: {rel}" for rel in sorted(files_a - files_b))
    errors.extend(f"only in B: {rel}" for rel in sorted(files_b - files_a))

    for rel in sorted(files_a & files_b):
        path_a = bundle_a / rel
        path_b = bundle_b / rel
        if file_sha256(path_a) == file_sha256(path_b):
            continue

        # Only the manifest at the bundle root gets the tolerant comparison.
        if rel.name == "manifest.json" and rel.parent == Path():
            try:
                matches = _manifest_payloads_match_modulo_non_deterministic(path_a, path_b)
            except ValueError as exc:
                # Invalid JSON / undecodable bytes: report it and keep going
                # so the errors collected for other files are not lost.
                errors.append(f"manifest unreadable: {rel} ({exc})")
                continue
            if not matches:
                errors.append(
                    f"manifest payload mismatch (after stripping "
                    f"{list(NON_DETERMINISTIC_MANIFEST_FIELDS)}): {rel}"
                )
            continue

        size_a = path_a.stat().st_size
        size_b = path_b.stat().st_size
        errors.append(f"hash mismatch: {rel} (sizes: A={size_a}B, B={size_b}B)")

    return errors


def check_exposure_monotonicity(student_bundle: Path, instructor_bundle: Path) -> list[str]:
"""Verify that student_public is a subset of research_instructor.

Expand Down
37 changes: 33 additions & 4 deletions scripts/build_public_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""Build the public release bundles for Kaggle/HuggingFace.

Usage:
python scripts/build_public_release.py [OUTPUT_DIR]
python scripts/build_public_release.py [OUTPUT_DIR] [--generation-timestamp ISO8601]

Generates four bundles:
- intro/ (student_public, intro difficulty)
Expand All @@ -14,10 +14,16 @@
(lead_scoring.csv) merging train/valid/test with a ``split`` column.

All bundles are validated with ``leadforge validate`` after generation.

The ``--generation-timestamp`` flag pins ``manifest.generation_timestamp`` to a
caller-supplied ISO-8601 UTC string. This is the supported way to produce
byte-reproducible bundles (used by ``scripts/verify_hash_determinism.py``);
the released bundles always use the wall-clock default.
"""

from __future__ import annotations

import argparse
import json
import shutil
import sys
Expand Down Expand Up @@ -45,6 +51,7 @@ def generate_and_save(
exposure_mode: str,
difficulty: str,
seed: int = SEED,
generation_timestamp: str | None = None,
) -> None:
"""Generate a bundle and write it to *out_dir*."""
gen = Generator.from_recipe(
Expand All @@ -54,7 +61,7 @@ def generate_and_save(
difficulty=difficulty,
)
bundle = gen.generate()
bundle.save(str(out_dir))
bundle.save(str(out_dir), generation_timestamp=generation_timestamp)


# Columns to drop from the flat CSV convenience export.
Expand Down Expand Up @@ -111,7 +118,24 @@ def print_summary(bundle_dir: Path, name: str) -> None:


def main() -> None:
output_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("release")
parser = argparse.ArgumentParser(description=__doc__.split("\n", maxsplit=1)[0])
parser.add_argument(
"output_dir",
nargs="?",
default="release",
help="Output directory (default: release/)",
)
parser.add_argument(
"--generation-timestamp",
default=None,
help=(
"ISO-8601 UTC string to pin manifest.generation_timestamp. "
"Default: wall-clock now. Use this for reproducible bundles."
),
)
args = parser.parse_args()

output_root = Path(args.output_dir)
output_root.mkdir(parents=True, exist_ok=True)

# Copy LICENSE
Expand All @@ -122,7 +146,12 @@ def main() -> None:
for dir_name, exposure_mode, difficulty in BUNDLES:
bundle_dir = output_root / dir_name
print(f"Generating {dir_name} ({exposure_mode}, {difficulty})...", file=sys.stderr)
generate_and_save(bundle_dir, exposure_mode, difficulty)
generate_and_save(
bundle_dir,
exposure_mode,
difficulty,
generation_timestamp=args.generation_timestamp,
)

# Flat CSV for student_public bundles
if exposure_mode == "student_public":
Expand Down
154 changes: 154 additions & 0 deletions scripts/verify_hash_determinism.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""Verify SHA-256 hash determinism of the public release build.

Runs ``scripts/build_public_release.py`` twice into two output directories with
the same seed/config and a *pinned* manifest timestamp, then asserts every
generated file hashes identically across runs.

Pinning ``--generation-timestamp`` on the build script means the resulting
``manifest.json`` is also byte-identical — no special-cased manifest stripping
needed at compare time. (For defence-in-depth, the underlying
:func:`leadforge.validation.invariants.compare_bundle_trees` still tolerates
a wall-clock-only manifest diff, but pinning is the supported workflow.)

The architectural invariant being enforced is
"generation is deterministic given (recipe, config, seed, version)".
The corresponding fast in-process check lives in
``tests/validation/test_invariants.py::TestDeterminism`` and runs in CI on
every PR; this script is the slower release-time check that exercises the
full ``build_public_release.py`` pipeline.

On failure, output directories are preserved (NOT auto-cleaned) so the
mismatching artifacts can be diffed directly.

Exit code: 0 on PASS, 1 on FAIL.

Usage:
python scripts/verify_hash_determinism.py [--out DIR] [--keep-on-success]
"""

from __future__ import annotations

import argparse
import shutil
import subprocess
import sys
from pathlib import Path

from leadforge.core.hashing import file_sha256
from leadforge.validation.invariants import compare_bundle_trees

# Repository root: the parent of the directory that contains this script
# (computed from the script's resolved path, so it is independent of the
# current working directory).
REPO_ROOT = Path(__file__).resolve().parent.parent
# The release build script that this check invokes as a subprocess.
BUILD_SCRIPT = REPO_ROOT / "scripts" / "build_public_release.py"

# Pinned timestamp for both runs. Any fixed ISO-8601 UTC string works; using
# the unix epoch makes it obvious that it's a sentinel, not a real run time.
PINNED_TIMESTAMP = "1970-01-01T00:00:00+00:00"

# Bundle subdirectories produced by build_public_release.py. Hardcoded here
# because the script's BUNDLES list is not exposed as a public API. If the
# build script grows new bundles, add them here.
BUNDLE_DIRS = ("intro", "intermediate", "advanced", "intermediate_instructor")


def run_build(out_dir: Path) -> None:
    """Run the release build script once, writing its output to *out_dir*.

    The build is launched with the current interpreter, with the repository
    root as working directory and ``--generation-timestamp`` pinned to
    ``PINNED_TIMESTAMP``.  The command line is echoed to stdout first.

    Args:
        out_dir: Output directory passed to the build script.

    Raises:
        subprocess.CalledProcessError: If the build script exits non-zero.
    """
    import shlex  # local import: only needed to echo the command

    cmd = [
        sys.executable,
        str(BUILD_SCRIPT),
        str(out_dir),
        "--generation-timestamp",
        PINNED_TIMESTAMP,
    ]
    # shlex.join quotes arguments that contain spaces or shell metacharacters,
    # so the echoed line reflects the real argument boundaries.
    print(f" $ {shlex.join(cmd)}")
    subprocess.run(cmd, check=True, cwd=REPO_ROOT)  # noqa: S603


def parse_args() -> argparse.Namespace:
    """Parse this script's command-line options.

    Returns:
        Namespace with ``out`` (a ``Path``: base directory holding both runs)
        and ``keep_on_success`` (``bool``).
    """
    # The first line of the module docstring doubles as the CLI description.
    summary_line = __doc__.partition("\n")[0]
    cli = argparse.ArgumentParser(description=summary_line)

    default_out = REPO_ROOT / "release" / "_determinism"
    cli.add_argument(
        "--out",
        type=Path,
        default=default_out,
        help=(
            "Base directory for both runs (will be wiped at start). "
            "Default: release/_determinism/"
        ),
    )
    cli.add_argument(
        "--keep-on-success",
        action="store_true",
        help=(
            "Keep output directories even on PASS (default: clean up on PASS, "
            "always preserve on FAIL)."
        ),
    )

    return cli.parse_args()


def main() -> int:
    """Build the release twice and compare every generated file.

    Returns:
        ``0`` when all compared files match (PASS); ``1`` when any mismatch is
        found (FAIL) or when the build script cannot be located.
    """
    args = parse_args()

    if not BUILD_SCRIPT.exists():
        print(f"FAIL: build script not found at {BUILD_SCRIPT}", file=sys.stderr)
        return 1

    base = args.out
    run_a, run_b = base / "run_a", base / "run_b"

    # Start from an empty base directory so stale artifacts cannot leak in.
    if base.exists():
        shutil.rmtree(base)
    base.mkdir(parents=True)

    for label, target in (("A", run_a), ("B", run_b)):
        print(f"Run {label} → {target}")
        run_build(target)

    failures: list[tuple[str, list[str]]] = []
    file_count = 0

    # Compare bundle by bundle so each reported error is scoped to one bundle.
    for bundle_name in BUNDLE_DIRS:
        dir_a = run_a / bundle_name
        dir_b = run_b / bundle_name
        if not (dir_a.exists() and dir_b.exists()):
            failures.append((bundle_name, [f"bundle directory missing: {bundle_name}"]))
            continue
        bundle_errors = compare_bundle_trees(dir_a, dir_b)
        # The reported total counts the files found in run A's copy.
        file_count += len([p for p in dir_a.rglob("*") if p.is_file()])
        if bundle_errors:
            failures.append((bundle_name, bundle_errors))

    # Files directly under each run root (LICENSE, etc.) are hashed directly.
    names_a = {entry.name for entry in run_a.iterdir() if entry.is_file()}
    names_b = {entry.name for entry in run_b.iterdir() if entry.is_file()}
    top_level_errors = [f"top-level file only in A: {n}" for n in sorted(names_a - names_b)]
    top_level_errors += [f"top-level file only in B: {n}" for n in sorted(names_b - names_a)]
    top_level_errors += [
        f"top-level hash mismatch: {n}"
        for n in sorted(names_a & names_b)
        if file_sha256(run_a / n) != file_sha256(run_b / n)
    ]
    file_count += len(names_a)
    if top_level_errors:
        failures.append(("<top-level>", top_level_errors))

    if failures:
        print(f"\nFAIL: mismatches in {len(failures)} bundle(s):")
        for section, messages in failures:
            print(f" [{section}]")
            for message in messages:
                print(f" - {message}")
        # Nothing is deleted on failure so the two runs can be diffed by hand.
        print(f"\nArtifacts preserved for inspection:\n A: {run_a}\n B: {run_b}")
        return 1

    print(f"\nPASS: all {file_count} files hash identically across runs.")
    if args.keep_on_success:
        print(f"(kept artifacts at {base})")
    else:
        shutil.rmtree(base)
        print(f"(cleaned up {base})")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Loading
Loading