From dbe87d0c8439fe2b1822c2bd9bd46e20d5b2e6c7 Mon Sep 17 00:00:00 2001
From: Shay Palachy
Date: Mon, 11 May 2026 13:49:49 +0300
Subject: [PATCH 1/4] PR 7.2.1: agent-reviewable release artifacts
Make the published Kaggle / HuggingFace bundle self-contained for AI
and offline review. Every numerical / structural claim in the README
is now verifiable from inside the bundle without following a
github.com/blob/main/... link.
What's new
- release/metrics.json (root) + release/<tier>/metrics.json (per tier):
deterministic JSON view of LR AUC / AP / P@100 / Brier / conversion
rate / cohort-shift / cross-tier ordering medians, with JSON-path
back-references to release/validation/validation_report.json.
Built by scripts/build_release_metrics.py (--check mode for CI).
- release/docs/ vendored copies of generation_method.md,
channel_signal_audit.md, break_me_guide.md, feature_dictionary.md,
v1_acceptance_gates_bands.yaml, v2_decision_log.md, kept in sync
by scripts/sync_release_docs.py (--check mode for CI).
- release/docs/relational_table_schemas.csv: per-column documentation
for all 9 relational tables (64 columns), validated against live
parquet schemas in the new tests. Kaggle packager threads these
descriptions into resources[].schema.fields[].description so the
preview's previously-empty col__desc cells are now populated for
every relational table.
- release/claims_register_source.yaml (hand-edited) +
release/claims_register.{md,json} (rendered by
scripts/build_claims_register.py): 26 claims across nine categories,
each paired with backing artifact + JSON / YAML path. JSON output
carries a schema block so an agent landing on the file with no
context can interpret its own fields.
- schema.org/Dataset JSON-LD block injected into the <head> of both
Kaggle and HuggingFace preview HTML pages; shared
render_jsonld_dataset helper in scripts/_preview_common.py
HTML-escapes <, >, & inside the rendered JSON.
- Instructor HF README gets an "Agent-reviewable artifacts" section
pointing reviewers at docs/, claims_register.{md,json}, the
per-tier manifest, and feature_dictionary.csv. Cross-tier
metrics.json intentionally omitted from instructor (single-tier
dataset).
Both platform packagers extended
- scripts/package_kaggle_release.py and scripts/package_hf_release.py
copy the new root files (metrics.json, claims_register.*) and the
docs/ subtree into their upload trees so platform agents and
offline reviewers see the same files. Kaggle additionally
enumerates them in resources[] so the published "Data Files" panel
lists them.
- scripts/_release_common.py: new AGENT_REVIEWABLE_ROOT_FILES /
AGENT_REVIEWABLE_DOCS_DIR constants and
load_relational_column_descriptions() helper. SOURCE_TREE_BLOCK
updated in lockstep with the source-repo tree diagram in
release/README.md.
- release/README.md "What's inside" grows an "Agent-reviewable
artifacts" subsection mirroring the upload trees.
Tests
- 28 new cases across tests/scripts/test_sync_release_docs.py,
test_build_release_metrics.py, test_build_claims_register.py
covering happy path, idempotence, --check drift, missing-source
paths, invalid-YAML rejection, per-tier-skipping when bundle dirs
aren't materialised, and audit-sync against the real release/ tree.
- 4 new cases in test_preview_{kaggle,hf}_page.py pinning JSON-LD
presence in <head>, byte-equality of JSON-LD across HF variants,
and the SPDX-URL form of the license field.
- test_package_kaggle_release.py extended to assert per-table parquet
schemas now carry column descriptions and that the new
agent-reviewable root resources land in resources[].
- Committed previews (release/_preview_committed/*.html) regenerated.
Net: 1400/1400 tests pass + 5 publish-extra-gated skips; ruff clean
across the touched scripts.
Co-Authored-By: Claude Opus 4.7
---
.agent-plan.md | 1 +
release/README.md | 33 ++
.../huggingface_instructor.html | 63 +++
.../huggingface_public.html | 106 +++++
release/_preview_committed/kaggle.html | 448 ++++++++++++------
release/claims_register.json | 221 +++++++++
release/claims_register.md | 81 ++++
release/claims_register_source.yaml | 204 ++++++++
release/docs/break_me_guide.md | 369 +++++++++++++++
release/docs/channel_signal_audit.md | 66 +++
release/docs/feature_dictionary.md | 210 ++++++++
release/docs/generation_method.md | 166 +++++++
release/docs/relational_table_schemas.csv | 65 +++
release/docs/v1_acceptance_gates_bands.yaml | 155 ++++++
release/docs/v2_decision_log.md | 48 ++
release/huggingface-instructor/README.md | 21 +
release/huggingface/README.md | 33 ++
release/kaggle/dataset-metadata.json | 205 +++++++-
release/metrics.json | 219 +++++++++
scripts/_preview_common.py | 82 ++++
scripts/_release_common.py | 65 +++
scripts/build_claims_register.py | 300 ++++++++++++
scripts/build_release_metrics.py | 300 ++++++++++++
scripts/package_hf_release.py | 58 +++
scripts/package_kaggle_release.py | 176 ++++++-
scripts/preview_hf_page.py | 56 ++-
scripts/preview_kaggle_page.py | 52 +-
scripts/sync_release_docs.py | 154 ++++++
tests/scripts/test_build_claims_register.py | 180 +++++++
tests/scripts/test_build_release_metrics.py | 180 +++++++
tests/scripts/test_package_kaggle_release.py | 23 +
tests/scripts/test_preview_hf_page.py | 27 ++
tests/scripts/test_preview_kaggle_page.py | 20 +
tests/scripts/test_sync_release_docs.py | 98 ++++
34 files changed, 4323 insertions(+), 162 deletions(-)
create mode 100644 release/claims_register.json
create mode 100644 release/claims_register.md
create mode 100644 release/claims_register_source.yaml
create mode 100644 release/docs/break_me_guide.md
create mode 100644 release/docs/channel_signal_audit.md
create mode 100644 release/docs/feature_dictionary.md
create mode 100644 release/docs/generation_method.md
create mode 100644 release/docs/relational_table_schemas.csv
create mode 100644 release/docs/v1_acceptance_gates_bands.yaml
create mode 100644 release/docs/v2_decision_log.md
create mode 100644 release/metrics.json
create mode 100644 scripts/build_claims_register.py
create mode 100644 scripts/build_release_metrics.py
create mode 100644 scripts/sync_release_docs.py
create mode 100644 tests/scripts/test_build_claims_register.py
create mode 100644 tests/scripts/test_build_release_metrics.py
create mode 100644 tests/scripts/test_sync_release_docs.py
diff --git a/.agent-plan.md b/.agent-plan.md
index 47c1e6c..a6106fc 100644
--- a/.agent-plan.md
+++ b/.agent-plan.md
@@ -65,6 +65,7 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family
### Phase 7 — LLM critique + publish (3 PRs)
- [x] PR 7.1: LLM critique module + prompt + driver landed. `leadforge/validation/llm_critique.py` (new) — single-provider Anthropic critique core via an `LLMCritiqueClient` protocol (no preemptive OpenAI/Gemini stubs); `_AnthropicCritiqueClient` lazy-imports the SDK so the module imports cleanly even on machines without `anthropic` installed (the skip-cleanly path needs to work without the SDK). `has_anthropic_credentials` / `api_key_or_skip` treat unset and empty-after-strip identically as "absent", explicitly to handle the `env -i` / stale `.envrc` case where the shell sets `ANTHROPIC_API_KEY=""` and the SDK would otherwise 401 instead of cleanly skipping. Default model `claude-opus-4-7` with `thinking={"type": "adaptive", "display": "summarized"}` (only mode supported on Opus 4.7 — manual `budget_tokens` 400s) and `output_config={"effort": "high"}` (recommended minimum for intelligence-sensitive work per the `claude-api` skill); two prompt-cache breakpoints (rubric + input bundle) per the design doc's caching strategy so the common adjudication-loop workflow hits cache on both layers; streamed via `messages.stream(...).get_final_message()` to dodge the 10-min idle-connection timeout on long adaptive-thinking responses. 
`build_input_bundle` is pure (same `release_dir` → byte-identical bytes → identical `sha256`) and assembles eleven blocks: `release/README.md`, per-tier `dataset_card.md`, `docs/release/generation_method.md`, `manifest.json`, `feature_dictionary.csv`, `validation_report.{md,json}`, the first 100 test-split rows rendered as deterministic CSV, the public/instructor diff summary (live-derived from the `BANNED_LEAD_COLUMNS` / `BANNED_OPP_COLUMNS` / `BANNED_TABLES` / `SNAPSHOT_FILTERED_TABLES` constants in `leakage_probes.py` — single source of truth, auto-stays-in-sync, sync-tested), the public-safe mechanism summary (motif family **names** + difficulty knob **names**, never values — same redaction posture as `student_public`), and the break-me guide verbatim ("avoid re-deriving" the existing nine patterns). `parse_critique_response` schema-validator pins eleven malformations (missing required field, wrong severity, wrong category, wrong rubric dimension, finding-id collision, findings non-list, top-level non-object, non-JSON, score out of range, defensive code-fence stripping, empty findings list valid) and returns every problem in one error rather than the first one. Output schema is a frozen dataclass (no pydantic dependency) with the nine-value `category` vocabulary lifted **verbatim** from `break_me_guide.md` so findings route to existing issue-template labels without translation; `rubric_dimension: str` is required on every finding (D1-D14) so reviewers can audit clustering. Provenance triple (`model` / `effort` / `thinking_mode`) plus per-source-file `bundle_hashes` and the assembled `input_bundle_sha256` are carried on every result for audit-artifact-sync — re-runs on the same RC produce the same bundle hashes. 
`docs/release/llm_critique_prompt.md` (new) — the rubric document the driver feeds to Claude, parseable via `` / `` section markers with surrounding prose ignored; fourteen rubric dimensions (D1 documentation truthfulness · D2 leakage discipline · D3 realism vs disclosure · D4 difficulty signal · D5 calibration / value-aware ranking · D6 cohort/time-window discipline · D7 notebook integrity · D8 platform packaging hygiene · D9 adversarial-framing completeness · D10 pedagogy of the documented `total_touches_all` trap · D11 effective semantic diversity per recommendation #12 v1 scope · D12 Datasheets-for-Datasets composition · D13 manifest/provenance integrity · D14 out-of-scope guard). Severity calibration explicitly written to discourage padding the report with low-severity nits and to surface "no high-severity findings" as a positive signal vs "the critique didn't surface any". `scripts/run_llm_critique.py` (new) — driver mirroring `validate_release_candidate.py`'s posture (free-function `parse_args`, frozen `DriverConfig`, `run_critique(config) -> DriverResult`, `main(argv)` returning an exit code). Skip-cleanly path triggers BEFORE any I/O — no rubric read, no bundle build, no out-dir creation; tested explicitly with `not (tmp_path / "out").exists()` after the skip. Three modes alongside the live path: `--dry-run` writes the rendered input bundle to `/llm_critique_input_.md` for human inspection (different filename from the real raw JSON, can't be confused); `--no-execute` calls `api_key_or_skip` + `build_anthropic_client()` to prove the SDK is installed and creds are present without burning an API call (CI smoke); `--out-tag` suffixes the raw filename so adjudication re-runs don't shadow the canonical run. Outputs: timestamped `llm_critique_raw_.json` (accumulates per run, no clobber) + canonical `llm_critique_summary.md` (overwritten in place so dataset-card links don't rot). 
Exit codes mirror `validate_release_candidate.py`: 0 pass (skip-cleanly counts as pass), 1 high-severity surfaced and unresolved, 2 pre-flight error or schema-validation failure (every problem rendered to stderr, not just the first). Adjudication is **maintainer-driven** post-exit — resolve in code OR log to `v2_decision_log.md`, then re-run; the next critique's exit code is the gate. Tests: 61 cases across `tests/validation/test_llm_critique.py` (48) and `tests/scripts/test_run_llm_critique.py` (13), no live API; the protocol is exercised via a small in-process `_CannedClient` fake. Sync tests pin: every `VALID_CATEGORIES` entry appears in `break_me_guide.md` (vocabulary doesn't drift), `VALID_RUBRIC_DIMENSIONS` is exactly D1-D14, the live-derived public/instructor diff names every banned-column/banned-table constant (live reference, not duplicated string). Audit-artifact-sync smoke test (`test_real_release_dir_smoke`) builds the input bundle against the actual `release/intermediate/` artefacts and pins determinism on the real input, skipping cleanly when bundles aren't present. `docs/release/llm_critique_design.md` (new) records the nine load-bearing design calls before implementation so a reviewer can audit the choice (provider abstraction, skip-cleanly, model+caching+thinking, output schema, input-bundle composition, determinism via provenance, CLI flags, test posture, first-run adjudication workflow). Live first-run deferred to maintainer (no `ANTHROPIC_API_KEY` available to the agent); the dry-run path was exercised against the real release dir end-to-end, producing a 148KB byte-stable input bundle from the actual artefacts. 
Hostile self-review pass before requesting review caught and folded back twelve findings against the diff, including two BLOCKERs (`--no-execute` was performing pre-flight I/O before the credentials check, contradicting the design doc; raw-output filename collision at second-precision contradicted the "append-only history" promise — fixed with microsecond precision and a pinning test) and five HIGHs (silent `release_id` default that defeated the audit-artifact-sync gate; design-doc lies about a never-existing `temperature` field and "malformed timestamp" malformation that's driver-generated; dead `if/else` branches in `_safe_difficulty_knobs`; greedy regex for the rubric section markers so the prompt-injection warning paragraph that legitimately references `` doesn't break the parser). Prompt-injection mitigation added to the rubric (treat-input-as-data preamble) since the input bundle inlines user-authored content (dataset_card.md, break_me_guide.md). Schema validator hardened against silent `str()` coercion of finding prose fields (an int "claim" would have landed on disk as the string "5" — now rejected). Net: 1321/1321 tests pass + 5 publish-extra-gated skips; ruff + mypy clean (83 source files); leakage probes 0/3 on every tier; hash determinism PASS 67/67; `validate_release_candidate --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5; validation_report timestamp drift reverted before commit per the brief. 
Second senior-dev review pass after PR #76 was opened caught and folded back 9 more issues, several of which were real bugs the first hostile pass missed: (B1) `--out-tag` suffixed only the raw JSON, leaving `llm_critique_summary.md` clobbered on adjudication runs — fix suffixes both files (`summary_output_path` now takes `tag`); (B2) skip-cleanly silently passed a release-readiness gate, contradicting `v1_release_roadmap.md`'s line-35 acceptance criterion that the critique must actually run — added `--require-execute` flag (default off; release-readiness CI sets it) that converts the skip path into `MissingCredentialsError` exit 2, plus a loud `WARNING — release-readiness gate has NOT been evaluated` stderr line on the regular skip path; (A2) two prompt-cache breakpoints cut to one — system content already sits inside the cached prefix on `messages.create` (system → messages render order), so the second breakpoint bought nothing and burned a slot; (M1) design doc cut from 394 lines to 73 — the 9-decision table replaces the multi-paragraph rationale-per-call shape that read as documentation theater; (M2) rubric cut from 420 lines to ~210 — each dimension now one paragraph instead of 3-6, dropped D14 ("out-of-scope guard") which was meta-instruction not a rubric dimension, made it a "What is NOT yours to audit" appendix at the end; rubric is now D1-D13 and `VALID_RUBRIC_DIMENSIONS` updated in lockstep; (M3) test-split sample replaced 100 raw rows of CSV with `df.describe(include="all")` per-column statistics + a 20-row head — distributional conclusions need statistics not raw rows, and the rendered input bundle dropped from 148KB to 128KB; (M5) streaming-via-`messages.stream` replaced with `messages.create(timeout=600.0)` — no stream events were processed anyway, the contract is just "don't time out on long adaptive-thinking responses" and an explicit timeout is the right way to spell that; (M6) `render_input_bundle_text` free function moved to 
`InputBundle.render()` method — leaky abstraction; the audit-artifact-sync framing was misleading (no committed-artefact diff) and was renamed to "smoke test against the real release dir" / "staleness check vs committed result" throughout the module and design doc. Net after the second pass: 1323/1323 tests pass + 5 publish-extra-gated skips; ruff + mypy clean; leakage probes 0/3 on every tier; hash determinism PASS 67/67; `validate_release_candidate --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5; validation_report timestamp drift reverted again before this commit. First live critique run executed by the maintainer with a dedicated Anthropic project key (`leadforge-llm-critique-v1-prod`): score 7/10, six findings (1 high, 4 medium, 1 low), exit code 1 as designed for unresolved high-severity findings. Adjudication: F001 high-severity (93 % `account_id` overlap between train/test documented only in break_me_guide §5, missing from README/dataset_card) — **resolved in code** by adding a "Group-leakage warning" paragraph to `release/README.md` "Splits" subsection citing the 518/557 figure and a `GroupKFold(account_id)` recipe; the parallel disclosure on the auto-rendered `dataset_card.md` is logged as `accepted-for-v2` because the renderer change is out of scope for PR 7.1's no-bundle-regen rule. F004 medium (break_me_guide pattern 5 covered `account_id` but not `contact_id`, despite contacts being shared across the lead-keyed split at the same magnitude) — **resolved in code** by extending §5 to enumerate both keys and any reusable foreign-key column as group-leakage axes. F006 low (README "Conversion rate (recipe band)" column header didn't make clear it was a recipe-acceptance window not an observed range) — **resolved in code** by renaming to "(acceptance band, gate G7.\*)" and adding a one-sentence note that observed five-seed spreads sit comfortably inside the band. 
F002 medium (Gaussian noise produces non-physical values: negative ACV, negative day-deltas, day-deltas > snapshot_day=30, undisclosed in dataset card) — `accepted-for-v2`; requires `leadforge/narrative/dataset_card.py` change. F003 medium (`](../foo)` relative links would 404 on Kaggle/HF) — `wont-fix`: already treated by `scripts/_release_common.py::rewrite_release_links()` which both platform packagers (PR 5.1, 5.2) call at packaging time; the LLM didn't have visibility into the platform packagers and made a wrong inference. F005 medium (advanced-tier `calibration_max_bin_error = 0.5234` driven by an n=2 high-probability bin, no minimum-bin-count footnote) — `accepted-for-v2`; not a 1-line change, touches `release_quality.py` metric definition and would require regenerating `validation_report.{json,md}` which PR 7.1's brief explicitly forbids. Three missing-section callouts (Datasheets §Biases, §Privacy, per-bundle group-split warning) and three maintainer questions (noise/windowing interaction, `top_decile_rate` naming, Kaggle/HF docs subtree) all logged to `docs/release/v2_decision_log.md`. README edits cascaded into the platform packager artefacts; `release/kaggle/dataset-metadata.json` and `release/huggingface/README.md` regenerated cleanly via the existing packagers (`scripts/package_{kaggle,hf}_release.py`). Critique run output committed to `release/validation/llm_critique_raw_20260508T204359.124834Z.json` + `release/validation/llm_critique_summary.md`. Final net: 1325/1325 tests pass + 5 publish-extra-gated skips; ruff + mypy clean (83 source files); leakage probes 0/3 on every tier; hash determinism PASS 67/67; `validate_release_candidate --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5. Phase 7 PR 7.1 closed; PR 7.2 (local Kaggle/HF mock-page preview) is next.
- [x] PR 7.2: local Kaggle + HuggingFace mock-page preview tooling landed. `scripts/preview_kaggle_page.py` (new) — reads the *exact* artefacts the publish PR will upload (`release/kaggle/dataset-metadata.json` + the inlined README body + the cover image, prefer `release/kaggle/dataset-cover-image.png` then fall back to the gitignore-resilient `release/dataset-cover-image.png` master copy) and renders an offline HTML page mocking the public Kaggle dataset view: header (title / subtitle / id pill / licence / update-frequency / visibility), cover image, rendered description (the inlined README body), file tree of declared resources grouped by tier with per-tier counts, schema/columns table for every tabular resource (`resources[].schema.fields[].name/type/description`) with per-table column counts in the heading, user-specified-sources block (rendered only when present), keywords + licence footer. Serves on `http://localhost:8765` via stdlib `http.server.ThreadingHTTPServer` (the threading variant inherits `allow_reuse_address=True` from `HTTPServer`, so Ctrl-C → re-run within ~60s does not raise `OSError [Errno 48] Address already in use` while the socket sits in TIME_WAIT — caught and folded back in self-review pass 1, the initial draft used `socketserver.ThreadingTCPServer` which defaults to `False`). `--no-serve` builds the HTML and exits (CI / inspection mode); `--open-browser` pops a tab on startup; `--port` / `--release-dir` / `--out-dir` round out the surface. 
`scripts/preview_hf_page.py` (new) — reads `release/huggingface/README.md` (or `release/huggingface-instructor/README.md` per `--variant=public|instructor`) and parses YAML frontmatter + Markdown body via a single anchored regex (`r"\A---\n(?P.*?)\n---\n(?P.*)\Z"` with `re.DOTALL`); renders the analogous HF view: header pills (pretty_name + license + task_categories + size_categories + language), tag chips, configs dropdown (one details-block per `configs[]` entry with the default config flagged via a single `badge--default` instance, data_files split→path table per config), file tree of declared YAML paths bucketed by config, README body, footer carrying the variant for human visual confirmation. `--variant` defaults `--out-dir` to `release/_preview/huggingface/` (public) or `release/_preview/huggingface-instructor/` (instructor); the instructor path also reads its README from a different location (`huggingface-instructor/README.md`) and looks for the cover under the variant directory first. Both scripts share the validation discipline from the Phase 5 packagers: build → validate → write; pre-flight failures (missing metadata, malformed JSON / YAML, unknown variant, missing cover) raise and the CLI converts to rc=2 without touching disk; runtime success exits 0. Markdown rendering via `markdown-it-py` in `gfm-like` preset (tables / fenced code / strikethrough on; `linkify` explicitly disabled so the optional `linkify-it-py` transitive dep is not required); the dep is added to the `[publish]` extra alongside `datasets` / `kaggle` (mirrors the PR 5.1 / 5.2 gating posture for publish-pipeline tooling), and absent imports raise a clean `ImportError` pointing at `pip install -e ".[publish]"` instead of a cryptic stdlib `ModuleNotFoundError`. Both renderers are pure: same `(metadata|doc, cover_filename|variant)` → byte-identical HTML (no `now()`, no random, no clock). 
Output landing at `release/_preview//index.html` is gitignored (`.gitignore` adds `release/_preview/`); the audit-artefact-sync gate lives at `release/_preview_committed/{kaggle,huggingface_public,huggingface_instructor}.html` (committed alongside the scripts, mirrors the PR 4.1 / 5.1 / 5.2 / 7.1 audit-sync pattern). HTML is wrapped in a single self-contained file (CSS inlined, no external stylesheet) so each committed sample is human-inspectable directly from `git show` or a browser without a server. XSS-safety: every user-controlled string passes through a hand-rolled `_escape` (`&`, `<`, `>`, `"`, `'`); kept hand-rolled rather than `html.escape` so the committed samples' `'` (decimal) escapes don't churn against `html.escape`'s `'` (hex) entity. Tests: 48 cases across `tests/scripts/test_preview_kaggle_page.py` (20) and `tests/scripts/test_preview_hf_page.py` (28); no live HTTP, no network, no socket open. The four roadmap-mandated checks per script: required field labels appear in rendered HTML (Kaggle: title / subtitle / id / license / file count / schema column count; HF: pretty_name / license / configs / tags); every Markdown link in the source resolves to a non-allowlisted URL pattern fails the test (allow-list: `https://github.com/leadforge-dev/leadforge`, `https://huggingface.co/datasets/leadforge`, sibling-relative `LICENSE`, in-document `#` anchors — anything else is a 404 risk on the live page); the Kaggle schema table lists every column declared in `resources[].schema.fields` (iterates the committed metadata, asserts each `{name}` appears); every `configs[]` block in the HF YAML round-trips into the rendered dropdown. Determinism is double-tested: `test_render_is_byte_deterministic` runs two passes against the real release artefact and pins equality; `test_committed_*_sample_matches_fresh_regeneration` pins the committed HTML against fresh regeneration byte-for-byte (the audit-sync gate). 
Pre-flight error paths exercised end-to-end: missing artefact (`FileNotFoundError`), malformed JSON / YAML (`ValueError`), unknown variant, missing cover image — all return rc=2 via `main()` with informative stderr. HTML escape coverage: `test_render_escapes_html_in_field_values` asserts a `
@@ -92,6 +136,8 @@ What this companion contains
│ ├── tables/*.parquet # full-horizon tables (incl. customers, subscriptions)
│ ├── tasks/converted_within_90_days/{train,valid,test}.parquet
│ └── metadata/ # world_spec, graph.{graphml,json}, latent_registry, etc.
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
├── README.md # this file (HF dataset card)
├── dataset-cover-image.png # dataset thumbnail
└── LICENSE
@@ -218,6 +264,23 @@ Composition
every parquet file.
Bundle schema version. 5 (matches the public dataset).
+Agent-reviewable artifacts
+The companion ships the same self-contained review surface as the public
+bundle so an AI reviewer (or a researcher without GitHub access) can
+verify claims locally:
+
+docs/ — vendored copies of the generation method, leakage probes
+contract, acceptance bands, break-me guide, v2 decision log, and the
+per-relational-table column descriptions (relational_table_schemas.csv).
+claims_register.{md,json} — every numerical / structural claim
+in this card paired with the artifact and path that backs it.
+intermediate/manifest.json and intermediate/feature_dictionary.csv
+— SHA-256-hashed provenance and the authoritative column spec.
+
+The instructor companion intentionally omits the top-level
+metrics.json (cross-tier medians would be misleading for a single
+tier). Use the public dataset's metrics.json when comparing tier
+behaviour.
Maintenance, license
We want the dataset to be broken. See the
public dataset card
diff --git a/release/_preview_committed/huggingface_public.html b/release/_preview_committed/huggingface_public.html
index 3f1df70..c71a39e 100644
--- a/release/_preview_committed/huggingface_public.html
+++ b/release/_preview_committed/huggingface_public.html
@@ -35,6 +35,80 @@
.dataset-footer { margin-top: 48px; padding-top: 16px; border-top: 1px solid var(--border); color: var(--muted); font-size: 0.9em; }
.dataset-footer__note { font-style: italic; margin-top: 8px; }
+
@@ -115,11 +189,15 @@ What's inside
.
├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier
│ ├── manifest.json # provenance + file hashes
+│ ├── metrics.json # per-tier headline metrics (medians + spreads)
│ ├── dataset_card.md # auto-rendered per-bundle card
│ ├── feature_dictionary.csv # authoritative column spec
│ ├── lead_scoring.csv # flat convenience CSV (all splits)
│ ├── tables/*.parquet # 7 snapshot-safe relational tables
│ └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
+├── metrics.json # top-level cross-tier metrics summary
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
├── README.md # this file (HF dataset card)
├── dataset-cover-image.png # dataset thumbnail
└── LICENSE
@@ -129,6 +207,34 @@ What's inside
hidden causal structure (DAG, latent registry, mechanism summary)
under metadata/. The full layout is documented in each bundle's
manifest.json.
+Agent-reviewable artifacts
+The published bundle is self-contained for AI review and offline
+auditing — every numeric / structural claim on this page can be
+verified without following an external link:
+
+metrics.json (root) + <tier>/metrics.json — deterministic
+JSON view of the headline LR AUC / AP / P@100 / Brier / conversion
+rate / cohort-shift / cross-tier-ordering medians, with JSON-path
+back-references to validation/validation_report.json (the
+source of truth).
+claims_register.{md,json} — every numerical or structural
+claim on this page paired with the artifact and path that backs it.
+Rendered from claims_register_source.yaml by
+scripts/build_claims_register.py.
+docs/ — vendored copies of generation_method.md,
+channel_signal_audit.md, break_me_guide.md,
+feature_dictionary.md, v1_acceptance_gates_bands.yaml,
+v2_decision_log.md, plus a hand-authored
+relational_table_schemas.csv documenting every column of every
+relational table. These match the GitHub-blob links cited below but
+ship inside the bundle so a reviewer never needs network access.
+<tier>/manifest.json — SHA-256 hash for every file plus the
+full redaction contract (structural_redactions.columns,
+omitted_tables, relational_snapshot_safe, snapshot_day).
+- Kaggle / HuggingFace preview pages additionally inject a
+
schema.org/Dataset JSON-LD block in their <head> for agent
+ingestion without HTML parsing.
+
Quick start
# Flat CSV
df = pd.read_csv("intermediate/lead_scoring.csv")
diff --git a/release/_preview_committed/kaggle.html b/release/_preview_committed/kaggle.html
index d0ee29a..aa29ebf 100644
--- a/release/_preview_committed/kaggle.html
+++ b/release/_preview_committed/kaggle.html
@@ -42,6 +42,96 @@
.chip { display: inline-block; background: var(--pill-bg); border-radius: 12px; padding: 2px 10px; margin: 2px; font-size: 0.85em; }
.dataset-footer__note { font-style: italic; margin-top: 8px; }
+
@@ -82,11 +172,15 @@ What's inside
.
├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier
│ ├── manifest.json # provenance + file hashes
+│ ├── metrics.json # per-tier headline metrics (medians + spreads)
│ ├── dataset_card.md # auto-rendered per-bundle card
│ ├── feature_dictionary.csv # authoritative column spec
│ ├── lead_scoring.csv # flat convenience CSV (all splits)
│ ├── tables/*.parquet # 7 snapshot-safe relational tables
│ └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
+├── metrics.json # top-level cross-tier metrics summary
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
├── dataset-metadata.json # Kaggle dataset metadata
├── dataset-cover-image.png # Kaggle cover image
├── README.md # Kaggle package README
@@ -97,6 +191,34 @@ What's inside
hidden causal structure (DAG, latent registry, mechanism summary)
under metadata/. The full layout is documented in each bundle's
manifest.json.
+Agent-reviewable artifacts
+The published bundle is self-contained for AI review and offline
+auditing — every numeric / structural claim on this page can be
+verified without following an external link:
+
+metrics.json (root) + <tier>/metrics.json — deterministic
+JSON view of the headline LR AUC / AP / P@100 / Brier / conversion
+rate / cohort-shift / cross-tier-ordering medians, with JSON-path
+back-references to validation/validation_report.json (the
+source of truth).
+claims_register.{md,json} — every numerical or structural
+claim on this page paired with the artifact and path that backs it.
+Rendered from claims_register_source.yaml by
+scripts/build_claims_register.py.
+docs/ — vendored copies of generation_method.md,
+channel_signal_audit.md, break_me_guide.md,
+feature_dictionary.md, v1_acceptance_gates_bands.yaml,
+v2_decision_log.md, plus a hand-authored
+relational_table_schemas.csv documenting every column of every
+relational table. These match the GitHub-blob links cited below but
+ship inside the bundle so a reviewer never needs network access.
+<tier>/manifest.json — SHA-256 hash for every file plus the
+full redaction contract (structural_redactions.columns,
+omitted_tables, relational_snapshot_safe, snapshot_day).
+- Kaggle / HuggingFace preview pages additionally inject a
+schema.org/Dataset JSON-LD block in their <head> for agent
+ingestion without HTML parsing.
+
Quick start
# Flat CSV
df = pd.read_csv("intermediate/lead_scoring.csv")
@@ -425,9 +547,9 @@ Maintenance, adversarial framing, license
is hashed in manifest.json.
- Data Files (42 total)
+ Data Files (56 total)
- intro/ (14 files)
+ intro/ (15 files)
intro/lead_scoring.csvIntro tier flat CSV (all splits concatenated, label retained, snapshot_day=30). The `split` column distinguishes train/valid/test rows.
intro/feature_dictionary.csvIntro tier feature dictionary (canonical column spec).
@@ -442,11 +564,12 @@ Data Files (42 total)<
intro/tables/sales_activities.parquetIntro tier `sales_activities` relational table (21,358 rows) — snapshot-safe.
intro/tables/opportunities.parquetIntro tier `opportunities` relational table (4,426 rows) — snapshot-safe.
intro/dataset_card.mdIntro tier auto-rendered dataset card.
+ intro/metrics.jsonIntro tier headline metrics (cross-seed medians + spreads, difficulty knobs, JSON-path back-reference to validation_report.json).
intro/manifest.jsonIntro tier provenance manifest (recipe, seed, package version, file hashes, snapshot_day, redaction contract).
- intermediate/ (14 files)
+ intermediate/ (15 files)
intermediate/lead_scoring.csvIntermediate tier flat CSV (all splits concatenated, label retained, snapshot_day=30). The `split` column distinguishes train/valid/test rows.
intermediate/feature_dictionary.csvIntermediate tier feature dictionary (canonical column spec).
@@ -461,11 +584,12 @@ Data Files (42 total)<
intermediate/tables/sales_activities.parquetIntermediate tier `sales_activities` relational table (20,679 rows) — snapshot-safe.
intermediate/tables/opportunities.parquetIntermediate tier `opportunities` relational table (4,255 rows) — snapshot-safe.
intermediate/dataset_card.mdIntermediate tier auto-rendered dataset card.
+ intermediate/metrics.jsonIntermediate tier headline metrics (cross-seed medians + spreads, difficulty knobs, JSON-path back-reference to validation_report.json).
intermediate/manifest.jsonIntermediate tier provenance manifest (recipe, seed, package version, file hashes, snapshot_day, redaction contract).
- advanced/ (14 files)
+ advanced/ (15 files)
advanced/lead_scoring.csvAdvanced tier flat CSV (all splits concatenated, label retained, snapshot_day=30). The `split` column distinguishes train/valid/test rows.
advanced/feature_dictionary.csvAdvanced tier feature dictionary (canonical column spec).
@@ -480,9 +604,31 @@ Data Files (42 total)<
advanced/tables/sales_activities.parquetAdvanced tier `sales_activities` relational table (19,995 rows) — snapshot-safe.
advanced/tables/opportunities.parquetAdvanced tier `opportunities` relational table (4,004 rows) — snapshot-safe.
advanced/dataset_card.mdAdvanced tier auto-rendered dataset card.
+ advanced/metrics.jsonAdvanced tier headline metrics (cross-seed medians + spreads, difficulty knobs, JSON-path back-reference to validation_report.json).
advanced/manifest.jsonAdvanced tier provenance manifest (recipe, seed, package version, file hashes, snapshot_day, redaction contract).
+
+ (top-level)/ (4 files)
+
+ metrics.jsonTop-level cross-tier headline metrics (medians + spreads + cohort-shift + cross-tier ordering booleans). Machine-readable summary backing the README's Calibration table.
+ claims_register.mdClaims register (human-readable table). Rendered from `claims_register_source.yaml`.
+ claims_register.jsonClaims register (machine-readable). Each numerical / structural claim in the README paired with its backing artifact and JSON / YAML path.
+ claims_register_source.yamlClaims-register source YAML — hand-edited; `claims_register.{md,json}` are rendered from this.
+
+
+
+ docs/ (7 files)
+
+ docs/break_me_guide.mdAdversarial-framing guide: nine breakage patterns (leakage, split contamination, ranking inversions, calibration drift) with worked-example detection recipes.
+ docs/channel_signal_audit.mdEmpirical backing for the 'channel signal is weak' claim — out-of-sample univariate AUCs of `lead_source` per tier.
+ docs/feature_dictionary.mdLong-form per-feature documentation grouped by analytical role; companion to the per-tier `feature_dictionary.csv` machine-readable spec.
+ docs/generation_method.mdGeneration method (DGP description) — what is and isn't modelled by the simulator.
+ docs/relational_table_schemas.csvPer-column descriptions for the 7 public relational tables (and the 2 instructor-only ones) — surfaced into the schema-section of this page.
+ docs/v1_acceptance_gates_bands.yamlOperational acceptance bands per gate (G5–G8); the source-of-truth thresholds the validator checks against.
+ docs/v2_decision_log.mdAccepted-for-v2 findings register — issues flagged in v1 that are scoped to the v2 release.
+
+
Schema / Columns (534 columns across 33 tabular files)
@@ -652,14 +798,14 @@ Schema / Columns (534
| Column | Type | Description |
- account_id | string | |
- company_name | string | |
- industry | string | |
- region | string | |
- employee_band | string | |
- estimated_revenue_band | string | |
- process_maturity_band | string | |
- created_at | string | |
+ account_id | string | Opaque account identifier (e.g. ``acct_000001``). Primary key. |
+ company_name | string | Synthetic display name for the account (fictional). Not a feature in the snapshot. |
+ industry | string | Industry vertical of the buying organisation; one of the recipe's industry vocabulary. |
+ region | string | Geographic region of the account's headquarters (e.g. ``US``, ``UK``). |
+ employee_band | string | Banded employee headcount of the account (e.g. ``200-500``, ``500-1000``, ``1000-2000``). |
+ estimated_revenue_band | string | Banded estimated annual revenue of the account. |
+ process_maturity_band | string | Banded internal process-maturity score of the account (drives ICP fit). |
+ created_at | string | ISO-8601 timestamp when the account was first observed (synthetic creation time). |
@@ -668,14 +814,14 @@ Schema / Columns (534
| Column | Type | Description |
- contact_id | string | |
- account_id | string | |
- job_title | string | |
- role_function | string | |
- seniority | string | |
- buyer_role | string | |
- email_domain_type | string | |
- created_at | string | |
+ contact_id | string | Opaque contact identifier (e.g. ``cont_000001``). Primary key. |
+ account_id | string | FK to ``accounts.account_id`` — the buying organisation this contact belongs to. |
+ job_title | string | Free-text job title (fictional). Used only for narrative colour; not a feature. |
+ role_function | string | Functional area of the contact (e.g. ``finance``, ``ops``, ``it``, ``procurement``). |
+ seniority | string | Seniority band of the contact (e.g. ``c_level``, ``vp``, ``director``, ``manager``). |
+ buyer_role | string | Buyer-role classification (``economic_buyer``, ``champion``, ``technical_evaluator``, ``end_user``). |
+ email_domain_type | string | Type of email domain (e.g. ``corporate``, ``free``); never resolves to a real domain. |
+ created_at | string | ISO-8601 timestamp when the contact record was first observed. |
@@ -684,13 +830,13 @@ Schema / Columns (534
| Column | Type | Description |
- lead_id | string | |
- contact_id | string | |
- account_id | string | |
- lead_created_at | string | |
- lead_source | string | |
- first_touch_channel | string | |
- owner_rep_id | string | |
+ lead_id | string | Opaque lead identifier (e.g. ``lead_000001``). Primary key for the lead-scoring task. |
+ contact_id | string | FK to ``contacts.contact_id`` — the primary contact attached to this lead. |
+ account_id | string | FK to ``accounts.account_id`` — the buying organisation this lead belongs to. |
+ lead_created_at | string | ISO-8601 timestamp at which the lead was created (= snapshot anchor t=0). |
+ lead_source | string | Origination source of the lead (e.g. ``inbound_form``, ``sdr_outbound``, ``partner``). |
+ first_touch_channel | string | Marketing channel responsible for the first recorded touch. |
+ owner_rep_id | string | Opaque sales-rep id (e.g. ``rep_000001``) owning the lead at snapshot time. |
@@ -699,13 +845,13 @@ Schema / Columns (534
| Column | Type | Description |
- touch_id | string | |
- lead_id | string | |
- touch_timestamp | string | |
- touch_type | string | |
- touch_channel | string | |
- touch_direction | string | |
- campaign_id | string | |
+ touch_id | string | Opaque touch identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ touch_timestamp | string | ISO-8601 timestamp of the touch. Public bundles filter to ``<= lead_created_at + snapshot_day`` per the redaction contract. |
+ touch_type | string | Mechanism of the touch (e.g. ``email``, ``call``, ``ad_view``, ``content_download``). |
+ touch_channel | string | Marketing/sales channel attribution (e.g. ``paid_search``, ``content``, ``cold_outreach``). |
+ touch_direction | string | ``inbound`` (lead-initiated) or ``outbound`` (vendor-initiated). |
+ campaign_id | string | Opaque campaign identifier attached to the touch, or null when unattributed. |
@@ -714,14 +860,14 @@ Schema / Columns (534
| Column | Type | Description |
- session_id | string | |
- lead_id | string | |
- session_timestamp | string | |
- session_type | string | |
- page_views | integer | |
- pricing_page_views | integer | |
- demo_page_views | integer | |
- session_duration_seconds | integer | |
+ session_id | string | Opaque session identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ session_timestamp | string | ISO-8601 timestamp of the session start. Public bundles filter to ``<= lead_created_at + snapshot_day``. |
+ session_type | string | Session type (e.g. ``marketing_site``, ``trial``, ``demo``). |
+ page_views | integer | Total page views during the session. |
+ pricing_page_views | integer | Page views landing on a pricing URL during the session. |
+ demo_page_views | integer | Page views landing on a demo URL during the session. |
+ session_duration_seconds | integer | Session duration in seconds. |
@@ -730,12 +876,12 @@ Schema / Columns (534
| Column | Type | Description |
- activity_id | string | |
- lead_id | string | |
- rep_id | string | |
- activity_timestamp | string | |
- activity_type | string | |
- activity_outcome | string | |
+ activity_id | string | Opaque sales-activity identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ rep_id | string | Opaque sales-rep id performing the activity. |
+ activity_timestamp | string | ISO-8601 timestamp of the activity. Public bundles filter to ``<= lead_created_at + snapshot_day``. |
+ activity_type | string | Activity mechanism (e.g. ``call``, ``email``, ``demo``, ``meeting``). |
+ activity_outcome | string | Logged outcome (e.g. ``connected``, ``voicemail``, ``no_answer``, ``meeting_set``). |
@@ -744,11 +890,11 @@ Schema / Columns (534
| Column | Type | Description |
- opportunity_id | string | |
- lead_id | string | |
- created_at | string | |
- stage | string | |
- estimated_acv | integer | |
+ opportunity_id | string | Opaque opportunity identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ created_at | string | ISO-8601 timestamp the opportunity was created. Public bundles filter rows to ``<= lead_created_at + snapshot_day``. |
+ stage | string | Current stage at snapshot time (e.g. ``prospecting``, ``demo``, ``negotiation``). |
+ estimated_acv | integer | Estimated annual contract value at snapshot time (USD). |
@@ -918,14 +1064,14 @@ Schema / Columns (534
| Column | Type | Description |
- account_id | string | |
- company_name | string | |
- industry | string | |
- region | string | |
- employee_band | string | |
- estimated_revenue_band | string | |
- process_maturity_band | string | |
- created_at | string | |
+ account_id | string | Opaque account identifier (e.g. ``acct_000001``). Primary key. |
+ company_name | string | Synthetic display name for the account (fictional). Not a feature in the snapshot. |
+ industry | string | Industry vertical of the buying organisation; one of the recipe's industry vocabulary. |
+ region | string | Geographic region of the account's headquarters (e.g. ``US``, ``UK``). |
+ employee_band | string | Banded employee headcount of the account (e.g. ``200-500``, ``500-1000``, ``1000-2000``). |
+ estimated_revenue_band | string | Banded estimated annual revenue of the account. |
+ process_maturity_band | string | Banded internal process-maturity score of the account (drives ICP fit). |
+ created_at | string | ISO-8601 timestamp when the account was first observed (synthetic creation time). |
@@ -934,14 +1080,14 @@ Schema / Columns (534
| Column | Type | Description |
- contact_id | string | |
- account_id | string | |
- job_title | string | |
- role_function | string | |
- seniority | string | |
- buyer_role | string | |
- email_domain_type | string | |
- created_at | string | |
+ contact_id | string | Opaque contact identifier (e.g. ``cont_000001``). Primary key. |
+ account_id | string | FK to ``accounts.account_id`` — the buying organisation this contact belongs to. |
+ job_title | string | Free-text job title (fictional). Used only for narrative colour; not a feature. |
+ role_function | string | Functional area of the contact (e.g. ``finance``, ``ops``, ``it``, ``procurement``). |
+ seniority | string | Seniority band of the contact (e.g. ``c_level``, ``vp``, ``director``, ``manager``). |
+ buyer_role | string | Buyer-role classification (``economic_buyer``, ``champion``, ``technical_evaluator``, ``end_user``). |
+ email_domain_type | string | Type of email domain (e.g. ``corporate``, ``free``); never resolves to a real domain. |
+ created_at | string | ISO-8601 timestamp when the contact record was first observed. |
@@ -950,13 +1096,13 @@ Schema / Columns (534
| Column | Type | Description |
- lead_id | string | |
- contact_id | string | |
- account_id | string | |
- lead_created_at | string | |
- lead_source | string | |
- first_touch_channel | string | |
- owner_rep_id | string | |
+ lead_id | string | Opaque lead identifier (e.g. ``lead_000001``). Primary key for the lead-scoring task. |
+ contact_id | string | FK to ``contacts.contact_id`` — the primary contact attached to this lead. |
+ account_id | string | FK to ``accounts.account_id`` — the buying organisation this lead belongs to. |
+ lead_created_at | string | ISO-8601 timestamp at which the lead was created (= snapshot anchor t=0). |
+ lead_source | string | Origination source of the lead (e.g. ``inbound_form``, ``sdr_outbound``, ``partner``). |
+ first_touch_channel | string | Marketing channel responsible for the first recorded touch. |
+ owner_rep_id | string | Opaque sales-rep id (e.g. ``rep_000001``) owning the lead at snapshot time. |
@@ -965,13 +1111,13 @@ Schema / Columns (534
| Column | Type | Description |
- touch_id | string | |
- lead_id | string | |
- touch_timestamp | string | |
- touch_type | string | |
- touch_channel | string | |
- touch_direction | string | |
- campaign_id | string | |
+ touch_id | string | Opaque touch identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ touch_timestamp | string | ISO-8601 timestamp of the touch. Public bundles filter to ``<= lead_created_at + snapshot_day`` per the redaction contract. |
+ touch_type | string | Mechanism of the touch (e.g. ``email``, ``call``, ``ad_view``, ``content_download``). |
+ touch_channel | string | Marketing/sales channel attribution (e.g. ``paid_search``, ``content``, ``cold_outreach``). |
+ touch_direction | string | ``inbound`` (lead-initiated) or ``outbound`` (vendor-initiated). |
+ campaign_id | string | Opaque campaign identifier attached to the touch, or null when unattributed. |
@@ -980,14 +1126,14 @@ Schema / Columns (534
| Column | Type | Description |
- session_id | string | |
- lead_id | string | |
- session_timestamp | string | |
- session_type | string | |
- page_views | integer | |
- pricing_page_views | integer | |
- demo_page_views | integer | |
- session_duration_seconds | integer | |
+ session_id | string | Opaque session identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ session_timestamp | string | ISO-8601 timestamp of the session start. Public bundles filter to ``<= lead_created_at + snapshot_day``. |
+ session_type | string | Session type (e.g. ``marketing_site``, ``trial``, ``demo``). |
+ page_views | integer | Total page views during the session. |
+ pricing_page_views | integer | Page views landing on a pricing URL during the session. |
+ demo_page_views | integer | Page views landing on a demo URL during the session. |
+ session_duration_seconds | integer | Session duration in seconds. |
@@ -996,12 +1142,12 @@ Schema / Columns (534
| Column | Type | Description |
- activity_id | string | |
- lead_id | string | |
- rep_id | string | |
- activity_timestamp | string | |
- activity_type | string | |
- activity_outcome | string | |
+ activity_id | string | Opaque sales-activity identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ rep_id | string | Opaque sales-rep id performing the activity. |
+ activity_timestamp | string | ISO-8601 timestamp of the activity. Public bundles filter to ``<= lead_created_at + snapshot_day``. |
+ activity_type | string | Activity mechanism (e.g. ``call``, ``email``, ``demo``, ``meeting``). |
+ activity_outcome | string | Logged outcome (e.g. ``connected``, ``voicemail``, ``no_answer``, ``meeting_set``). |
@@ -1010,11 +1156,11 @@ Schema / Columns (534
| Column | Type | Description |
- opportunity_id | string | |
- lead_id | string | |
- created_at | string | |
- stage | string | |
- estimated_acv | integer | |
+ opportunity_id | string | Opaque opportunity identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ created_at | string | ISO-8601 timestamp the opportunity was created. Public bundles filter rows to ``<= lead_created_at + snapshot_day``. |
+ stage | string | Current stage at snapshot time (e.g. ``prospecting``, ``demo``, ``negotiation``). |
+ estimated_acv | integer | Estimated annual contract value at snapshot time (USD). |
@@ -1184,14 +1330,14 @@ Schema / Columns (534
| Column | Type | Description |
- account_id | string | |
- company_name | string | |
- industry | string | |
- region | string | |
- employee_band | string | |
- estimated_revenue_band | string | |
- process_maturity_band | string | |
- created_at | string | |
+ account_id | string | Opaque account identifier (e.g. ``acct_000001``). Primary key. |
+ company_name | string | Synthetic display name for the account (fictional). Not a feature in the snapshot. |
+ industry | string | Industry vertical of the buying organisation; one of the recipe's industry vocabulary. |
+ region | string | Geographic region of the account's headquarters (e.g. ``US``, ``UK``). |
+ employee_band | string | Banded employee headcount of the account (e.g. ``200-500``, ``500-1000``, ``1000-2000``). |
+ estimated_revenue_band | string | Banded estimated annual revenue of the account. |
+ process_maturity_band | string | Banded internal process-maturity score of the account (drives ICP fit). |
+ created_at | string | ISO-8601 timestamp when the account was first observed (synthetic creation time). |
@@ -1200,14 +1346,14 @@ Schema / Columns (534
| Column | Type | Description |
- contact_id | string | |
- account_id | string | |
- job_title | string | |
- role_function | string | |
- seniority | string | |
- buyer_role | string | |
- email_domain_type | string | |
- created_at | string | |
+ contact_id | string | Opaque contact identifier (e.g. ``cont_000001``). Primary key. |
+ account_id | string | FK to ``accounts.account_id`` — the buying organisation this contact belongs to. |
+ job_title | string | Free-text job title (fictional). Used only for narrative colour; not a feature. |
+ role_function | string | Functional area of the contact (e.g. ``finance``, ``ops``, ``it``, ``procurement``). |
+ seniority | string | Seniority band of the contact (e.g. ``c_level``, ``vp``, ``director``, ``manager``). |
+ buyer_role | string | Buyer-role classification (``economic_buyer``, ``champion``, ``technical_evaluator``, ``end_user``). |
+ email_domain_type | string | Type of email domain (e.g. ``corporate``, ``free``); never resolves to a real domain. |
+ created_at | string | ISO-8601 timestamp when the contact record was first observed. |
@@ -1216,13 +1362,13 @@ Schema / Columns (534
| Column | Type | Description |
- lead_id | string | |
- contact_id | string | |
- account_id | string | |
- lead_created_at | string | |
- lead_source | string | |
- first_touch_channel | string | |
- owner_rep_id | string | |
+ lead_id | string | Opaque lead identifier (e.g. ``lead_000001``). Primary key for the lead-scoring task. |
+ contact_id | string | FK to ``contacts.contact_id`` — the primary contact attached to this lead. |
+ account_id | string | FK to ``accounts.account_id`` — the buying organisation this lead belongs to. |
+ lead_created_at | string | ISO-8601 timestamp at which the lead was created (= snapshot anchor t=0). |
+ lead_source | string | Origination source of the lead (e.g. ``inbound_form``, ``sdr_outbound``, ``partner``). |
+ first_touch_channel | string | Marketing channel responsible for the first recorded touch. |
+ owner_rep_id | string | Opaque sales-rep id (e.g. ``rep_000001``) owning the lead at snapshot time. |
@@ -1231,13 +1377,13 @@ Schema / Columns (534
| Column | Type | Description |
- touch_id | string | |
- lead_id | string | |
- touch_timestamp | string | |
- touch_type | string | |
- touch_channel | string | |
- touch_direction | string | |
- campaign_id | string | |
+ touch_id | string | Opaque touch identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ touch_timestamp | string | ISO-8601 timestamp of the touch. Public bundles filter to ``<= lead_created_at + snapshot_day`` per the redaction contract. |
+ touch_type | string | Mechanism of the touch (e.g. ``email``, ``call``, ``ad_view``, ``content_download``). |
+ touch_channel | string | Marketing/sales channel attribution (e.g. ``paid_search``, ``content``, ``cold_outreach``). |
+ touch_direction | string | ``inbound`` (lead-initiated) or ``outbound`` (vendor-initiated). |
+ campaign_id | string | Opaque campaign identifier attached to the touch, or null when unattributed. |
@@ -1246,14 +1392,14 @@ Schema / Columns (534
| Column | Type | Description |
- session_id | string | |
- lead_id | string | |
- session_timestamp | string | |
- session_type | string | |
- page_views | integer | |
- pricing_page_views | integer | |
- demo_page_views | integer | |
- session_duration_seconds | integer | |
+ session_id | string | Opaque session identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ session_timestamp | string | ISO-8601 timestamp of the session start. Public bundles filter to ``<= lead_created_at + snapshot_day``. |
+ session_type | string | Session type (e.g. ``marketing_site``, ``trial``, ``demo``). |
+ page_views | integer | Total page views during the session. |
+ pricing_page_views | integer | Page views landing on a pricing URL during the session. |
+ demo_page_views | integer | Page views landing on a demo URL during the session. |
+ session_duration_seconds | integer | Session duration in seconds. |
@@ -1262,12 +1408,12 @@ Schema / Columns (534
| Column | Type | Description |
- activity_id | string | |
- lead_id | string | |
- rep_id | string | |
- activity_timestamp | string | |
- activity_type | string | |
- activity_outcome | string | |
+ activity_id | string | Opaque sales-activity identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ rep_id | string | Opaque sales-rep id performing the activity. |
+ activity_timestamp | string | ISO-8601 timestamp of the activity. Public bundles filter to ``<= lead_created_at + snapshot_day``. |
+ activity_type | string | Activity mechanism (e.g. ``call``, ``email``, ``demo``, ``meeting``). |
+ activity_outcome | string | Logged outcome (e.g. ``connected``, ``voicemail``, ``no_answer``, ``meeting_set``). |
@@ -1276,11 +1422,11 @@ Schema / Columns (534
| Column | Type | Description |
- opportunity_id | string | |
- lead_id | string | |
- created_at | string | |
- stage | string | |
- estimated_acv | integer | |
+ opportunity_id | string | Opaque opportunity identifier. Primary key. |
+ lead_id | string | FK to ``leads.lead_id``. |
+ created_at | string | ISO-8601 timestamp the opportunity was created. Public bundles filter rows to ``<= lead_created_at + snapshot_day``. |
+ stage | string | Current stage at snapshot time (e.g. ``prospecting``, ``demo``, ``negotiation``). |
+ estimated_acv | integer | Estimated annual contract value at snapshot time (USD). |
diff --git a/release/claims_register.json b/release/claims_register.json
new file mode 100644
index 0000000..93070ca
--- /dev/null
+++ b/release/claims_register.json
@@ -0,0 +1,221 @@
+{
+ "claims": [
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.n_leads",
+ "category": "composition",
+ "id": "c01",
+ "text": "Three difficulty tiers (intro / intermediate / advanced), 5,000 leads each.",
+ "verifier": "leadforge validate"
+ },
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.n_accounts, $.n_contacts",
+ "category": "composition",
+ "id": "c02",
+ "text": "Each tier has 1,500 accounts and 4,200 contacts.",
+ "verifier": "leadforge validate"
+ },
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.tables (keys)",
+ "category": "composition",
+ "id": "c03",
+ "text": "Public bundles ship 7 snapshot-safe relational tables (accounts, contacts, leads, touches, sessions, sales_activities, opportunities).",
+ "verifier": "leadforge validate"
+ },
+ {
+ "backing_artifact": "release/intermediate_instructor/manifest.json",
+ "backing_path": "$.tables (keys)",
+ "category": "composition",
+ "id": "c04",
+ "text": "Instructor companion ships 9 tables (the 7 public ones plus customers and subscriptions).",
+ "verifier": "leadforge validate"
+ },
+ {
+ "backing_artifact": "release/metrics.json",
+ "backing_path": "$.tiers.<tier>.medians.conversion_rate_test",
+ "category": "calibration",
+ "id": "c05",
+ "text": "Conversion rate (cross-seed median, seeds 42-46): intro 42.67%, intermediate 21.60%, advanced 8.40%.",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/metrics.json",
+ "backing_path": "$.tiers.<tier>.medians.lr_auc",
+ "category": "calibration",
+ "id": "c06",
+ "text": "Cross-seed median LR AUC: intro 0.879, intermediate 0.886, advanced 0.886.",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/metrics.json",
+ "backing_path": "$.tiers.<tier>.medians.lr_average_precision",
+ "category": "calibration",
+ "id": "c07",
+ "text": "Cross-seed median LR Average Precision: intro 0.761, intermediate 0.575, advanced 0.351.",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/metrics.json",
+ "backing_path": "$.tiers.<tier>.medians.precision_at_100",
+ "category": "calibration",
+ "id": "c08",
+ "text": "Cross-seed median P@100: intro 0.80, intermediate 0.59, advanced 0.34.",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/metrics.json",
+ "backing_path": "$.tiers.<tier>.medians.brier_score",
+ "category": "calibration",
+ "id": "c09",
+ "text": "Cross-seed median Brier score: intro 0.130, intermediate 0.110, advanced 0.061.",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/metrics.json",
+ "backing_path": "$.cross_tier_ordering.{by_conversion_rate, by_average_precision, by_precision_at_100}",
+ "category": "difficulty",
+ "id": "c10",
+ "text": "Conversion-rate, AP, and P@100 orderings hold intro > intermediate > advanced.",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/<tier>/metrics.json",
+ "backing_path": "$.difficulty_knobs",
+ "category": "difficulty",
+ "id": "c11",
+ "text": "Difficulty knobs by tier: signal strength 0.90/0.70/0.50, noise scale 0.10/0.30/0.55, missing rate 2%/8%/18%.",
+ "verifier": "leadforge inspect"
+ },
+ {
+ "backing_artifact": "release/metrics.json",
+ "backing_path": "$.tiers.<tier>.medians.gbm_minus_lr_auc",
+ "category": "limitations",
+ "id": "c12",
+ "text": "GBM-LR AUC delta is slightly negative in every tier (-0.0045 / -0.0072 / -0.0133); v1's snapshot is dominated by linear features.",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/docs/channel_signal_audit.md",
+ "backing_path": "n/a (prose)",
+ "category": "limitations",
+ "id": "c13",
+ "text": "lead_source is weakly informative — out-of-sample univariate AUC ~0.50-0.52 across tiers, per-channel rate spread <=0.05.",
+ "verifier": "scripts/audit_channel_signal.py"
+ },
+ {
+ "backing_artifact": "release/metrics.json",
+ "backing_path": "$.cohort_shift.<tier>.auc_degradation",
+ "category": "limitations",
+ "id": "c14",
+ "text": "Cohort-shift AUC degradation is small (v1 has no time-of-year drift baked in).",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.structural_redactions.columns.leads",
+ "category": "redaction",
+ "id": "c15",
+ "text": "Public leads.parquet drops conversion_timestamp and converted_within_90_days.",
+ "verifier": "scripts/probe_relational_leakage.py"
+ },
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.structural_redactions.columns.opportunities",
+ "category": "redaction",
+ "id": "c16",
+ "text": "Public opportunities.parquet drops close_outcome and closed_at.",
+ "verifier": "scripts/probe_relational_leakage.py"
+ },
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.structural_redactions.omitted_tables",
+ "category": "redaction",
+ "id": "c17",
+ "text": "Public bundles omit customers and subscriptions tables entirely.",
+ "verifier": "scripts/probe_relational_leakage.py"
+ },
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.relational_snapshot_safe, $.snapshot_day",
+ "category": "redaction",
+ "id": "c18",
+ "text": "Snapshot-filtered event tables (touches, sessions, sales_activities, opportunities) keep only rows with event timestamp <= lead_created_at + snapshot_day.",
+ "verifier": "scripts/probe_relational_leakage.py"
+ },
+ {
+ "backing_artifact": "release/<tier>/feature_dictionary.csv",
+ "backing_path": "row[name=='total_touches_all'].leakage_risk",
+ "category": "redaction",
+ "id": "c19",
+ "text": "total_touches_all is the deliberate leakage trap: it counts touches over the full 90-day window and is flagged leakage_risk=True.",
+ "verifier": "grep on feature_dictionary.csv"
+ },
+ {
+ "backing_artifact": "release/<tier>/tasks/converted_within_90_days/task_manifest.json",
+ "backing_path": "n/a (whole file)",
+ "category": "splits",
+ "id": "c20",
+ "text": "Splits are 70/15/15 train/valid/test, deterministic given seed; recorded in tasks/converted_within_90_days/task_manifest.json.",
+ "verifier": "leadforge validate"
+ },
+ {
+ "backing_artifact": "release/docs/break_me_guide.md",
+ "backing_path": "section 5",
+ "category": "splits",
+ "id": "c21",
+ "text": "Splitter keyed on lead_id only — 518/557 (~93%) of test accounts also appear in train on the intermediate bundle. Use GroupKFold(account_id) for a generalisation-faithful number.",
+ "verifier": "scripts/probe_relational_leakage.py --max-accuracy"
+ },
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.recipe_id, $.seed, $.bundle_schema_version, $.package_version",
+ "category": "provenance",
+ "id": "c22",
+ "text": "Recipe b2b_saas_procurement_v1, canonical seed 42, cross-seed sweep 42-46, bundle schema version 5, package leadforge 1.0.0+.",
+ "verifier": "leadforge validate"
+ },
+ {
+ "backing_artifact": "release/<tier>/manifest.json",
+ "backing_path": "$.tables.*.sha256, $.tasks.*.{train,valid,test}_sha256",
+ "category": "provenance",
+ "id": "c23",
+ "text": "Every file in the bundle is SHA-256 hashed in manifest.json; the bundle is verifiable end-to-end with `leadforge validate`.",
+ "verifier": "leadforge validate"
+ },
+ {
+ "backing_artifact": "release/docs/v1_acceptance_gates_bands.yaml",
+ "backing_path": "per_tier",
+ "category": "provenance",
+ "id": "c24",
+ "text": "Acceptance bands for every gate live as YAML at release/docs/v1_acceptance_gates_bands.yaml; bands are recipe gates, not achievable ranges.",
+ "verifier": "scripts/validate_release_candidate.py"
+ },
+ {
+ "backing_artifact": "release/README.md",
+ "backing_path": "section 'Intended uses'",
+ "category": "intended_use",
+ "id": "c25",
+ "text": "Intended uses: teaching baseline lead scoring, relational feature engineering, leakage detection, calibration / lift / P@K / value-aware ranking, model-family comparison under a controlled DGP.",
+ "verifier": "n/a (prose contract)"
+ },
+ {
+ "backing_artifact": "release/README.md",
+ "backing_path": "section 'Out-of-scope uses'",
+ "category": "out_of_scope",
+ "id": "c26",
+ "text": "Out of scope: production lead scoring, vendor benchmarking, causal-inference research requiring DGP recovery, demographic / fairness research.",
+ "verifier": "n/a (prose contract)"
+ }
+ ],
+ "notes": "This register is rendered from release/claims_register_source.yaml. Every claim in release/README.md should appear here. Agents and CI can use the (backing_artifact, backing_path) tuple to locate the source-of-truth value without parsing prose.",
+ "schema": {
+ "backing_artifact": "Path within the published bundle (or repo) that carries the source of truth. ``<tier>`` is a placeholder for intro / intermediate / advanced.",
+ "backing_path": "JSON-path / YAML-path / column reference inside the backing artifact, or ``n/a`` for prose contracts and whole-file claims.",
+ "category": "One of: composition, calibration, redaction, difficulty, limitations, splits, provenance, out_of_scope, intended_use.",
+ "id": "Short stable identifier; quoted in CI failure messages.",
+ "text": "The claim as it appears in the README (verbatim, where practical).",
+ "verifier": "Free-form name of the script / probe / test that re-derives the claim end-to-end. ``n/a`` means the claim is a prose contract that is not mechanically verifiable."
+ }
+}
diff --git a/release/claims_register.md b/release/claims_register.md
new file mode 100644
index 0000000..9c76a72
--- /dev/null
+++ b/release/claims_register.md
@@ -0,0 +1,81 @@
+# Claims register — `leadforge-lead-scoring-v1`
+
+Every numerical / structural claim made in `release/README.md` (and
+copied onto the Kaggle / HuggingFace dataset pages), paired with the
+artifact and path that backs it. This file is auto-rendered from
+[`release/claims_register_source.yaml`](claims_register_source.yaml)
+by `scripts/build_claims_register.py`. Edit the YAML, not this file.
+
+Tip for AI reviewers: `claims_register.json` is the machine-readable
+twin of this document with the same data plus a schema block.
+
+## calibration
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c05` | Conversion rate (cross-seed median, seeds 42-46): intro 42.67%, intermediate 21.60%, advanced 8.40%. | `release/metrics.json` | `$.tiers.<tier>.medians.conversion_rate_test` | `scripts/validate_release_candidate.py` |
+| `c06` | Cross-seed median LR AUC: intro 0.879, intermediate 0.886, advanced 0.886. | `release/metrics.json` | `$.tiers.<tier>.medians.lr_auc` | `scripts/validate_release_candidate.py` |
+| `c07` | Cross-seed median LR Average Precision: intro 0.761, intermediate 0.575, advanced 0.351. | `release/metrics.json` | `$.tiers.<tier>.medians.lr_average_precision` | `scripts/validate_release_candidate.py` |
+| `c08` | Cross-seed median P@100: intro 0.80, intermediate 0.59, advanced 0.34. | `release/metrics.json` | `$.tiers.<tier>.medians.precision_at_100` | `scripts/validate_release_candidate.py` |
+| `c09` | Cross-seed median Brier score: intro 0.130, intermediate 0.110, advanced 0.061. | `release/metrics.json` | `$.tiers.<tier>.medians.brier_score` | `scripts/validate_release_candidate.py` |
+
+## composition
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c01` | Three difficulty tiers (intro / intermediate / advanced), 5,000 leads each. | `release/<tier>/manifest.json` | `$.n_leads` | `leadforge validate` |
+| `c02` | Each tier has 1,500 accounts and 4,200 contacts. | `release/<tier>/manifest.json` | `$.n_accounts, $.n_contacts` | `leadforge validate` |
+| `c03` | Public bundles ship 7 snapshot-safe relational tables (accounts, contacts, leads, touches, sessions, sales_activities, opportunities). | `release/<tier>/manifest.json` | `$.tables (keys)` | `leadforge validate` |
+| `c04` | Instructor companion ships 9 tables (the 7 public ones plus customers and subscriptions). | `release/intermediate_instructor/manifest.json` | `$.tables (keys)` | `leadforge validate` |
+
+## difficulty
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c10` | Conversion-rate, AP, and P@100 orderings hold intro > intermediate > advanced. | `release/metrics.json` | `$.cross_tier_ordering.{by_conversion_rate, by_average_precision, by_precision_at_100}` | `scripts/validate_release_candidate.py` |
+| `c11` | Difficulty knobs by tier: signal strength 0.90/0.70/0.50, noise scale 0.10/0.30/0.55, missing rate 2%/8%/18%. | `release/<tier>/metrics.json` | `$.difficulty_knobs` | `leadforge inspect` |
+
+## intended_use
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c25` | Intended uses: teaching baseline lead scoring, relational feature engineering, leakage detection, calibration / lift / P@K / value-aware ranking, model-family comparison under a controlled DGP. | `release/README.md` | `section 'Intended uses'` | `n/a (prose contract)` |
+
+## limitations
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c12` | GBM-LR AUC delta is slightly negative in every tier (-0.0045 / -0.0072 / -0.0133); v1's snapshot is dominated by linear features. | `release/metrics.json` | `$.tiers.<tier>.medians.gbm_minus_lr_auc` | `scripts/validate_release_candidate.py` |
+| `c13` | lead_source is weakly informative — out-of-sample univariate AUC ~0.50-0.52 across tiers, per-channel rate spread <=0.05. | `release/docs/channel_signal_audit.md` | `n/a (prose)` | `scripts/audit_channel_signal.py` |
+| `c14` | Cohort-shift AUC degradation is small (v1 has no time-of-year drift baked in). | `release/metrics.json` | `$.cohort_shift.<tier>.auc_degradation` | `scripts/validate_release_candidate.py` |
+
+## out_of_scope
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c26` | Out of scope: production lead scoring, vendor benchmarking, causal-inference research requiring DGP recovery, demographic / fairness research. | `release/README.md` | `section 'Out-of-scope uses'` | `n/a (prose contract)` |
+
+## provenance
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c22` | Recipe b2b_saas_procurement_v1, canonical seed 42, cross-seed sweep 42-46, bundle schema version 5, package leadforge 1.0.0+. | `release/<tier>/manifest.json` | `$.recipe_id, $.seed, $.bundle_schema_version, $.package_version` | `leadforge validate` |
+| `c23` | Every file in the bundle is SHA-256 hashed in manifest.json; the bundle is verifiable end-to-end with `leadforge validate`. | `release/<tier>/manifest.json` | `$.tables.*.sha256, $.tasks.*.{train,valid,test}_sha256` | `leadforge validate` |
+| `c24` | Acceptance bands for every gate live as YAML at release/docs/v1_acceptance_gates_bands.yaml; bands are recipe gates, not achievable ranges. | `release/docs/v1_acceptance_gates_bands.yaml` | `per_tier` | `scripts/validate_release_candidate.py` |
+
+## redaction
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c15` | Public leads.parquet drops conversion_timestamp and converted_within_90_days. | `release/<tier>/manifest.json` | `$.structural_redactions.columns.leads` | `scripts/probe_relational_leakage.py` |
+| `c16` | Public opportunities.parquet drops close_outcome and closed_at. | `release/<tier>/manifest.json` | `$.structural_redactions.columns.opportunities` | `scripts/probe_relational_leakage.py` |
+| `c17` | Public bundles omit customers and subscriptions tables entirely. | `release/<tier>/manifest.json` | `$.structural_redactions.omitted_tables` | `scripts/probe_relational_leakage.py` |
+| `c18` | Snapshot-filtered event tables (touches, sessions, sales_activities, opportunities) keep only rows with <= lead_created_at + snapshot_day. | `release/<tier>/manifest.json` | `$.relational_snapshot_safe, $.snapshot_day` | `scripts/probe_relational_leakage.py` |
+| `c19` | total_touches_all is the deliberate leakage trap: it counts touches over the full 90-day window and is flagged leakage_risk=True. | `release/<tier>/feature_dictionary.csv` | `row[name=='total_touches_all'].leakage_risk` | `grep on feature_dictionary.csv` |
+
+## splits
+
+| ID | Claim | Backing artifact | Path | Verifier |
+|---|---|---|---|---|
+| `c20` | Splits are 70/15/15 train/valid/test, deterministic given seed; recorded in tasks/converted_within_90_days/task_manifest.json. | `release/<tier>/tasks/converted_within_90_days/task_manifest.json` | `n/a (whole file)` | `leadforge validate` |
+| `c21` | Splitter keyed on lead_id only — 518/557 (~93%) of test accounts also appear in train on the intermediate bundle. Use GroupKFold(account_id) for a generalisation-faithful number. | `release/docs/break_me_guide.md` | `section 5` | `scripts/probe_relational_leakage.py --max-accuracy` |
diff --git a/release/claims_register_source.yaml b/release/claims_register_source.yaml
new file mode 100644
index 0000000..4381232
--- /dev/null
+++ b/release/claims_register_source.yaml
@@ -0,0 +1,204 @@
+# Claims register source — every numerical / structural claim made in
+# release/README.md (and copied onto the Kaggle / HuggingFace dataset
+# pages) paired with the artifact and path that backs it.
+#
+# Schema per claim:
+# id: short stable identifier (claims_register.json uses this)
+# text: the claim as it appears in the README (verbatim)
+# category: one of {composition, calibration, redaction,
+# difficulty, limitations, splits, provenance,
+# out_of_scope, intended_use}
+# backing_artifact: path within the published bundle (or repo) that
+# carries the source of truth
+# backing_path: JSON-path / YAML-path / column reference inside the
+# backing artifact (when applicable; "n/a" for prose
+# artifacts and for claims that are backed by a
+# whole file)
+# verifier: free-form name of the script / probe / test that an
+# agent (or CI) can run to re-derive the claim
+#
+# This file is hand-edited. ``scripts/build_claims_register.py``
+# rewrites release/claims_register.{md,json} from it.
+
+claims:
+ - id: c01
+ text: "Three difficulty tiers (intro / intermediate / advanced), 5,000 leads each."
+ category: composition
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.n_leads
+ verifier: leadforge validate
+
+ - id: c02
+ text: "Each tier has 1,500 accounts and 4,200 contacts."
+ category: composition
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.n_accounts, $.n_contacts
+ verifier: leadforge validate
+
+ - id: c03
+ text: "Public bundles ship 7 snapshot-safe relational tables (accounts, contacts, leads, touches, sessions, sales_activities, opportunities)."
+ category: composition
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.tables (keys)
+ verifier: leadforge validate
+
+ - id: c04
+ text: "Instructor companion ships 9 tables (the 7 public ones plus customers and subscriptions)."
+ category: composition
+ backing_artifact: release/intermediate_instructor/manifest.json
+ backing_path: $.tables (keys)
+ verifier: leadforge validate
+
+ - id: c05
+ text: "Conversion rate (cross-seed median, seeds 42-46): intro 42.67%, intermediate 21.60%, advanced 8.40%."
+ category: calibration
+ backing_artifact: release/metrics.json
+ backing_path: $.tiers.<tier>.medians.conversion_rate_test
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c06
+ text: "Cross-seed median LR AUC: intro 0.879, intermediate 0.886, advanced 0.886."
+ category: calibration
+ backing_artifact: release/metrics.json
+ backing_path: $.tiers.<tier>.medians.lr_auc
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c07
+ text: "Cross-seed median LR Average Precision: intro 0.761, intermediate 0.575, advanced 0.351."
+ category: calibration
+ backing_artifact: release/metrics.json
+ backing_path: $.tiers.<tier>.medians.lr_average_precision
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c08
+ text: "Cross-seed median P@100: intro 0.80, intermediate 0.59, advanced 0.34."
+ category: calibration
+ backing_artifact: release/metrics.json
+ backing_path: $.tiers.<tier>.medians.precision_at_100
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c09
+ text: "Cross-seed median Brier score: intro 0.130, intermediate 0.110, advanced 0.061."
+ category: calibration
+ backing_artifact: release/metrics.json
+ backing_path: $.tiers.<tier>.medians.brier_score
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c10
+ text: "Conversion-rate, AP, and P@100 orderings hold intro > intermediate > advanced."
+ category: difficulty
+ backing_artifact: release/metrics.json
+ backing_path: $.cross_tier_ordering.{by_conversion_rate, by_average_precision, by_precision_at_100}
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c11
+ text: "Difficulty knobs by tier: signal strength 0.90/0.70/0.50, noise scale 0.10/0.30/0.55, missing rate 2%/8%/18%."
+ category: difficulty
+ backing_artifact: release/<tier>/metrics.json
+ backing_path: $.difficulty_knobs
+ verifier: leadforge inspect
+
+ - id: c12
+ text: "GBM-LR AUC delta is slightly negative in every tier (-0.0045 / -0.0072 / -0.0133); v1's snapshot is dominated by linear features."
+ category: limitations
+ backing_artifact: release/metrics.json
+ backing_path: $.tiers.<tier>.medians.gbm_minus_lr_auc
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c13
+ text: "lead_source is weakly informative — out-of-sample univariate AUC ~0.50-0.52 across tiers, per-channel rate spread <=0.05."
+ category: limitations
+ backing_artifact: release/docs/channel_signal_audit.md
+ backing_path: n/a (prose)
+ verifier: scripts/audit_channel_signal.py
+
+ - id: c14
+ text: "Cohort-shift AUC degradation is small (v1 has no time-of-year drift baked in)."
+ category: limitations
+ backing_artifact: release/metrics.json
+ backing_path: $.cohort_shift.<tier>.auc_degradation
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c15
+ text: "Public leads.parquet drops conversion_timestamp and converted_within_90_days."
+ category: redaction
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.structural_redactions.columns.leads
+ verifier: scripts/probe_relational_leakage.py
+
+ - id: c16
+ text: "Public opportunities.parquet drops close_outcome and closed_at."
+ category: redaction
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.structural_redactions.columns.opportunities
+ verifier: scripts/probe_relational_leakage.py
+
+ - id: c17
+ text: "Public bundles omit customers and subscriptions tables entirely."
+ category: redaction
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.structural_redactions.omitted_tables
+ verifier: scripts/probe_relational_leakage.py
+
+ - id: c18
+ text: "Snapshot-filtered event tables (touches, sessions, sales_activities, opportunities) keep only rows with <= lead_created_at + snapshot_day."
+ category: redaction
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.relational_snapshot_safe, $.snapshot_day
+ verifier: scripts/probe_relational_leakage.py
+
+ - id: c19
+ text: "total_touches_all is the deliberate leakage trap: it counts touches over the full 90-day window and is flagged leakage_risk=True."
+ category: redaction
+ backing_artifact: release/<tier>/feature_dictionary.csv
+ backing_path: row[name=='total_touches_all'].leakage_risk
+ verifier: grep on feature_dictionary.csv
+
+ - id: c20
+ text: "Splits are 70/15/15 train/valid/test, deterministic given seed; recorded in tasks/converted_within_90_days/task_manifest.json."
+ category: splits
+ backing_artifact: release/<tier>/tasks/converted_within_90_days/task_manifest.json
+ backing_path: n/a (whole file)
+ verifier: leadforge validate
+
+ - id: c21
+ text: "Splitter keyed on lead_id only — 518/557 (~93%) of test accounts also appear in train on the intermediate bundle. Use GroupKFold(account_id) for a generalisation-faithful number."
+ category: splits
+ backing_artifact: release/docs/break_me_guide.md
+ backing_path: section 5
+ verifier: scripts/probe_relational_leakage.py --max-accuracy
+
+ - id: c22
+ text: "Recipe b2b_saas_procurement_v1, canonical seed 42, cross-seed sweep 42-46, bundle schema version 5, package leadforge 1.0.0+."
+ category: provenance
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.recipe_id, $.seed, $.bundle_schema_version, $.package_version
+ verifier: leadforge validate
+
+ - id: c23
+ text: "Every file in the bundle is SHA-256 hashed in manifest.json; the bundle is verifiable end-to-end with `leadforge validate`."
+ category: provenance
+ backing_artifact: release/<tier>/manifest.json
+ backing_path: $.tables.*.sha256, $.tasks.*.{train,valid,test}_sha256
+ verifier: leadforge validate
+
+ - id: c24
+ text: "Acceptance bands for every gate live as YAML at release/docs/v1_acceptance_gates_bands.yaml; bands are recipe gates, not achievable ranges."
+ category: provenance
+ backing_artifact: release/docs/v1_acceptance_gates_bands.yaml
+ backing_path: per_tier
+ verifier: scripts/validate_release_candidate.py
+
+ - id: c25
+ text: "Intended uses: teaching baseline lead scoring, relational feature engineering, leakage detection, calibration / lift / P@K / value-aware ranking, model-family comparison under a controlled DGP."
+ category: intended_use
+ backing_artifact: release/README.md
+ backing_path: section 'Intended uses'
+ verifier: n/a (prose contract)
+
+ - id: c26
+ text: "Out of scope: production lead scoring, vendor benchmarking, causal-inference research requiring DGP recovery, demographic / fairness research."
+ category: out_of_scope
+ backing_artifact: release/README.md
+ backing_path: section 'Out-of-scope uses'
+ verifier: n/a (prose contract)
diff --git a/release/docs/break_me_guide.md b/release/docs/break_me_guide.md
new file mode 100644
index 0000000..114bb4c
--- /dev/null
+++ b/release/docs/break_me_guide.md
@@ -0,0 +1,369 @@
+# Break Me — adversarial playbook for `leadforge-lead-scoring-v1`
+
+We *want* this dataset to be broken on purpose. The notebooks
+ship the headline walkthroughs (notebook 03 dissects the
+documented `total_touches_all` trap; notebook 04 covers
+calibration, value-aware ranking, and cohort shift). This guide
+is the **meta-recipe**: the patterns to look for on any
+synthetic teaching dataset, with worked-example pointers back
+into the v1 bundle so each pattern is grounded in a number
+you can reproduce.
+
+If you find one of these on `leadforge-lead-scoring-v1`,
+file an issue using one of the templates in
+[`.github/ISSUE_TEMPLATE/`](../../.github/ISSUE_TEMPLATE).
+Accepted findings are logged in
+[`v2_decision_log.md`](v2_decision_log.md).
+
+## Triage labels
+
+When you file an issue, suggest one of these labels in the
+title or body. The maintainer applies the final label.
+
+| Label | When |
+|---|---|
+| `critical-leakage` | The dataset reconstructs the label via a path that wasn't documented. Highest priority — blocks v1 if reproducible on the as-shipped bundle. |
+| `realism` | A modelled distribution disagrees with what a domain expert expects (industry mix, persona behaviour, funnel timing, channel attribution, pricing). Belongs in the realism issue template. |
+| `difficulty` | A tier sits outside its declared band on a metric documented in `release/validation/validation_report.md`. Likely a band recalibration in v2. |
+| `documentation` | A claim in the dataset card or notebooks doesn't match the artefact. Cheap to fix; please file. |
+| `platform` | Kaggle / HF artefact issue (broken link, malformed YAML, schema mismatch). Phase 5 territory. |
+| `notebook` | A notebook fails to execute, or its tolerance gate fires on a fresh checkout. |
+| `pedagogy` | The teaching framing is misleading even though the artefact is technically correct. |
+| `v2-idea` | A capability worth adding (cohort drift, channel-conditional probabilities, non-linear motifs). |
+| `out-of-scope-v1` | True observation, but explicitly deferred — the dataset card already documents it as a v1 simplification. |
+
+## The meta-recipe
+
+Notebook 03 §7 introduces a three-step recipe (read the feature
+dictionary → ablate, don't just probe → check the time window).
+This guide extends it with one more step that the notebook
+doesn't cover, then organises the patterns that each step
+applies to.
+
+1. **Read the feature dictionary first.** Every public bundle
+ ships `feature_dictionary.csv` with a `leakage_risk` column.
+ Treat that as the primary leakage audit before any modelling.
+2. **Ablate, don't just probe.** A standalone-AUC probe on a
+ single feature can rate a column as ~0.5 AUC while a tree
+ model extracts non-trivial lift from the same column once
+ it can combine it with the rest of the panel. Notebook 03
+ §4–§5 demonstrate the gap on `total_touches_all`
+ (standalone 0.531 → GBM lift +0.032 vs LR lift +0.009).
+3. **Check the time window.** If you have any event table
+ with timestamps, cross-check every aggregate feature against
+ `lead_created_at + snapshot_day`. The validation report's
+ `post_snapshot_aggregates` baseline (`$.tiers.intermediate.per_seed[*].baselines.post_snapshot_aggregates`)
+ bench-tests this same idea at scale.
+4. **Treat the train/test split as untrusted.** The split file
+ says one thing; what the model sees during fitting is what
+ matters. Sections 5 and 6 below cover the most common ways
+ the two diverge.
+
+The pattern catalogue below maps each pattern to the recipe
+step it operationalises.
+
+---
+
+## Leakage patterns
+
+### 1. Naming smells the dictionary should already flag
+
+A column whose name mentions `total`, `all`, `lifetime`,
+`final`, `outcome`, or any superlative that crosses the
+prediction horizon is suspicious by default on a snapshot-
+anchored task. `leadforge-lead-scoring-v1` ships exactly one
+such column — `total_touches_all` — and the
+`feature_dictionary.csv` row for it sets `leakage_risk=True`
+and explains *why* in the description.
+
+**How to detect on any dataset.** Grep the column list for
+`*_total`, `*_all`, `*_lifetime`, `*_final`, `*_outcome`,
+`current_*`, `is_*` (especially `is_won`, `is_closed`).
+Cross-check each hit against the dataset's stated prediction
+horizon and snapshot anchor. If the column name implies a
+window the snapshot can't have observed, the dictionary should
+either flag it or rename it; if neither, that's a `documentation`
+issue at minimum and probably `critical-leakage`.
+
+**Worked example.** Notebook 03 §2 shows the dictionary read
+in three lines of pandas; the column it surfaces is
+`total_touches_all`.
+
+### 2. The standalone-AUC undersell (tree-friendly leakage)
+
+A feature can score ~0.5 AUC as a single-column ranker and
+still hand a tree model material lift once interactions with
+other columns are available. The validation report's
+`post_snapshot_aggregates` baseline (HistGBM on the trap
+column alone, see
+[`leadforge/validation/release_quality.py`](../../leadforge/validation/release_quality.py))
+gives ~0.55 AUC on intermediate (median across seeds 42–46;
+0.52–0.61 across all tier × seed pairs) — the trap "looks"
+innocuous even when scored by a tree model on its own.
+Notebook 03 §5 then runs a full panel ablation and HistGBM
+extracts +0.032 AUC; LR with the same preprocessing only
+extracts +0.009 because it can't represent the relevant
+interaction.
+
+**How to detect on any dataset.** Don't audit leakage with
+single-feature AUC. For every column you flagged in pattern 1,
+fit two tree models on the same train/test split — one with
+the column, one without — and read the AUC delta. A delta
+larger than your sampling noise is a flag, regardless of the
+standalone number.
+
+**Worked example.** Notebook 03 §4 (standalone) and §5
+(ablation), with the side-by-side bar chart in §5.1. The
+sign-aware tolerance gate in §6 (`MIN_GBM_LIFT = 0.015`)
+formalises the asymmetry as a CI assertion.
+
+### 3. Time-window violations on engineered features
+
+The non-negotiable rule: no feature on a snapshot-anchored
+task may use events later than `lead_created_at + snapshot_day`.
+The public bundle's event tables (`touches`, `sessions`,
+`sales_activities`, `opportunities`) are pre-filtered to
+satisfy this rule (notebook 02 §3 verifies the contract on
+the bundle as shipped, including a *minimum headroom under
+cutoff* readout). The hazard you can still create yourself is
+to engineer a feature that joins back to a non-event table
+without filtering — for instance, joining `customers` (which
+exists only for *converted* leads) into a feature panel.
+
+**How to detect on any dataset.** For every per-lead
+aggregate you build, write the query as `SELECT … WHERE
+event.timestamp <= lead.created_at + INTERVAL '<snapshot_day> days'`
+explicitly, even when the underlying table is already filtered.
+Run the same SQL against both the instructor companion (full-
+horizon tables) and the public bundle: with the explicit
+filter the two results should match, and any difference means
+you are relying on rows that exist only in the unfiltered view.
+
+**Worked example.** Notebook 02 §3 implements the per-table
+inline assertion. The validation report's
+`$.tiers.<tier>.per_seed[*].baselines.post_snapshot_aggregates`
+HistGBM AUC documents what a model can recover when the rule
+is intentionally violated.
+
+### 4. Target-encoding leakage on test
+
+Mean-target encoding of a categorical feature is a textbook
+hazard: fit the encoding on the *full* train+test population
+and you've leaked test labels into the feature. Notebook 02
+§4.4 demonstrates the train-only-fit posture on `industry`
+(four industries — logistics, healthcare_non_clinical,
+manufacturing, professional_services — encoded by their
+training-split conversion rate, with a global-mean fallback
+for industries not seen in train). The leakage variant is a
+one-liner — `pd.concat([train, test]).groupby('industry')['target'].mean()`
+— and the notebook deliberately doesn't show it, because the
+lesson there is the discipline. This guide shows the leakage
+form (above) so you recognise it during code review.
+
+**How to detect on any dataset.** When mean-target encoding
+shows up in a notebook or pipeline, check three things in
+order: (a) the encoding's `.fit()` call sees only training
+labels; (b) the same encoding is applied to test via merge
+or join, never re-fitted; (c) categories present in test but
+not train fall back to a deterministic value (global mean is
+fine; computing a fallback from test is not). If the encoding
+is fit on test labels even partially — including via a
+"smoothed" encoder that uses pooled train+test counts — you
+have target leakage.
+
+**Worked example.** Notebook 02 §4.4 (train-only fit) and
+§4.5 (the merge that applies the encoding to test). The
+fallback-to-train-mean handling is in `attach_engineered`.
+
+---
+
+## Split discipline
+
+### 5. Train-test contamination
+
+The bundle ships a deterministic 70/15/15 split on `lead_id`
+(see `tasks/converted_within_90_days/task_manifest.json`). That guarantees
+`lead_id` uniqueness across splits — but `account_id` and
+`contact_id` are *not* split on. On the as-shipped intermediate
+bundle, **518 of 557 test accounts (93 %) also appear in train**,
+and the contact-level overlap is similar in magnitude (the
+split is `lead_id`-keyed and `account_id` / `contact_id` are
+shared foreign keys); the same proportions hold on intro and
+advanced because the splitter is tier-invariant. Models can
+ride account- or contact-level signal across the split boundary
+in ways that don't generalise to a fresh account or fresh
+contact.
+
+**How to detect on any dataset.** Repeat the snippet below per
+group key — every reusable foreign-key column the dataset
+exposes (`account_id`, `contact_id`, and any derived strata
+like `industry × region` you bake into engineered features) is
+a separate group-leakage axis.
+
+```python
+import pandas as pd
+train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet")
+test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet")
+for key in ("account_id", "contact_id"):
+ overlap = set(train[key]) & set(test[key])
+ print(f"shared {key}: {len(overlap)} / {test[key].nunique()}")
+```
+
+If any overlap is non-empty *and* you've engineered any
+group-level features, retrain with group-aware splitting
+(e.g. `GroupKFold` on the relevant key) and re-read the AUC
+delta. The delta is the amount of "free" lift the random-split
+was buying you. The right framing isn't "remove the leak"; it's
+*report both numbers so the reader knows which is which.*
+
+**Worked example.** Notebook 02 §4.2 builds an account-level
+density feature using *only* train leads' touches — a
+defensive posture against this hazard. The
+`tasks/converted_within_90_days/task_manifest.json` records
+the split policy and is the right artefact to cite when filing
+an issue under this label. A bundle-level group-overlap audit
+isn't included in v1 — the validation report's split-leakage
+probe (`probe_split_id_overlap`) checks `lead_id` only;
+extending it to enumerate `account_id` and `contact_id`
+overlap is a `v2-idea` candidate.
+
+### 6. Cohort-by-segment evaluation
+
+Notebook 04 §7 demonstrates **tier-wide** cohort shift —
+sort leads chronologically, train on the first 85 %, score
+the last 15 % — and finds intermediate cohort-split AUC
+sits *higher* than random-split AUC by ~0.0155 (the v1
+simulator has no time drift baked in over the 90-day horizon).
+The richer stress test is **per-segment** cohort shift:
+chronological resplit *within* each industry, region, or
+revenue tier, and read the same delta per segment. Segment-
+conditional drift can hide inside a stable tier-wide number
+— industry A drifting up by 0.04 cancels industry B drifting
+down by 0.04 in the average.
+
+**How to detect on any dataset.** For each segment column
+(`industry`, `region`, `employee_band`,
+`estimated_revenue_band`), repeat the cohort-split protocol
+from notebook 04 §7 conditioned on that segment. Report the
+per-segment AUC degradation and the spread across segments.
+A spread larger than the tier's cross-seed GBM-AUC band
+(`$.tiers.<tier>.spreads.gbm_auc` — same model the cohort-shift
+block uses) is a realism flag: the simulator is producing a
+homogeneous world that real production cohorts wouldn't be.
+
+**Worked example.** Notebook 04 §7 (tier-wide, validator-
+mirrored). The validation report's `cohort_shift.<tier>.auc_degradation`
+field gives the v1 baseline you're trying to refine. v1
+intentionally runs only the tier-wide check; the per-segment
+audit is a `v2-idea` candidate.
+
+---
+
+## Metric and ranking traps
+
+### 7. Value-aware ranking surprises
+
+P(convert) ranking and `P(convert) × expected_acv` ranking
+are both reasonable depending on the operational question.
+Notebook 04 §5 shows the gap on this bundle — at top-50, ACV
+capture jumps from 0.16 (P-only) to 0.40 (P × ACV). The trap
+is reaching for one metric when the operational question
+demands the other and not noticing the inversion. AUC ranks
+*everything* by P(convert); a salesperson with capacity for
+50 leads cares about revenue-weighted top-50 capture.
+
+**How to detect on any dataset.** Compute both `precision_at_k`
+and `expected_acv_capture_at_k` for the same top-K. If their
+ranking of model variants disagrees, that's a finding — at
+minimum a `pedagogy` issue, possibly `realism` if the gap is
+so large it suggests the simulator's ACV column has unrealistic
+correlation with P(convert).
+
+**Worked example.** Notebook 04 §5 produces both curves
+side-by-side; the validation report's per-seed scalars live
+under
+`$.tiers.<tier>.per_seed[*].expected_acv_capture_at_k.50`
+(and `.100` for top-100), keyed by string K.
+
+### 8. Threshold-vs-rank semantics
+
+A `probability >= threshold` operating point and a `top-K by
+rank` operating point are not the same thing when probabilities
+have ties. Notebook 04 §6 picks a threshold that "should"
+admit 50 leads and reads back `actually_above` as a defensive
+instrument — on the as-shipped intermediate bundle the realised
+count matches capacity, but the readout exists so a seed where
+ties cluster at the operating probability fails loud rather
+than silently inflating the slate.
+
+**How to detect on any dataset.** When you set a probability
+threshold for a fixed-capacity decision, always log the
+*realised* count above threshold, not just the threshold value.
+If realised > capacity by more than a few percent, ties are
+inflating the slate and you need either a finer probability
+grid (less likely to help on a calibrated model) or a
+secondary rank score to break ties.
+
+**Worked example.** Notebook 04 §6 prints
+`capacity / threshold / actually_above / precision / recall`
+and walks through the threshold sweep for context. The
+calibration-bin output in §3 is the related receipt — a model
+with poor bin-error is more likely to have ties at common
+probabilities.
+
+---
+
+## Robustness and realism
+
+### 9. Calibration drift across cohorts and segments
+
+The validation report tracks `calibration_max_bin_error`
+per tier (`$.tiers.<tier>.medians.calibration_max_bin_error`)
+— intermediate ~0.25, intro ~0.25, advanced ~0.52. That's a
+single number per tier on a single split; in principle it can
+mask segment-conditional miscalibration. Whether v1 actually
+exhibits such drift is an open question — the per-segment
+audit is the way to find out. Notebook 04 §3 shows the
+tier-level reliability diagram on the public bundle; the
+analogous per-segment diagram is the next stress test.
+
+**How to detect on any dataset.** Reproduce notebook 04 §3's
+binning protocol *within* each segment column you care about
+(`industry`, `region`, `employee_band`,
+`estimated_revenue_band`). Report `max_bin_error` per segment
+and the spread across segments. A segment whose max-bin-error
+is materially worse than the tier-level number is a `realism`
+finding — the world isn't producing the correlation structure
+between segment and outcome that real production data would.
+
+**Worked example.** Notebook 04 §3 covers the tier-level
+case end-to-end. The cohort-shift block in §7 is the
+chronological analogue (calibration over time, in
+expectation, via AUC degradation as a coarse summary). v1
+doesn't ship a per-segment calibration audit; it's a
+`v2-idea`.
+
+---
+
+## What to do when you find one
+
+1. Reproduce the finding from a clean checkout against the
+ as-shipped bundle. Note the seed, tier, and the test-split
+ sha256 from `manifest.json` — under
+ `tasks.converted_within_90_days.test_sha256`. That single
+ hash uniquely identifies the bundle the finding was
+ reproduced on; the manifest also carries per-table hashes
+ under `tables.<table>.sha256` if a table-specific hash is
+ the right anchor for the finding.
+2. Pick the issue template that fits — leakage / contamination
+ / metric findings go in
+ [`dataset_breakage_report.yml`](../../.github/ISSUE_TEMPLATE/dataset_breakage_report.yml);
+ distributional / realism critiques go in
+ [`realism_feedback.yml`](../../.github/ISSUE_TEMPLATE/realism_feedback.yml).
+3. Suggest a triage label from the table at the top of this
+ guide. The maintainer applies the final label.
+4. Watch [`v2_decision_log.md`](v2_decision_log.md) for the
+ disposition. Accepted findings get an entry with a verdict
+ (`accepted-for-v2`, `deferred`, `wont-fix`,
+ `needs-investigation`) and a pointer to the resulting v2
+ work item.
diff --git a/release/docs/channel_signal_audit.md b/release/docs/channel_signal_audit.md
new file mode 100644
index 0000000..2cc3d56
--- /dev/null
+++ b/release/docs/channel_signal_audit.md
@@ -0,0 +1,66 @@
+# Channel-signal audit — leadforge-lead-scoring-v1
+
+Audit produced by `scripts/audit_channel_signal.py`; see `channel_signal_audit.json` for the machine-readable form.
+
+**Scope.** For every tier we compute per-channel conversion rates on the train split and the univariate AUC of channel against `converted_within_90_days`, scored as the empirical positive rate per channel (a 1-D Bayes classifier). Two AUCs are reported: an **in-sample** number (train rates → train labels — biased upward by construction) and an **out-of-sample** number (train rates → test labels — directly comparable to the `source_only` baselines in `release/validation/validation_report.json`).
+
+**Caveat on the industry benchmark.** The G2 / Gemini v2 numbers below are single-step **MQL→SQL** rates (recommendation #8 in `docs/external_review/summaries/recommendations_pass.md`). v1's label is **90-day closed-won**, the entire funnel resolved. The two metrics are not directly comparable; the table is reproduced for context only.
+
+## Industry benchmark (context, not target)
+
+| Channel | MQL→SQL conversion rate |
+|---|---|
+| Email | 0.50% |
+| PPC | 26.00% |
+| SEO | 51.00% |
+
+## Tier: `intro`
+
+`n_train = 3500` (90-day conversion rate 41.46%); `n_test = 750` (rate 42.67%).
+
+### Columns: `lead_source`, `first_touch_channel` (audit values identical)
+
+Per-channel rate spread (max − min): **0.0433** · In-sample univariate AUC: **0.5200** · Out-of-sample univariate AUC: **0.5014**
+
+| Channel | n (train) | Share (train) | Converted (train) | Train rate |
+|---|---:|---:|---:|---:|
+| `inbound_marketing` | 1570 | 44.86% | 682 | 43.44% |
+| `partner_referral` | 698 | 19.94% | 273 | 39.11% |
+| `sdr_outbound` | 1232 | 35.20% | 496 | 40.26% |
+
+## Tier: `intermediate`
+
+`n_train = 3500` (90-day conversion rate 20.14%); `n_test = 750` (rate 22.27%).
+
+### Columns: `lead_source`, `first_touch_channel` (audit values identical)
+
+Per-channel rate spread (max − min): **0.0365** · In-sample univariate AUC: **0.5212** · Out-of-sample univariate AUC: **0.5139**
+
+| Channel | n (train) | Share (train) | Converted (train) | Train rate |
+|---|---:|---:|---:|---:|
+| `inbound_marketing` | 1570 | 44.86% | 334 | 21.27% |
+| `partner_referral` | 698 | 19.94% | 123 | 17.62% |
+| `sdr_outbound` | 1232 | 35.20% | 248 | 20.13% |
+
+## Tier: `advanced`
+
+`n_train = 3500` (90-day conversion rate 7.91%); `n_test = 750` (rate 7.87%).
+
+### Columns: `lead_source`, `first_touch_channel` (audit values identical)
+
+Per-channel rate spread (max − min): **0.0056** · In-sample univariate AUC: **0.5083** · Out-of-sample univariate AUC: **0.5226**
+
+| Channel | n (train) | Share (train) | Converted (train) | Train rate |
+|---|---:|---:|---:|---:|
+| `inbound_marketing` | 1570 | 44.86% | 128 | 8.15% |
+| `partner_referral` | 698 | 19.94% | 53 | 7.59% |
+| `sdr_outbound` | 1232 | 35.20% | 96 | 7.79% |
+
+## Discussion
+
+The numbers above answer one question: *how strongly does channel alone signal 90-day conversion in v1?* They do not answer *whether v1 matches industry channel performance*, since the benchmarks measure a different funnel transition (single MQL→SQL step) and v1 measures the entire funnel resolved over 90 days. Treat the v1 numbers as an internal description of the simulator's channel signal.
+
+Two empirical observations a reader can make from the numbers above:
+
+1. **The out-of-sample univariate AUC is the comparable number** for any external baseline. It uses train-derived rates scored against held-out test labels — the same shape as the `source_only` HistGBM baseline reported in `release/validation/validation_report.json`, which is built on the same task splits with `lead_source` + `first_touch_channel` as the only features. The in-sample number is biased upward by construction — small at v1's N but visible — and is reported here for transparency rather than comparison.
+2. **The numerical conclusion is bundle-specific.** When the per-channel rate spread is small and the OOS univariate AUC is close to chance, channel alone is a weak feature for the bundle this audit was run against. v1's bundles currently produce that outcome (see the per-tier sections above) — consistent with the design: the simulator drives conversion through motif-family hazards keyed off latent traits, not channel-conditional probabilities. Channel-conditional encoding is tracked as post-v1 work in `docs/release/post_v1_roadmap.md`.
diff --git a/release/docs/feature_dictionary.md b/release/docs/feature_dictionary.md
new file mode 100644
index 0000000..790354a
--- /dev/null
+++ b/release/docs/feature_dictionary.md
@@ -0,0 +1,210 @@
+# Feature dictionary — `leadforge-lead-scoring-v1`
+
+Narrative companion to the per-tier `feature_dictionary.csv` shipped
+inside each public bundle. The CSV is the authoritative
+machine-readable spec (column / dtype / description / category /
+target flag / leakage flag); this document groups features by
+analytical role and adds the prose explanation, modelling
+recommendations, and pedagogical caveats that don't fit a CSV row.
+
+The grouping below covers every feature in the public student-facing
+snapshot — the same 32 columns ship in `intro`, `intermediate`, and
+`advanced` bundles. The instructor companion adds the hidden truth
+in `metadata/`; it does not change the feature list.
+
+| Category | Columns | Modelling default |
+|---|---|---|
+| Lead identity & timing | 4 | drop `lead_id`; keep `lead_created_at` for cohort splits, drop for production |
+| Lead source & channel | 2 | keep both |
+| Firmographics | 5 | keep all |
+| Personographics | 3 | keep all (categorical encoders welcome) |
+| Engagement (snapshot-window) | 10 | keep all |
+| Funnel & sales-process | 4 | keep all |
+| Value | 2 | keep all |
+| Leakage trap | 1 | **drop** unless deliberately demonstrating leakage |
+| Target | 1 | label — never used as a feature |
+
+## Lead identity and timing
+
+| Column | Dtype | Source | Modelling notes |
+|---|---|---|---|
+| `lead_id` | string | identity | Opaque, deterministic per run; not informative. Use as a join key or row index, never as a feature. |
+| `account_id` | string | identity | Foreign key into `tables/accounts.parquet`. Out-of-sample accounts may appear in test; if you fit account-level features, watch for cold-start. |
+| `contact_id` | string | identity | Foreign key into `tables/contacts.parquet`. Same warning. |
+| `lead_created_at` | string (ISO-8601) | simulation clock | Lead birthday; useful for cohort/time-shift evaluation (see `docs/release/v1_acceptance_gates.md` G6.4). Drop or bin it for production models — feeding raw timestamps to a linear model is rarely what you want. |
+
+## Lead source and channel
+
+Two columns describe how each lead entered the funnel. They are
+populated from the recipe's GTM-motion mix
+(`inbound_marketing` 45%, `sdr_outbound` 35%, `partner_referral`
+20%) and are identical between the two columns in v1 — both encode
+the same origination channel under different field names.
+
+| Column | Dtype | Why it might matter |
+|---|---|---|
+| `lead_source` | string | Origination channel; one of `inbound_marketing` / `sdr_outbound` / `partner_referral`. |
+| `first_touch_channel` | string | Marketing channel of the first recorded touch. Always equals `lead_source` in v1; the field exists to support post-v1 work where origination and first-touch can diverge. |
+
+**Caveat.** Per [`docs/release/channel_signal_audit.md`](channel_signal_audit.md),
+v1's channel signal is weak: per-channel rate spread ≤ 0.044 and
+univariate AUC ≤ 0.523 (in- or out-of-sample) across all tiers, well
+below the G2 / Gemini v2 industry MQL→SQL band (SEO ~51%, PPC ~26%, Email <1%).
+Expect modest feature importance from these columns; do not expect
+channel to be a top-tier predictor in v1.
+
+## Firmographics (account-level)
+
+These describe the buying organisation. They come from the recipe's
+narrative spec (industry, region, employee bands, revenue bands)
+and from latent traits sampled per account. Five columns plus the
+`account_id` foreign key listed under "Lead identity and timing"
+above; all five are fair to use as features.
+
+| Column | Dtype | Why it might matter |
+|---|---|---|
+| `industry` | string | Categorical mix is fixed by the recipe (`manufacturing`, `logistics`, `professional_services`, `healthcare_non_clinical`); motif-family latent biases create modest cross-industry conversion-rate differences. |
+| `region` | string | `US` / `UK`. Currently a low-signal axis — the simulator does not model channel-by-region interactions. |
+| `employee_band` | string | Bands are aligned with the ICP range (200–2,000 employees, plus tails). Larger accounts trend toward higher expected ACV. |
+| `estimated_revenue_band` | string | Bands span `$1M-$10M` to `$200M+`; correlated with `employee_band` by design. |
+| `process_maturity_band` | string | A discretisation of the latent `process_maturity` trait — *visible* signal of `motif_family.fit_dominant`'s "fit beats engagement" story. |
+
+## Personographics (contact-level)
+
+These describe the primary contact attached to the lead. Three
+categorical features (the `contact_id` foreign key is listed
+under "Lead identity and timing"); all three are fair to use.
+
+| Column | Dtype | Why it might matter |
+|---|---|---|
+| `role_function` | string | Functional area: `finance`, `ops`, `it`, `procurement`. Drives demo-page views and the demo/trial path through `motif_family.demo_trial_mediated`. |
+| `seniority` | string | `c_suite` / `vp` / `director` / `manager` / `individual_contributor`. Strongly correlated with the latent `contact_authority` trait that gates `motif_family.buying_committee_friction`. |
+| `buyer_role` | string | `economic_buyer`, `champion`, `technical_evaluator`, `end_user`. Hand-mapped from `role_function` × `seniority`. |
+
+## Engagement (snapshot-window aggregates)
+
+Ten engagement features computed strictly over events on days
+`[0, snapshot_day]` (with `snapshot_day = 30` for v1). The simulator
+emits touches, sessions, and page views every day from
+`lead_created_at` onward; the renderer aggregates them up to but
+not past day 30. The 90-day label window resolves separately, so
+features cannot encode events that drove the late-window outcome.
+
+| Column | Dtype | What it captures |
+|---|---|---|
+| `touch_count` | Int64 | All marketing/sales touches in the snapshot window. |
+| `inbound_touch_count` | Int64 | Inbound touches only. |
+| `outbound_touch_count` | Int64 | Outbound touches only. |
+| `session_count` | Int64 | Web/trial session count. |
+| `pricing_page_views` | Int64 | Cumulative pricing-page views across sessions. |
+| `demo_page_views` | Int64 | Cumulative demo-page views across sessions. |
+| `total_session_duration_seconds` | Int64 | Cumulative seconds across all sessions. |
+| `touches_week_1` | Int64 | Touches in days 0–7 inclusive (early urgency proxy; the snapshot builder uses `_day <= 7`, which is 8 day values). |
+| `touches_last_7_days` | Int64 | Touches in the last 7 days of the snapshot window — for `snapshot_day=30`, days 24–30 inclusive (the snapshot builder uses `_day > snapshot_day - 7`). |
+| `days_since_first_touch` | Float64 | NaN if the lead has had zero touches by snapshot day. |
+
+## Funnel and sales-process
+
+The funnel state at snapshot day, exposed via four columns. None of
+these are terminal stages — `current_stage` (which can encode
+`closed_won` / `closed_lost`) is redacted from public bundles via
+the exposure layer.
+
+| Column | Dtype | What it captures |
+|---|---|---|
+| `activity_count` | Int64 | Sales-activity events (calls, demos, follow-ups) in the snapshot window. |
+| `days_since_last_touch` | Float64 | Recency of the most recent touch; NaN if zero touches. |
+| `opportunity_created` | boolean | Whether *any* opportunity was created by snapshot day, regardless of state. |
+| `has_open_opportunity` | boolean | Whether an opportunity existed in an open stage at snapshot day. |
+
+## Value
+
+Two value features. Both are useful as inputs to value-aware
+ranking (`expected_acv × P(convert)`); see notebook 04 §5 for a
+worked comparison of P-only and P × ACV ranking.
+
+| Column | Dtype | What it captures |
+|---|---|---|
+| `opportunity_estimated_acv` | Float64 | Estimated ACV of the most recent open opportunity at snapshot day; NaN if no opportunity. |
+| `expected_acv` | Float64 | Falls back to a revenue-band midpoint heuristic when no opportunity exists, so it has fewer NaNs than `opportunity_estimated_acv`. |
+
+## Leakage trap (deliberate)
+
+| Column | Dtype | Why it ships |
+|---|---|---|
+| `total_touches_all` | Int64 | Counts touches across the full 90-day horizon — not the snapshot window. Flagged `leakage_risk=True` in the CSV (the per-bundle dictionary has columns `name,dtype,description,category,is_target,leakage_risk`); documented in `release/README.md`. The gap `total_touches_all − touch_count` carries label-correlated signal because high-converting leads accumulate more late-window touches in the simulator. **Drop this column from your features unless you are explicitly demonstrating leakage detection.** |
+
+## Target
+
+| Column | Dtype | Definition |
+|---|---|---|
+| `converted_within_90_days` | boolean | True iff a `closed_won` event occurred within 90 days of `lead_created_at`. Derived from simulated events; never sampled directly. |
+
+## Difficulty modulation
+
+Difficulty profiles distort the same feature set with different
+parameters; columns and dtypes are identical across tiers. The
+distortions are applied in `leadforge/render/snapshots.py` via
+`_apply_difficulty_distortions()`:
+
+- **Gaussian noise** on float features. `intro` 0.10, `intermediate`
+ 0.30, `advanced` 0.55 (multipliers applied to per-feature
+ standard deviations).
+- **MCAR missingness.** `intro` 2%, `intermediate` 8%,
+ `advanced` 18%.
+- **Outlier injection** at the same per-tier rate as missingness.
+- **Signal strength.** Latent-score weights are multiplied by 0.90
+ (`intro`), 0.70 (`intermediate`), and 0.50 (`advanced`),
+ weakening the link between latent traits and conversion as
+ difficulty rises.
+
+The conversion-rate band for each tier is recipe-defined; observed
+medians across the canonical seed sweep (42–46) are
+0.4267 (`intro`), 0.2160 (`intermediate`), 0.0840 (`advanced`).
+See `release/validation/validation_report.md` for the full
+cross-seed × cross-tier metrics panel.
+
+## Recommended modelling defaults
+
+A short opinionated checklist for a first model. Note: the flat
+`lead_scoring.csv` and the per-task Parquet splits ship every column
+in the table above, including the IDs — the recommendation is what to
+**use as features**, not what's in the file.
+
+1. **Identifiers — drop before fitting.** `lead_id` is opaque and
+ carries no signal; drop it. `account_id` / `contact_id` are joinable
+ keys, useful only when you're computing cross-table aggregates;
+ drop from the feature matrix unless you actually use them. Drop or
+ bin `lead_created_at` — feeding raw timestamps to a linear model
+ is rarely what you want; use it as the cohort key for time-shift
+ evaluation instead.
+2. **Trap — drop.** `total_touches_all` is the deliberate leakage
+ trap. Drop unless you're demonstrating leakage detection.
+3. **Categoricals — encode.** One-hot or target-encode `industry`,
+ `region`, `employee_band`, `estimated_revenue_band`,
+ `process_maturity_band`, `role_function`, `seniority`,
+ `buyer_role`, `lead_source`, `first_touch_channel`. The two
+ channel columns carry identical values in v1; pick one.
+4. **Engagement and funnel — keep all.** The `Float64` columns carry
+ NaN for "no event in window", which is itself a signal — encode
+ missingness explicitly rather than imputing to zero blindly.
+5. **Value-aware ranking.** Use `expected_acv` over
+ `opportunity_estimated_acv`; the latter is missing for leads
+ without an opportunity. Multiply by your model's predicted
+ probability for a default value-weighted ranker.
+6. **Cohort evaluation.** Sort by `lead_created_at` and split
+ chronologically; the random-split AUC is *not* the right number to
+ report if your downstream use is forecasting.
+
+## See also
+
+- `release/{intro,intermediate,advanced}/feature_dictionary.csv` —
+ the authoritative machine-readable spec, regenerated with each
+ bundle.
+- `release/README.md` — the dataset card.
+- `docs/release/generation_method.md` — how the underlying
+ events are generated.
+- `docs/release/channel_signal_audit.md` — how strongly each
+ channel column signals conversion in v1.
+- `release/validation/validation_report.md` — calibration, lift,
+ P@K, model-family deltas, cross-seed bands.
diff --git a/release/docs/generation_method.md b/release/docs/generation_method.md
new file mode 100644
index 0000000..12029d3
--- /dev/null
+++ b/release/docs/generation_method.md
@@ -0,0 +1,166 @@
+# Generation method — `leadforge-lead-scoring-v1`
+
+A standalone summary of how the dataset is generated, written for
+external readers. Read this before opening the bundle if you want to
+know what the data is and how much you can trust each piece of it; for
+the full architecture, see [`docs/leadforge_architecture_spec.md`].
+
+## What the dataset is
+
+`leadforge-lead-scoring-v1` is a synthetic mid-market B2B SaaS
+lead-scoring dataset generated by
+[leadforge](https://github.com/leadforge-dev/leadforge), an
+open-source Python framework. Every row, event, and edge is produced
+by code in this repository — there is no real CRM behind the data.
+The generator is deterministic given a fixed
+`(recipe, configuration, seed, package version)` tuple, and the
+recipe and seed are recorded in each bundle's `manifest.json`.
+
+The published family contains three difficulty tiers — `intro`,
+`intermediate`, and `advanced` — sharing one fictional company
+narrative ("Veridian Procure", a procurement / AP automation SaaS).
+The tiers differ only in noise, missingness, and signal strength,
+modulated by a difficulty profile that the simulator consumes; the
+underlying causal structure is identical. A separate
+`*_instructor` companion ships the full hidden truth (causal graph,
+latent registry, mechanism summary, full-horizon relational tables).
+
+## Generation pipeline at a glance
+
+Generation runs in five layers, top to bottom. Every layer is
+deterministic, every layer is seeded from a single root via named
+substreams, and every layer is testable in isolation.
+
+1. **Hidden world structure.** A directed acyclic graph (DAG) of
+ latent traits, lead states, sales-process states, and the
+ `Converted within 90 days` outcome node, sampled from one of five
+ *motif families* and then perturbed by stochastic rewiring. The
+ motif families are intentionally non-uniform: `fit_dominant`,
+ `intent_dominant`, `sales_execution_sensitive`,
+ `demo_trial_mediated`, `buying_committee_friction`. Two
+ independently-sampled bundles share neither the exact graph nor
+ the edge weights, but they share the constraint that the graph is
+ acyclic, every node is reachable from a root, and the outcome
+ node is reachable from every non-root subgraph.
+2. **Mechanism layer.** Every node in the sampled graph receives a
+ concrete mechanism — a logistic latent score, a Poisson intensity
+ for touch counts, a recency-decayed engagement intensity for
+ sessions, a categorical influence for source channel, a stage
+ transition hazard, a conversion hazard, etc. Mechanisms are
+ assigned by motif family, so a `fit_dominant` graph and an
+ `intent_dominant` graph end up with materially different
+ behavior at simulation time. Mechanism parameters are calibrated
+ so each tier hits its target conversion-rate band; the
+ `intermediate` tier is the canonical difficulty profile.
+3. **Population layer.** Accounts (1,500), contacts (4,200), and
+ leads (5,000) are drawn with deterministic foreign keys and
+ ID-stable namespaces (`acct_000001`, `lead_000001`, …). Each
+ entity carries a vector of latent traits seeded from the world
+ graph: account fit, process maturity, contact authority,
+ problem awareness, urgency, etc. Industry, region, employee
+ band, role, and seniority are all drawn from the recipe's
+ narrative spec; firmographic correlations come from
+ motif-family latent biases applied during sampling.
+4. **Simulation engine.** A 90-day discrete-time simulator
+ advances every lead day-by-day from MQL through the funnel
+ (`mql → sal → sql → demo_scheduled → demo_completed →
+ proposal_sent → negotiation → closed_won/closed_lost`). Each
+ day, hazards from the mechanism layer fire: stage transitions,
+ touches (inbound vs outbound, recency-decayed), web sessions
+ (pricing-page views, demo-page views), sales activities,
+ churn, and direct conversion for unusual fast paths. Once a
+ lead reaches `closed_won`, opportunities, customers, and
+ subscriptions materialise with deterministic foreign keys.
+ `converted_within_90_days` is *event-derived*: it is true iff
+ a `closed_won` event occurred within the configured label
+ window, never sampled directly.
+5. **Snapshot rendering.** For every lead, the renderer freezes a
+ feature snapshot at `snapshot_day` (30 days for v1).
+ Aggregates such as `touch_count`, `session_count`,
+ `pricing_page_views`, `expected_acv`, and
+ `days_since_last_touch` only see events on days
+ `[0, snapshot_day]`; the label resolves over the full 90-day
+ horizon. The deliberate exception is `total_touches_all`,
+ which counts the full-horizon touch history and is flagged as
+ a pedagogical leakage trap in the feature dictionary.
+
+## Bundle output
+
+Each bundle writes a fixed directory layout — a manifest, dataset
+card, feature dictionary, relational tables, and the
+`converted_within_90_days` task split. The manifest records the
+recipe, seed, package version, exposure mode, snapshot day, label
+window, schema version, table inventory with row counts, SHA-256
+hashes for every file, and the exact set of redacted columns. Two
+runs with the same `(recipe, seed, version)` produce byte-identical
+bundles modulo the wall-clock `generation_timestamp` field;
+`scripts/verify_hash_determinism.py` enforces this.
+
+The public (`student_public`) bundle and the instructor companion
+share the same generator run; they differ only in *what is
+published*. Filtering happens during rendering, not during
+simulation:
+
+- Public bundles route relational tables through
+ `to_dataframes_snapshot_safe`, which (a) filters event tables
+ per-lead by `lead_created_at + snapshot_day`, (b) drops
+ terminal-state columns from `leads` and `opportunities`, and
+ (c) omits `customers` and `subscriptions` entirely (their
+ presence is conversion-conditional).
+- Instructor companions skip the snapshot-safe writer and ship
+ full-horizon tables plus a `metadata/` directory containing the
+ hidden world graph, latent registry, mechanism summary, and
+ full world spec. They are not appropriate input for the
+ student-facing task.
+
+The exact column lists are pinned by `BANNED_LEAD_COLUMNS`,
+`BANNED_OPP_COLUMNS`, `BANNED_TABLES`, and
+`SNAPSHOT_FILTERED_TABLES` in
+`leadforge/validation/leakage_probes.py`; the validator imports the
+same constants the writer uses, so the contract is single-sourced.
+
+## Calibration and validation
+
+Difficulty calibration is empirical, not analytic: the
+intermediate tier is sampled, the conversion-rate band is checked,
+and the signal-strength multiplier is tuned until five seeds
+(42–46) hit the target band with stable variance. The intro and
+advanced tiers reuse the same mechanism assignments with different
+distortion parameters (Gaussian noise on float features, MCAR
+missingness, outlier injection) calibrated the same way.
+
+Every claim made about realism, calibration, or difficulty is
+backed by `release/validation/validation_report.md`, which is
+regenerated by `scripts/validate_release_candidate.py`. The driver
+runs the full release-quality panel — per-tier ROC-AUC, PR-AUC, log
+loss, Brier, calibration bins, lift, P@K, top-decile rate,
+expected-ACV capture, model-family deltas, cross-seed bands,
+random-vs-cohort split degradation, and the full leakage probe
+taxonomy — and exits non-zero if anything falls outside the bands
+declared in `docs/release/v1_acceptance_gates_bands.yaml`.
+
+## What this is not
+
+- Not a substitute for real CRM data. The vertical, narrative,
+ and motif families are deliberate fictions chosen to teach
+ lead-scoring patterns without exposing real customer data.
+- Not a benchmark. The difficulty tiers are calibrated for
+ pedagogy, not for cross-paper comparability.
+- Not a temporally rich dataset. The simulator runs in
+ daily steps over a 90-day horizon. Sales-cycle distributions
+ are whatever falls out of the daily hazards, not log-normal /
+ Weibull tails. Demographic strings are clean (no
+ free-text-job-title messiness). Both are tracked as post-v1
+ scope in `docs/release/post_v1_roadmap.md`.
+
+## Further reading
+
+For the deeper design rationale — why a DAG, why motif families,
+why event-derived labels, why public-vs-instructor — see
+[`docs/leadforge_design_doc.md`] and
+[`docs/leadforge_architecture_spec.md`]. Both documents are aimed at
+contributors and document the package internals; this doc stays at
+the conceptual level external readers need.
+
+[`docs/leadforge_design_doc.md`]: ../leadforge_design_doc.md
+[`docs/leadforge_architecture_spec.md`]: ../leadforge_architecture_spec.md
diff --git a/release/docs/relational_table_schemas.csv b/release/docs/relational_table_schemas.csv
new file mode 100644
index 0000000..60ee1db
--- /dev/null
+++ b/release/docs/relational_table_schemas.csv
@@ -0,0 +1,65 @@
+table,column,dtype,description,bundle_visibility
+accounts,account_id,string,"Opaque account identifier (e.g. ``acct_000001``). Primary key.",public+instructor
+accounts,company_name,string,"Synthetic display name for the account (fictional). Not a feature in the snapshot.",public+instructor
+accounts,industry,string,"Industry vertical of the buying organisation; one of the recipe's industry vocabulary.",public+instructor
+accounts,region,string,"Geographic region of the account's headquarters (e.g. ``US``, ``UK``).",public+instructor
+accounts,employee_band,string,"Banded employee headcount of the account (e.g. ``200-500``, ``500-1000``, ``1000-2000``).",public+instructor
+accounts,estimated_revenue_band,string,"Banded estimated annual revenue of the account.",public+instructor
+accounts,process_maturity_band,string,"Banded internal process-maturity score of the account (drives ICP fit).",public+instructor
+accounts,created_at,string,"ISO-8601 timestamp when the account was first observed (synthetic creation time).",public+instructor
+contacts,contact_id,string,"Opaque contact identifier (e.g. ``cont_000001``). Primary key.",public+instructor
+contacts,account_id,string,"FK to ``accounts.account_id`` — the buying organisation this contact belongs to.",public+instructor
+contacts,job_title,string,"Free-text job title (fictional). Used only for narrative colour; not a feature.",public+instructor
+contacts,role_function,string,"Functional area of the contact (e.g. ``finance``, ``ops``, ``it``, ``procurement``).",public+instructor
+contacts,seniority,string,"Seniority band of the contact (e.g. ``c_level``, ``vp``, ``director``, ``manager``).",public+instructor
+contacts,buyer_role,string,"Buyer-role classification (``economic_buyer``, ``champion``, ``technical_evaluator``, ``end_user``).",public+instructor
+contacts,email_domain_type,string,"Type of email domain (e.g. ``corporate``, ``free``); never resolves to a real domain.",public+instructor
+contacts,created_at,string,"ISO-8601 timestamp when the contact record was first observed.",public+instructor
+leads,lead_id,string,"Opaque lead identifier (e.g. ``lead_000001``). Primary key for the lead-scoring task.",public+instructor
+leads,contact_id,string,"FK to ``contacts.contact_id`` — the primary contact attached to this lead.",public+instructor
+leads,account_id,string,"FK to ``accounts.account_id`` — the buying organisation this lead belongs to.",public+instructor
+leads,lead_created_at,string,"ISO-8601 timestamp at which the lead was created (= snapshot anchor t=0).",public+instructor
+leads,lead_source,string,"Origination source of the lead (e.g. ``inbound_form``, ``sdr_outbound``, ``partner``).",public+instructor
+leads,first_touch_channel,string,"Marketing channel responsible for the first recorded touch.",public+instructor
+leads,owner_rep_id,string,"Opaque sales-rep id (e.g. ``rep_000001``) owning the lead at snapshot time.",public+instructor
+leads,current_stage,string,"Funnel stage at snapshot time. REDACTED in public bundles — the post-snapshot stage trajectory would leak the outcome.",instructor_only
+leads,is_sql,bool,"Whether the lead has been marked Sales Qualified by snapshot time. REDACTED in public bundles — derived from post-snapshot SDR activity.",instructor_only
+leads,converted_within_90_days,bool,"Target label (event-derived from a ``closed_won`` event within 90 days). REDACTED from ``tables/leads.parquet`` in public bundles; lives in ``tasks/converted_within_90_days/*.parquet`` instead.",instructor_only
+leads,conversion_timestamp,string,"ISO-8601 timestamp of the ``closed_won`` event, or null. REDACTED in public bundles.",instructor_only
+touches,touch_id,string,"Opaque touch identifier. Primary key.",public+instructor
+touches,lead_id,string,"FK to ``leads.lead_id``.",public+instructor
+touches,touch_timestamp,string,"ISO-8601 timestamp of the touch. Public bundles filter to ``<= lead_created_at + snapshot_day`` per the redaction contract.",public+instructor
+touches,touch_type,string,"Mechanism of the touch (e.g. ``email``, ``call``, ``ad_view``, ``content_download``).",public+instructor
+touches,touch_channel,string,"Marketing/sales channel attribution (e.g. ``paid_search``, ``content``, ``cold_outreach``).",public+instructor
+touches,touch_direction,string,"``inbound`` (lead-initiated) or ``outbound`` (vendor-initiated).",public+instructor
+touches,campaign_id,string,"Opaque campaign identifier attached to the touch, or null when unattributed.",public+instructor
+sessions,session_id,string,"Opaque session identifier. Primary key.",public+instructor
+sessions,lead_id,string,"FK to ``leads.lead_id``.",public+instructor
+sessions,session_timestamp,string,"ISO-8601 timestamp of the session start. Public bundles filter to ``<= lead_created_at + snapshot_day``.",public+instructor
+sessions,session_type,string,"Session type (e.g. ``marketing_site``, ``trial``, ``demo``).",public+instructor
+sessions,page_views,int64,"Total page views during the session.",public+instructor
+sessions,pricing_page_views,int64,"Page views landing on a pricing URL during the session.",public+instructor
+sessions,demo_page_views,int64,"Page views landing on a demo URL during the session.",public+instructor
+sessions,session_duration_seconds,int64,"Session duration in seconds.",public+instructor
+sales_activities,activity_id,string,"Opaque sales-activity identifier. Primary key.",public+instructor
+sales_activities,lead_id,string,"FK to ``leads.lead_id``.",public+instructor
+sales_activities,rep_id,string,"Opaque sales-rep id performing the activity.",public+instructor
+sales_activities,activity_timestamp,string,"ISO-8601 timestamp of the activity. Public bundles filter to ``<= lead_created_at + snapshot_day``.",public+instructor
+sales_activities,activity_type,string,"Activity mechanism (e.g. ``call``, ``email``, ``demo``, ``meeting``).",public+instructor
+sales_activities,activity_outcome,string,"Logged outcome (e.g. ``connected``, ``voicemail``, ``no_answer``, ``meeting_set``).",public+instructor
+opportunities,opportunity_id,string,"Opaque opportunity identifier. Primary key.",public+instructor
+opportunities,lead_id,string,"FK to ``leads.lead_id``.",public+instructor
+opportunities,created_at,string,"ISO-8601 timestamp the opportunity was created. Public bundles filter rows to ``<= lead_created_at + snapshot_day``.",public+instructor
+opportunities,stage,string,"Current stage at snapshot time (e.g. ``prospecting``, ``demo``, ``negotiation``).",public+instructor
+opportunities,estimated_acv,int64,"Estimated annual contract value at snapshot time (USD).",public+instructor
+opportunities,close_outcome,string,"Terminal outcome (``closed_won``/``closed_lost``). REDACTED in public bundles.",instructor_only
+opportunities,closed_at,string,"ISO-8601 timestamp of the terminal close event. REDACTED in public bundles.",instructor_only
+customers,customer_id,string,"Opaque customer identifier. Primary key. Entire ``customers`` table OMITTED from public bundles (its existence reconstructs the conversion label).",instructor_only
+customers,opportunity_id,string,"FK to ``opportunities.opportunity_id`` — the deal that converted into this customer record.",instructor_only
+customers,account_id,string,"FK to ``accounts.account_id``.",instructor_only
+customers,customer_start_at,string,"ISO-8601 timestamp the account became a paying customer.",instructor_only
+subscriptions,subscription_id,string,"Opaque subscription identifier. Primary key. Entire ``subscriptions`` table OMITTED from public bundles.",instructor_only
+subscriptions,customer_id,string,"FK to ``customers.customer_id``.",instructor_only
+subscriptions,plan_name,string,"Subscription plan name (e.g. ``starter``, ``team``, ``enterprise``).",instructor_only
+subscriptions,subscription_start_at,string,"ISO-8601 timestamp the subscription started.",instructor_only
+subscriptions,subscription_status,string,"Subscription status (e.g. ``active``, ``churned``).",instructor_only
diff --git a/release/docs/v1_acceptance_gates_bands.yaml b/release/docs/v1_acceptance_gates_bands.yaml
new file mode 100644
index 0000000..f3b5f5e
--- /dev/null
+++ b/release/docs/v1_acceptance_gates_bands.yaml
@@ -0,0 +1,155 @@
+# Acceptance bands for `leadforge-lead-scoring-v1`.
+#
+# Operational knob — bands are tuned between releases without a code
+# change. Loaded by `leadforge.validation.difficulty.load_bands()` and
+# consumed by `scripts/validate_release_candidate.py`.
+#
+# Calibration convention: each band fits the cross-seed median ± 2× the
+# observed max-min spread on the canonical N=5 sweep (seeds 42–46) over
+# `release/{intro,intermediate,advanced}/`. A 2× factor on the
+# max-min spread is conservative: it widens the band beyond the
+# observed range so a future seed at the tail of the distribution still
+# passes, but stays tight enough to flag genuine drift between releases.
+# One-sided bands (`max:` or `min:` only) are used where the
+# gate is intrinsically one-sided (Brier "lower is better"; calibration
+# error has no meaningful lower bound). See
+# `docs/release/v1_acceptance_gates.md` for the narrative gate descriptions
+# and the median values that produced each band.
+#
+# Initial calibration: 2026-05-06 against the regenerated PR 2.2 release
+# bundles (BUNDLE_SCHEMA_VERSION 5; seed 42 timestamp 2026-05-05).
+# Re-tune when:
+# - the recipe / mechanism layer changes (median shifts);
+# - the difficulty profiles change (per-tier band shapes change);
+# - a release candidate fails a band that the actual data still meets
+# (the spread underestimated the tail; widen the offending bound).
+
+per_tier:
+ intro:
+ # G7.1.1 — conversion rate. Median 0.4267, spread 0.0920;
+ # band = [0.4267 - 2×0.0920, 0.4267 + 2×0.0920] ≈ [0.24, 0.61].
+ conversion_rate_test: {min: 0.24, max: 0.61}
+ # G7.1.2 — LR AUC. Median 0.8788, spread 0.0272.
+ lr_auc: {min: 0.82, max: 0.94}
+ # G7.1.3 — GBM AUC. Median 0.8729, spread 0.0232.
+ gbm_auc: {min: 0.82, max: 0.92}
+ # G7.1.4 — GBM-vs-LR delta. Median -0.0045, spread 0.0225. v1's
+ # snapshot is dominated by linear features (engagement aggregates +
+ # firmographics), so HistGBM does not consistently beat LR; the
+ # band fits the data and the cross-tier-ordering gate (G7.4.4) is
+ # documented as a known-finding-for-v2 in v1_acceptance_gates.md.
+ gbm_minus_lr_auc: {min: -0.05, max: 0.05}
+ # G7.1.5 — LR Average Precision. Median 0.7608, spread 0.0670.
+ lr_average_precision: {min: 0.62, max: 0.90}
+ # G7.1.6 — P@100. Median 0.80; observed range [0.75, 0.82]. Band
+ # widened to [0.65, 0.95] to absorb tail-seed swings on the
+ # cross-seed sweep.
+ precision_at_100: {min: 0.65, max: 0.95}
+ # G7.1.7 — Brier (lower is better). Median 0.1301, spread 0.0184.
+ brier_score: {max: 0.17}
+ # G7.1.8 — calibration max-bin error. Median 0.2497, spread 0.1960.
+ # Calibration spreads are huge because empty bins make the metric
+ # noisy at small per-bin n; the band reflects that and only flags
+ # outright miscalibration (every bin off).
+ calibration_max_bin_error: {max: 0.65}
+ intermediate:
+ # G7.2.1 — conversion rate. Median 0.2160, spread 0.0467.
+ conversion_rate_test: {min: 0.12, max: 0.31}
+ # G7.2.2 — LR AUC. Median 0.8859, spread 0.0230.
+ lr_auc: {min: 0.84, max: 0.93}
+ # G7.2.3 — GBM AUC. Median 0.8755, spread 0.0270.
+ gbm_auc: {min: 0.82, max: 0.93}
+ # G7.2.4 — GBM-vs-LR delta. Median -0.0072, spread 0.0152.
+ gbm_minus_lr_auc: {min: -0.04, max: 0.03}
+ # G7.2.5 — LR AP. Median 0.5752, spread 0.0863.
+ lr_average_precision: {min: 0.40, max: 0.75}
+ # G7.2.6 — P@100. Median 0.59; observed range [0.54, 0.63].
+ precision_at_100: {min: 0.45, max: 0.75}
+ # G7.2.7 — Brier. Median 0.1096, spread 0.0161.
+ brier_score: {max: 0.14}
+ # G7.2.8 — calibration max-bin error. Median 0.2490, spread 0.3215.
+ calibration_max_bin_error: {max: 0.90}
+ advanced:
+ # G7.3.1 — conversion rate. Median 0.0840, spread 0.0200.
+ conversion_rate_test: {min: 0.04, max: 0.12}
+ # G7.3.2 — LR AUC. Median 0.8861, spread 0.0401.
+ lr_auc: {min: 0.81, max: 0.97}
+ # G7.3.3 — GBM AUC. Median 0.8726, spread 0.0171.
+ gbm_auc: {min: 0.84, max: 0.91}
+ # G7.3.4 — GBM-vs-LR delta. Median -0.0133, spread 0.0251.
+ gbm_minus_lr_auc: {min: -0.06, max: 0.04}
+ # G7.3.5 — LR AP. Median 0.3514, spread 0.0814.
+ lr_average_precision: {min: 0.19, max: 0.52}
+ # G7.3.6 — P@100. Median 0.34; observed range [0.30, 0.40].
+ precision_at_100: {min: 0.20, max: 0.55}
+ # G7.3.7 — Brier. Median 0.0611, spread 0.0152.
+ brier_score: {max: 0.09}
+ # G7.3.8 — calibration max-bin error. Median 0.5234, spread 0.4828.
+ # Class imbalance inflates per-bin variance — the metric is noisy
+ # at this tier; band loose enough to admit observed range without
+ # green-lighting total miscalibration.
+ calibration_max_bin_error: {max: 1.0}
+
+# G8.1 — cross-seed stability tolerance. Spread = max - min of the
+# headline metric across the N=5 seeds. Bands are uniform across tiers
+# (PR 3.3 reports per-tier spread but applies one tolerance to all).
+# Each bound is at least the largest observed per-tier spread × 1.5.
+cross_seed_spread:
+ lr_auc: {max: 0.06}
+ gbm_auc: {max: 0.05}
+ gbm_minus_lr_auc: {max: 0.05}
+ lr_average_precision: {max: 0.13}
+ brier_score: {max: 0.04}
+ conversion_rate_test: {max: 0.15}
+
+# G6.4 — cohort-shift AUC degradation. v1's bundles are roughly
+# IID-balanced over the 90-day horizon (no time-of-year drift baked in),
+# so the cohort-split AUC stays close to the random-split AUC; observed
+# degradation across tiers is roughly [-0.02, 0.02]. The gate's *intent*
+# is an ε-positive lower bound ("cohort harder than random"), but v1
+# doesn't yet meet it, so the lower bound is negative and loose enough
+# to fit observed data. v2 should explicitly inject seasonality
+# / quarterly close cycles to make this gate bite.
+cohort_shift:
+ auc_degradation: {min: -0.05, max: 0.10}
+
+# Tiers required to be present for the cross-tier ordering gates
+# (G7.4.*) to be evaluated as failures rather than skipped. PR 3.3's
+# release run has all three; partial development runs (e.g. one-tier
+# `--no-rebuild` against a stale workdir) will skip with a warning.
+cross_tier_required: [intro, intermediate, advanced]
+
+# Leakage-probe thresholds fed to `leakage_probes.run_split_probes` per
+# tier. Global rather than per-tier because the contract ("IDs carry no
+# signal", "post-snapshot aggregates can't ace the task on their own")
+# is the same for all difficulty tiers. Suspect-stage columns are
+# typically absent on student_public bundles — the probe skips
+# gracefully when the columns aren't there, so a single declaration
+# covers every tier without per-tier overrides.
+leakage_probes:
+ # G5.3 — ID-only baseline AUC ceiling. Observed median per tier
+ # ~0.49–0.51 with max 0.56; band 0.60 admits stratified-CV variance
+ # without green-lighting genuine ID-encoded leakage.
+ id_only_max_auc: 0.60
+ # Split-label-drift max delta. Not numbered as a distinct gate in
+ # v1_acceptance_gates.md (G6.1/.2/.3/.4 cover ID overlap / near-dups /
+ # cohort-time-shift); split-label-drift findings surface under the
+ # generic ``leakage:split_label_drift`` channel id rather than a G6.x.
+ # IID train/test splits should rarely drift more than a couple of
+ # percentage points; 10% allows for the small `valid` split (15% of
+ # leads) without flagging routine sampling variance.
+ label_drift_max: 0.10
+ # G5.1 — post-snapshot aggregates as a feature subset. Just
+ # `total_touches_all` for v1 (the deliberate pedagogical trap).
+ # Observed max AUC 0.62; band 0.95 because the trap is *meant* to be
+ # predictive — we only flag the case where it solo-dominates the
+ # task.
+ feature_subsets:
+ post_snapshot_aggregates:
+ max_auc: 0.95
+ columns: [total_touches_all]
+ # G5.2 — suspect-stage columns; redacted on student_public so the
+ # probe skips, but declared here so the contract is visible.
+ suspect_stage:
+ max_auc: 0.95
+ columns: [current_stage, is_sql]
diff --git a/release/docs/v2_decision_log.md b/release/docs/v2_decision_log.md
new file mode 100644
index 0000000..41e5df1
--- /dev/null
+++ b/release/docs/v2_decision_log.md
@@ -0,0 +1,48 @@
+# v2 Decision Log — `leadforge-lead-scoring-v2`
+
+This log tracks every external finding against
+`leadforge-lead-scoring-v1` and the disposition the maintainer
+took on each one. It exists so a contributor in 2027 can see
+*why* a v2 design call was made (or why a v1 quirk was kept).
+
+The log started empty; entries are appended to the "Log" section
+at the bottom as findings land, and each one fills in the schema
+below.
+
+## Schema
+
+Each row is one disposition. Add new rows at the bottom; never
+edit historical entries.
+
+| Field | Required | Format | Notes |
+|---|---|---|---|
+| `received_at` | yes | `YYYY-MM-DD` | Date the finding was received (issue opened / reviewer comment / direct message). Use the wall-clock date in the maintainer's timezone. |
+| `source` | yes | one of `issue:#NNN`, `pr:#NNN`, `email`, `direct` | Where the finding came in. `issue` and `pr` link via the GitHub number. |
+| `topic` | yes | one short phrase | What the finding is about — e.g. "expected_acv realism", "industry conversion rates", "cohort-by-segment drift". |
+| `severity` | yes | `low` / `medium` / `high` | Reporter's claim, sanity-checked by the maintainer. `high` is the equivalent of the breakage-report `high` severity tier. |
+| `verdict` | yes | one of `accepted-for-v2`, `deferred`, `wont-fix`, `needs-investigation` | See vocabulary below. |
+| `next_step` | yes | one sentence | What concretely happens next (or has happened). Free-form but specific — "tracked in v2 milestone as #NNN", "documented as v1 simplification in dataset card", etc. |
+| `link` | optional | URL or path | Pointer to the resulting commit, doc change, or v2 work item. May be empty for `wont-fix` and `needs-investigation`. |
+
+### Verdict vocabulary
+
+| Verdict | When |
+|---|---|
+| `accepted-for-v2` | The finding is real and the fix lands in v2. There should be a linked v2 milestone work item. |
+| `deferred` | The finding is real but the fix is post-v2 (or unsized). Counts as a backlog entry, not a v2 commitment. |
+| `wont-fix` | The finding is correct but the design call is intentional. The dataset card or roadmap should already document it; if not, the entry should result in a doc update. |
+| `needs-investigation` | The finding is plausible but not yet reproduced or scoped. Stays in this state for at most one cycle; the maintainer must promote it to one of the other three verdicts before declaring v2 ready. |
+
+## Log
+
+| received_at | source | topic | severity | verdict | next_step | link |
+|---|---|---|---|---|---|---|
+| 2026-05-08 | pr:#76 | F002 — Gaussian noise on float features produces non-physical values (negative ACV, negative day-deltas, day-deltas > snapshot_day=30) without disclosure in `dataset_card.md` Caveats | medium | accepted-for-v2 | Add a "Noise artefacts" bullet to the per-tier `dataset_card.md` Caveats section in v2. Requires touching `leadforge/narrative/dataset_card.py` (auto-rendered file), so out of scope for PR 7.1's no-bundle-regen rule | release/validation/llm_critique_raw_20260508T204359.124834Z.json#F002 |
+| 2026-05-08 | pr:#76 | F003 — `release/README.md` `](../foo)` relative links would 404 on Kaggle / Hugging Face if shipped as-is | medium | wont-fix | Already treated by `scripts/_release_common.py::rewrite_release_links()` — both platform packagers (PR 5.1, 5.2) rewrite `](../foo)` → GitHub blob URL at packaging time before the README is inlined onto Kaggle / HF; the as-committed `release/README.md` keeps the relative paths so it renders correctly on github.com. The LLM critique didn't have visibility into the platform packagers (intentional — they're not in the input bundle) and made a wrong inference | scripts/_release_common.py |
+| 2026-05-08 | pr:#76 | F005 — `calibration_max_bin_error = 0.5234` on advanced tier is driven by an n=2 high-probability bin; `validation_report.md` headline table reports the value with no minimum-bin-count footnote | medium | accepted-for-v2 | Either compute `calibration_max_bin_error` only over bins with `n >= 20`, OR expose both raw and n-weighted variants and add a footnote. Not a 1-line change — touches `leadforge/validation/release_quality.py`'s metric definition and would require regenerating `validation_report.{json,md}`, which PR 7.1's brief explicitly forbids ("`validation_report.{json,md}` should not need regeneration for this PR") | release/validation/llm_critique_raw_20260508T204359.124834Z.json#F005 |
+| 2026-05-08 | pr:#76 | Missing — Datasheets §Biases enumeration in `release/README.md` (industry/region/persona uniformity, channel-conditional independence) | medium | accepted-for-v2 | The README's "Known limitations" lists individual symptoms (weak channel signal, flat AUC across tiers); a dedicated §Biases section listing the *generative* bias axes is a v2 polish item | release/validation/llm_critique_raw_20260508T204359.124834Z.json#missing-biases |
+| 2026-05-08 | pr:#76 | Missing — Datasheets §Privacy in `release/README.md` (no real CRM seed, no PII-shaped strings, public-artefacts-only reproducibility) | medium | accepted-for-v2 | The README treats "fictional" as sufficient privacy disclosure; an explicit Privacy section will land in v2 alongside §Biases | release/validation/llm_critique_raw_20260508T204359.124834Z.json#missing-privacy |
+| 2026-05-08 | pr:#76 | Missing — per-bundle `dataset_card.md` Group-split warning section disclosing `account_id` / `contact_id` overlap | high | accepted-for-v2 | The README-side warning is added in PR 7.1 (resolves F001's load-bearing path); replicating it into the auto-rendered per-tier `dataset_card.md` requires the same `leadforge/narrative/dataset_card.py` change as F002 and lands in v2 | release/README.md ("Group-leakage warning"), release/validation/llm_critique_raw_20260508T204359.124834Z.json#missing-group-split |
+| 2026-05-08 | pr:#76 | Q1 — does the simulator window event tables before or after Gaussian-noise injection on float features (the 43.46-day `days_since_first_touch` finding) | low | wont-fix | Intended noise artefact, not a windowing bug. Float features pass through `_apply_difficulty_distortions()` *after* snapshot-window aggregation, so additive Gaussian noise on `days_since_first_touch` can push the value past the 30-day snapshot. F002 captures the disclosure side; the mechanism itself is correct | leadforge/mechanisms/measurement.py |
+| 2026-05-08 | pr:#76 | Q2 — `top_decile_rate` naming clarity (precision-at-top-10 vs recall-at-top-10) | low | accepted-for-v2 | Rename to `top_decile_precision` (current implementation is precision at top 10 %) in v2 alongside any other release-quality field renames; touches `leadforge/validation/release_quality.py` public API | release/validation/llm_critique_raw_20260508T204359.124834Z.json#Q2 |
+| 2026-05-08 | pr:#76 | Q3 — does Kaggle / Hugging Face upload include `docs/release/` and `docs/external_review/` subtrees | low | wont-fix | No — only `release/` ships per the platform packagers (`scripts/package_kaggle_release.py`, `scripts/package_hf_release.py`). Cross-tree links are rewritten to GitHub blob URLs by `_release_common.py::rewrite_release_links()`. F003's verdict above carries the answer | scripts/_release_common.py |
diff --git a/release/huggingface-instructor/README.md b/release/huggingface-instructor/README.md
index 6725379..61ac5ad 100644
--- a/release/huggingface-instructor/README.md
+++ b/release/huggingface-instructor/README.md
@@ -55,6 +55,8 @@ on the public bundle.
│ ├── tables/*.parquet # full-horizon tables (incl. customers, subscriptions)
│ ├── tasks/converted_within_90_days/{train,valid,test}.parquet
│ └── metadata/ # world_spec, graph.{graphml,json}, latent_registry, etc.
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
├── README.md # this file (HF dataset card)
├── dataset-cover-image.png # dataset thumbnail
└── LICENSE
@@ -143,6 +145,25 @@ customers = pd.read_parquet(
every parquet file.
- **Bundle schema version.** 5 (matches the public dataset).
+## Agent-reviewable artifacts
+
+The companion ships the same self-contained review surface as the public
+bundle so an AI reviewer (or a researcher without GitHub access) can
+verify claims locally:
+
+- ``docs/`` — vendored copies of the generation method, leakage probes
+ contract, acceptance bands, break-me guide, v2 decision log, and the
+ per-relational-table column descriptions (`relational_table_schemas.csv`).
+- ``claims_register.{md,json}`` — every numerical / structural claim
+ in this card paired with the artifact and path that backs it.
+- ``intermediate/manifest.json`` and ``intermediate/feature_dictionary.csv``
+ — SHA-256-hashed provenance and the authoritative column spec.
+
+The instructor companion intentionally omits the top-level
+``metrics.json`` (cross-tier medians would be misleading for a single
+tier). Use the public dataset's ``metrics.json`` when comparing tier
+behaviour.
+
## Maintenance, license
We *want* the dataset to be broken. See the
diff --git a/release/huggingface/README.md b/release/huggingface/README.md
index b78b512..e8fe2bc 100644
--- a/release/huggingface/README.md
+++ b/release/huggingface/README.md
@@ -74,11 +74,15 @@ rose materially in 2024).
.
├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier
│ ├── manifest.json # provenance + file hashes
+│ ├── metrics.json # per-tier headline metrics (medians + spreads)
│ ├── dataset_card.md # auto-rendered per-bundle card
│ ├── feature_dictionary.csv # authoritative column spec
│ ├── lead_scoring.csv # flat convenience CSV (all splits)
│ ├── tables/*.parquet # 7 snapshot-safe relational tables
│ └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
+├── metrics.json # top-level cross-tier metrics summary
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
├── README.md # this file (HF dataset card)
├── dataset-cover-image.png # dataset thumbnail
└── LICENSE
@@ -90,6 +94,35 @@ hidden causal structure (DAG, latent registry, mechanism summary)
under `metadata/`. The full layout is documented in each bundle's
`manifest.json`.
+### Agent-reviewable artifacts
+
+The published bundle is self-contained for AI review and offline
+auditing — every numeric / structural claim on this page can be
+verified without following an external link:
+
+- **`metrics.json` (root) + `<tier>/metrics.json`** — deterministic
+ JSON view of the headline LR AUC / AP / P@100 / Brier / conversion
+ rate / cohort-shift / cross-tier-ordering medians, with JSON-path
+ back-references to `validation/validation_report.json` (the
+ source of truth).
+- **`claims_register.{md,json}`** — every numerical or structural
+ claim on this page paired with the artifact and path that backs it.
+ Rendered from `claims_register_source.yaml` by
+ `scripts/build_claims_register.py`.
+- **`docs/`** — vendored copies of `generation_method.md`,
+ `channel_signal_audit.md`, `break_me_guide.md`,
+ `feature_dictionary.md`, `v1_acceptance_gates_bands.yaml`,
+ `v2_decision_log.md`, plus a hand-authored
+ `relational_table_schemas.csv` documenting every column of every
+ relational table. These match the GitHub-blob links cited below but
+ ship inside the bundle so a reviewer never needs network access.
+- **`<tier>/manifest.json`** — SHA-256 hash for every file plus the
+ full redaction contract (`structural_redactions.columns`,
+ `omitted_tables`, `relational_snapshot_safe`, `snapshot_day`).
+- Kaggle / HuggingFace preview pages additionally inject a
+ `schema.org/Dataset` JSON-LD block in their `<head>` for agent
+ ingestion without HTML parsing.
+
## Quick start
```python
diff --git a/release/kaggle/dataset-metadata.json b/release/kaggle/dataset-metadata.json
index cf44659..133c743 100644
--- a/release/kaggle/dataset-metadata.json
+++ b/release/kaggle/dataset-metadata.json
@@ -1,6 +1,6 @@
{
"collaborators": [],
- "description": "# LeadForge: Synthetic B2B Lead Scoring Dataset (`leadforge-lead-scoring-v1`)\n\nA relational, reproducible, three-tier synthetic CRM dataset family for\nteaching lead scoring at scale. Generated by\n[leadforge](https://github.com/leadforge-dev/leadforge), an\nopen-source Python framework for synthetic CRM/funnel data. The\nframework version is decoupled from the dataset version: the package\nstays at `1.x`; the dataset is published under the explicit `…-v1`\ntag.\n\n## Why lead scoring matters in 2024–2026\n\nMid-market SaaS vendors entered 2024–2026 with growth slowing and\ncustomer-acquisition costs rising[^macro], so predicting *which* leads\nconvert within a fixed window has moved from a marketing nicety to a\nsurvival skill. This dataset teaches that skill on a relational\nsubstrate, with the realistic confusions (snapshot-window discipline,\nleakage traps, channel signal weaker than vendor blogs imply) that\nstudents will hit when they finally get hands on real CRM data.\n\n[^macro]: Macroeconomic framing summarised in\n[`docs/external_review/summaries/gemini_v2_summary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/external_review/summaries/gemini_v2_summary.md)\n(median public-SaaS growth 30%→25% from 2023 to 2025; New CAC Ratio\nrose materially in 2024).\n\n## What's inside\n\n```\n.\n├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier\n│ ├── manifest.json # provenance + file hashes\n│ ├── dataset_card.md # auto-rendered per-bundle card\n│ ├── feature_dictionary.csv # authoritative column spec\n│ ├── lead_scoring.csv # flat convenience CSV (all splits)\n│ ├── tables/*.parquet # 7 snapshot-safe relational tables\n│ └── tasks/converted_within_90_days/{train,valid,test}.parquet\n├── dataset-metadata.json # Kaggle dataset metadata\n├── dataset-cover-image.png # Kaggle cover image\n├── README.md # Kaggle package README\n└── LICENSE\n```\n\n`student_public` bundles ship the snapshot-safe 
relational view;\n`research_instructor` companions ship the full-horizon view plus the\nhidden causal structure (DAG, latent registry, mechanism summary)\nunder `metadata/`. The full layout is documented in each bundle's\n`manifest.json`.\n\n## Quick start\n\n```python\n# Flat CSV\ndf = pd.read_csv(\"intermediate/lead_scoring.csv\")\n\n# Parquet task splits (recommended)\ntrain = pd.read_parquet(\"intermediate/tasks/converted_within_90_days/train.parquet\")\ntest = pd.read_parquet(\"intermediate/tasks/converted_within_90_days/test.parquet\")\n\n# Relational tables (feature engineering — example)\nleads = pd.read_parquet(\"intermediate/tables/leads.parquet\")\ntouches = pd.read_parquet(\"intermediate/tables/touches.parquet\")\nmy_touch_count = (\n touches.groupby(\"lead_id\").size().rename(\"my_touch_count\").reset_index()\n)\nfeatures = leads.merge(my_touch_count, on=\"lead_id\", how=\"left\")\n\n# Reproduce from source\n# pip install leadforge\n# leadforge generate --recipe b2b_saas_procurement_v1 --seed 42 \\\n# --mode student_public --difficulty intermediate --out my_bundle\n```\n\nThe label `converted_within_90_days` resolves over a 90-day window;\nengagement features (`touch_count`, `session_count`, etc.) are\ncomputed strictly over events on days `[0, 30]`. The deliberate\nexception is `total_touches_all`, the leakage trap — flagged\n`leakage_risk=True` in `feature_dictionary.csv`. 
Drop it from your\nfeature set unless you're demonstrating leakage detection.\n\n## Dataset summary\n\n| | Intro | Intermediate | Advanced |\n|---|---|---|---|\n| Leads | 5,000 | 5,000 | 5,000 |\n| Accounts | 1,500 | 1,500 | 1,500 |\n| Contacts | 4,200 | 4,200 | 4,200 |\n| Snapshot columns | 32 / 34* | 32 / 34* | 32 / 34* |\n| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` |\n| Conversion rate (acceptance band, gate G7.\\*) | 24–61% | 12–31% | 4–12% |\n| Conversion rate (observed median, seeds 42–46) | 42.67% | 21.60% | 8.40% |\n| Signal strength | 0.90 | 0.70 | 0.50 |\n| Noise scale | 0.10 | 0.30 | 0.55 |\n| Missing rate | 2% | 8% | 18% |\n\n\\* `student_public` / `research_instructor`. Difficulty is modulated\nby the simulation engine — signal strength on latent-trait weights,\nGaussian noise on float features, MCAR missingness, outlier rate —\nnot post-hoc label flipping. The acceptance band is the recipe\ngate's tolerance window (`v1_acceptance_gates_bands.yaml` G7.\\*),\nnot the achievable range — observed five-seed spreads sit\ncomfortably inside the band.\n\n## The scenario\n\n**Veridian Technologies** is a fictional Series B startup (Austin, US)\nselling **Veridian Procure**, a procurement / AP automation SaaS, to\nmid-market firms (200–2,000 employees) in the US and UK. The funnel\nruns through inbound marketing (45%), SDR outbound (35%), and\npartner referrals (20%); four personas drive deals (VP Finance, AP\nManager, IT Director, Procurement Manager). **Task:** predict whether\na lead converts (`closed_won`) within 90 days. ACV bands are\n$18k–$120k. 
See\n[`docs/release/generation_method.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/generation_method.md)\nfor the full DGP, and the deeper \"what's modelled / approximate / not\nmodelled\" breakdown that this README only summarises.\n\n## Public vs instructor: what's redacted\n\nFiltering happens **during rendering**, not during simulation. The\nredaction contract is single-sourced in\n[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py);\nthe snapshot-safe writer and the validator import the same constants,\nso they cannot drift apart.\n\n| Source-of-truth constant | Public bundle treatment |\n|---|---|\n| `BANNED_LEAD_COLUMNS = (\"converted_within_90_days\", \"conversion_timestamp\")` | Dropped from `tables/leads.parquet` |\n| `BANNED_OPP_COLUMNS = (\"close_outcome\", \"closed_at\")` | Dropped from `tables/opportunities.parquet` |\n| `BANNED_TABLES = (\"customers\", \"subscriptions\")` | Omitted from public bundles |\n| `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` |\n| Snapshot redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` |\n| `total_touches_all` (deliberate trap) | **Retained in both modes**; flagged `leakage_risk=True` |\n\nEach bundle's `manifest.json` records `relational_snapshot_safe`,\n`redacted_columns`, and `snapshot_day`, so the bundle is\nself-describing.\n\n## Calibration\n\nEvery realism / calibration / difficulty claim in this README is\nbacked by\n[`validation/validation_report.md`](https://github.com/leadforge-dev/leadforge/blob/main/release/validation/validation_report.md),\nregenerated by\n[`scripts/validate_release_candidate.py`](https://github.com/leadforge-dev/leadforge/blob/main/scripts/validate_release_candidate.py)\nwith bands declared 
in\n[`docs/release/v1_acceptance_gates_bands.yaml`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v1_acceptance_gates_bands.yaml).\nHeadline cross-seed medians (seeds 42–46):\n\n| Tier | LR AUC | AP | P@100 | Brier |\n|---|---|---|---|---|\n| intro | 0.879 | 0.761 | 0.80 | 0.130 |\n| intermediate | 0.886 | 0.575 | 0.59 | 0.110 |\n| advanced | 0.886 | 0.351 | 0.34 | 0.061 |\n\nAP, P@100, conversion-rate, and lift orderings hold across the\nintended difficulty axis (intro > intermediate > advanced).\n\n## Intended uses\n\n- Teaching baseline lead-scoring on a flat snapshot.\n- Teaching relational feature engineering against snapshot-safe tables.\n- Teaching leakage detection (the `total_touches_all` trap is\n designed to be discoverable).\n- Teaching calibration, lift, P@K, value-aware ranking\n (`expected_acv × P(convert)`), and cohort-shift evaluation.\n- Comparing model families under a controlled DGP.\n\n## Out-of-scope uses\n\n- **Production lead scoring.** The company, product, and customers are\n fictional.\n- **Vendor benchmarking / paper baselines.** Difficulty tiers are\n calibrated for pedagogy, not cross-paper comparability.\n- **Causal-inference research that requires recovery of the true DGP.**\n The instructor companion exposes the hidden graph for teaching, not\n designed counterfactuals.\n- **Demographic / fairness research.** v1 does not model protected\n attributes.\n\n## Known limitations\n\n- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across\n every tier. Difficulty is visible in AP, P@K, Brier, and value\n capture. Treat AUC as a sanity check, not a difficulty signal.\n- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta\n is slightly negative in every tier (intro −0.0045, intermediate\n −0.0072, advanced −0.0133); v1's snapshot is dominated by linear\n features. 
v2 will inject non-linear interactions in the simulator.\n- **Channel signal is weak.** Per\n [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md),\n out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across\n all tiers and the per-channel rate spread is ≤0.05. The simulator\n does not encode channel-conditional probabilities; channel-conditional\n encoding is post-v1 work.\n- **Cohort-shift degradation is small.** v1 has no time-of-year drift\n baked in; the cohort-shift gate (G6.4) is informational and will\n bite in v2.\n\n## Composition\n\n- **Entities.** Accounts, contacts, leads, touches, sessions,\n sales_activities, opportunities (public); plus customers and\n subscriptions (instructor only). Per-row counts per bundle live in\n `manifest.json`.\n- **Features.** 32 public columns grouped by analytical role in\n [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md);\n the per-bundle `feature_dictionary.csv` is the authoritative\n machine-readable spec.\n- **Label.** `converted_within_90_days` (boolean), event-derived from\n the simulator. Never sampled directly.\n- **Splits.** 70/15/15 train/valid/test, deterministic given seed;\n recorded in `tasks/converted_within_90_days/task_manifest.json`.\n **Group-leakage warning:** the splitter is keyed on `lead_id` only,\n not on `account_id` or `contact_id`. On the as-shipped intermediate\n bundle, **518 of 557 test accounts (≈93 %) also appear in train**;\n the contact-level overlap is similar in magnitude. A flat baseline\n trained on the random split rides account-level signal across the\n split boundary. 
For a generalisation-faithful number, retrain with\n `GroupKFold(account_id)` (or `contact_id`) and report both — see\n [`break_me_guide.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/break_me_guide.md) §5 for the\n detection recipe.\n- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package\n version stamped in `manifest.json`.\n\n## Maintenance, adversarial framing, license\n\nWe *want* the dataset to be broken. The\n[break-me guide](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/break_me_guide.md) catalogues\nnine adversarial patterns to look for (leakage, split\ncontamination, ranking inversions, calibration drift) with\nworked-example pointers back into the notebooks. Issue\ntemplates ship under `.github/ISSUE_TEMPLATE/`: a\n[breakage report](https://github.com/leadforge-dev/leadforge/blob/main/.github/ISSUE_TEMPLATE/dataset_breakage_report.yml)\nform for findings on the bundle itself, and a\n[realism feedback](https://github.com/leadforge-dev/leadforge/blob/main/.github/ISSUE_TEMPLATE/realism_feedback.yml)\nform for distributional critiques. Accepted findings are\nlogged in\n[`docs/release/v2_decision_log.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v2_decision_log.md).\nFile issues at\n[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge);\nPRs welcome.\n\n| Field | Value |\n|---|---|\n| Generator | leadforge `1.0.0+` |\n| Recipe | `b2b_saas_procurement_v1` |\n| Canonical seed | 42 (cross-seed sweep: 42–46) |\n| Bundle schema version | 5 |\n| Format | Parquet (canonical) + CSV (convenience) |\n| License | MIT — see [LICENSE](LICENSE) |\n\nVerify integrity with `leadforge validate `; every file\nis hashed in `manifest.json`.\n",
+ "description": "# LeadForge: Synthetic B2B Lead Scoring Dataset (`leadforge-lead-scoring-v1`)\n\nA relational, reproducible, three-tier synthetic CRM dataset family for\nteaching lead scoring at scale. Generated by\n[leadforge](https://github.com/leadforge-dev/leadforge), an\nopen-source Python framework for synthetic CRM/funnel data. The\nframework version is decoupled from the dataset version: the package\nstays at `1.x`; the dataset is published under the explicit `…-v1`\ntag.\n\n## Why lead scoring matters in 2024–2026\n\nMid-market SaaS vendors entered 2024–2026 with growth slowing and\ncustomer-acquisition costs rising[^macro], so predicting *which* leads\nconvert within a fixed window has moved from a marketing nicety to a\nsurvival skill. This dataset teaches that skill on a relational\nsubstrate, with the realistic confusions (snapshot-window discipline,\nleakage traps, channel signal weaker than vendor blogs imply) that\nstudents will hit when they finally get hands on real CRM data.\n\n[^macro]: Macroeconomic framing summarised in\n[`docs/external_review/summaries/gemini_v2_summary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/external_review/summaries/gemini_v2_summary.md)\n(median public-SaaS growth 30%→25% from 2023 to 2025; New CAC Ratio\nrose materially in 2024).\n\n## What's inside\n\n```\n.\n├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier\n│ ├── manifest.json # provenance + file hashes\n│ ├── metrics.json # per-tier headline metrics (medians + spreads)\n│ ├── dataset_card.md # auto-rendered per-bundle card\n│ ├── feature_dictionary.csv # authoritative column spec\n│ ├── lead_scoring.csv # flat convenience CSV (all splits)\n│ ├── tables/*.parquet # 7 snapshot-safe relational tables\n│ └── tasks/converted_within_90_days/{train,valid,test}.parquet\n├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)\n├── metrics.json # top-level cross-tier metrics summary\n├── 
claims_register.{md,json} # claims → backing-artifact map (agent-readable)\n├── dataset-metadata.json # Kaggle dataset metadata\n├── dataset-cover-image.png # Kaggle cover image\n├── README.md # Kaggle package README\n└── LICENSE\n```\n\n`student_public` bundles ship the snapshot-safe relational view;\n`research_instructor` companions ship the full-horizon view plus the\nhidden causal structure (DAG, latent registry, mechanism summary)\nunder `metadata/`. The full layout is documented in each bundle's\n`manifest.json`.\n\n### Agent-reviewable artifacts\n\nThe published bundle is self-contained for AI review and offline\nauditing — every numeric / structural claim on this page can be\nverified without following an external link:\n\n- **`metrics.json` (root) + `/metrics.json`** — deterministic\n JSON view of the headline LR AUC / AP / P@100 / Brier / conversion\n rate / cohort-shift / cross-tier-ordering medians, with JSON-path\n back-references to `validation/validation_report.json` (the\n source of truth).\n- **`claims_register.{md,json}`** — every numerical or structural\n claim on this page paired with the artifact and path that backs it.\n Rendered from `claims_register_source.yaml` by\n `scripts/build_claims_register.py`.\n- **`docs/`** — vendored copies of `generation_method.md`,\n `channel_signal_audit.md`, `break_me_guide.md`,\n `feature_dictionary.md`, `v1_acceptance_gates_bands.yaml`,\n `v2_decision_log.md`, plus a hand-authored\n `relational_table_schemas.csv` documenting every column of every\n relational table. 
These match the GitHub-blob links cited below but\n ship inside the bundle so a reviewer never needs network access.\n- **`/manifest.json`** — SHA-256 hash for every file plus the\n full redaction contract (`structural_redactions.columns`,\n `omitted_tables`, `relational_snapshot_safe`, `snapshot_day`).\n- Kaggle / HuggingFace preview pages additionally inject a\n `schema.org/Dataset` JSON-LD block in their `` for agent\n ingestion without HTML parsing.\n\n## Quick start\n\n```python\n# Flat CSV\ndf = pd.read_csv(\"intermediate/lead_scoring.csv\")\n\n# Parquet task splits (recommended)\ntrain = pd.read_parquet(\"intermediate/tasks/converted_within_90_days/train.parquet\")\ntest = pd.read_parquet(\"intermediate/tasks/converted_within_90_days/test.parquet\")\n\n# Relational tables (feature engineering — example)\nleads = pd.read_parquet(\"intermediate/tables/leads.parquet\")\ntouches = pd.read_parquet(\"intermediate/tables/touches.parquet\")\nmy_touch_count = (\n touches.groupby(\"lead_id\").size().rename(\"my_touch_count\").reset_index()\n)\nfeatures = leads.merge(my_touch_count, on=\"lead_id\", how=\"left\")\n\n# Reproduce from source\n# pip install leadforge\n# leadforge generate --recipe b2b_saas_procurement_v1 --seed 42 \\\n# --mode student_public --difficulty intermediate --out my_bundle\n```\n\nThe label `converted_within_90_days` resolves over a 90-day window;\nengagement features (`touch_count`, `session_count`, etc.) are\ncomputed strictly over events on days `[0, 30]`. The deliberate\nexception is `total_touches_all`, the leakage trap — flagged\n`leakage_risk=True` in `feature_dictionary.csv`. 
Drop it from your\nfeature set unless you're demonstrating leakage detection.\n\n## Dataset summary\n\n| | Intro | Intermediate | Advanced |\n|---|---|---|---|\n| Leads | 5,000 | 5,000 | 5,000 |\n| Accounts | 1,500 | 1,500 | 1,500 |\n| Contacts | 4,200 | 4,200 | 4,200 |\n| Snapshot columns | 32 / 34* | 32 / 34* | 32 / 34* |\n| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` |\n| Conversion rate (acceptance band, gate G7.\\*) | 24–61% | 12–31% | 4–12% |\n| Conversion rate (observed median, seeds 42–46) | 42.67% | 21.60% | 8.40% |\n| Signal strength | 0.90 | 0.70 | 0.50 |\n| Noise scale | 0.10 | 0.30 | 0.55 |\n| Missing rate | 2% | 8% | 18% |\n\n\\* `student_public` / `research_instructor`. Difficulty is modulated\nby the simulation engine — signal strength on latent-trait weights,\nGaussian noise on float features, MCAR missingness, outlier rate —\nnot post-hoc label flipping. The acceptance band is the recipe\ngate's tolerance window (`v1_acceptance_gates_bands.yaml` G7.\\*),\nnot the achievable range — observed five-seed spreads sit\ncomfortably inside the band.\n\n## The scenario\n\n**Veridian Technologies** is a fictional Series B startup (Austin, US)\nselling **Veridian Procure**, a procurement / AP automation SaaS, to\nmid-market firms (200–2,000 employees) in the US and UK. The funnel\nruns through inbound marketing (45%), SDR outbound (35%), and\npartner referrals (20%); four personas drive deals (VP Finance, AP\nManager, IT Director, Procurement Manager). **Task:** predict whether\na lead converts (`closed_won`) within 90 days. ACV bands are\n$18k–$120k. 
See\n[`docs/release/generation_method.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/generation_method.md)\nfor the full DGP, and the deeper \"what's modelled / approximate / not\nmodelled\" breakdown that this README only summarises.\n\n## Public vs instructor: what's redacted\n\nFiltering happens **during rendering**, not during simulation. The\nredaction contract is single-sourced in\n[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py);\nthe snapshot-safe writer and the validator import the same constants,\nso they cannot drift apart.\n\n| Source-of-truth constant | Public bundle treatment |\n|---|---|\n| `BANNED_LEAD_COLUMNS = (\"converted_within_90_days\", \"conversion_timestamp\")` | Dropped from `tables/leads.parquet` |\n| `BANNED_OPP_COLUMNS = (\"close_outcome\", \"closed_at\")` | Dropped from `tables/opportunities.parquet` |\n| `BANNED_TABLES = (\"customers\", \"subscriptions\")` | Omitted from public bundles |\n| `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` |\n| Snapshot redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` |\n| `total_touches_all` (deliberate trap) | **Retained in both modes**; flagged `leakage_risk=True` |\n\nEach bundle's `manifest.json` records `relational_snapshot_safe`,\n`redacted_columns`, and `snapshot_day`, so the bundle is\nself-describing.\n\n## Calibration\n\nEvery realism / calibration / difficulty claim in this README is\nbacked by\n[`validation/validation_report.md`](https://github.com/leadforge-dev/leadforge/blob/main/release/validation/validation_report.md),\nregenerated by\n[`scripts/validate_release_candidate.py`](https://github.com/leadforge-dev/leadforge/blob/main/scripts/validate_release_candidate.py)\nwith bands declared 
in\n[`docs/release/v1_acceptance_gates_bands.yaml`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v1_acceptance_gates_bands.yaml).\nHeadline cross-seed medians (seeds 42–46):\n\n| Tier | LR AUC | AP | P@100 | Brier |\n|---|---|---|---|---|\n| intro | 0.879 | 0.761 | 0.80 | 0.130 |\n| intermediate | 0.886 | 0.575 | 0.59 | 0.110 |\n| advanced | 0.886 | 0.351 | 0.34 | 0.061 |\n\nAP, P@100, conversion-rate, and lift orderings hold across the\nintended difficulty axis (intro > intermediate > advanced).\n\n## Intended uses\n\n- Teaching baseline lead-scoring on a flat snapshot.\n- Teaching relational feature engineering against snapshot-safe tables.\n- Teaching leakage detection (the `total_touches_all` trap is\n designed to be discoverable).\n- Teaching calibration, lift, P@K, value-aware ranking\n (`expected_acv × P(convert)`), and cohort-shift evaluation.\n- Comparing model families under a controlled DGP.\n\n## Out-of-scope uses\n\n- **Production lead scoring.** The company, product, and customers are\n fictional.\n- **Vendor benchmarking / paper baselines.** Difficulty tiers are\n calibrated for pedagogy, not cross-paper comparability.\n- **Causal-inference research that requires recovery of the true DGP.**\n The instructor companion exposes the hidden graph for teaching, not\n designed counterfactuals.\n- **Demographic / fairness research.** v1 does not model protected\n attributes.\n\n## Known limitations\n\n- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across\n every tier. Difficulty is visible in AP, P@K, Brier, and value\n capture. Treat AUC as a sanity check, not a difficulty signal.\n- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta\n is slightly negative in every tier (intro −0.0045, intermediate\n −0.0072, advanced −0.0133); v1's snapshot is dominated by linear\n features. 
v2 will inject non-linear interactions in the simulator.\n- **Channel signal is weak.** Per\n [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md),\n out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across\n all tiers and the per-channel rate spread is ≤0.05. The simulator\n does not encode channel-conditional probabilities; channel-conditional\n encoding is post-v1 work.\n- **Cohort-shift degradation is small.** v1 has no time-of-year drift\n baked in; the cohort-shift gate (G6.4) is informational and will\n bite in v2.\n\n## Composition\n\n- **Entities.** Accounts, contacts, leads, touches, sessions,\n sales_activities, opportunities (public); plus customers and\n subscriptions (instructor only). Per-row counts per bundle live in\n `manifest.json`.\n- **Features.** 32 public columns grouped by analytical role in\n [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md);\n the per-bundle `feature_dictionary.csv` is the authoritative\n machine-readable spec.\n- **Label.** `converted_within_90_days` (boolean), event-derived from\n the simulator. Never sampled directly.\n- **Splits.** 70/15/15 train/valid/test, deterministic given seed;\n recorded in `tasks/converted_within_90_days/task_manifest.json`.\n **Group-leakage warning:** the splitter is keyed on `lead_id` only,\n not on `account_id` or `contact_id`. On the as-shipped intermediate\n bundle, **518 of 557 test accounts (≈93 %) also appear in train**;\n the contact-level overlap is similar in magnitude. A flat baseline\n trained on the random split rides account-level signal across the\n split boundary. 
For a generalisation-faithful number, retrain with\n `GroupKFold(account_id)` (or `contact_id`) and report both — see\n [`break_me_guide.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/break_me_guide.md) §5 for the\n detection recipe.\n- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package\n version stamped in `manifest.json`.\n\n## Maintenance, adversarial framing, license\n\nWe *want* the dataset to be broken. The\n[break-me guide](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/break_me_guide.md) catalogues\nnine adversarial patterns to look for (leakage, split\ncontamination, ranking inversions, calibration drift) with\nworked-example pointers back into the notebooks. Issue\ntemplates ship under `.github/ISSUE_TEMPLATE/`: a\n[breakage report](https://github.com/leadforge-dev/leadforge/blob/main/.github/ISSUE_TEMPLATE/dataset_breakage_report.yml)\nform for findings on the bundle itself, and a\n[realism feedback](https://github.com/leadforge-dev/leadforge/blob/main/.github/ISSUE_TEMPLATE/realism_feedback.yml)\nform for distributional critiques. Accepted findings are\nlogged in\n[`docs/release/v2_decision_log.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v2_decision_log.md).\nFile issues at\n[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge);\nPRs welcome.\n\n| Field | Value |\n|---|---|\n| Generator | leadforge `1.0.0+` |\n| Recipe | `b2b_saas_procurement_v1` |\n| Canonical seed | 42 (cross-seed sweep: 42–46) |\n| Bundle schema version | 5 |\n| Format | Parquet (canonical) + CSV (convenience) |\n| License | MIT — see [LICENSE](LICENSE) |\n\nVerify integrity with `leadforge validate `; every file\nis hashed in `manifest.json`.\n",
"expectedUpdateFrequency": "never",
"id": "leadforge/leadforge-lead-scoring-v1",
"image": "dataset-cover-image.png",
@@ -612,34 +612,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque account identifier (e.g. ``acct_000001``). Primary key.",
"name": "account_id",
"type": "string"
},
{
+ "description": "Synthetic display name for the account (fictional). Not a feature in the snapshot.",
"name": "company_name",
"type": "string"
},
{
+ "description": "Industry vertical of the buying organisation; one of the recipe's industry vocabulary.",
"name": "industry",
"type": "string"
},
{
+ "description": "Geographic region of the account's headquarters (e.g. ``US``, ``UK``).",
"name": "region",
"type": "string"
},
{
+ "description": "Banded employee headcount of the account (e.g. ``200-500``, ``500-1000``, ``1000-2000``).",
"name": "employee_band",
"type": "string"
},
{
+ "description": "Banded estimated annual revenue of the account.",
"name": "estimated_revenue_band",
"type": "string"
},
{
+ "description": "Banded internal process-maturity score of the account (drives ICP fit).",
"name": "process_maturity_band",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp when the account was first observed (synthetic creation time).",
"name": "created_at",
"type": "string"
}
@@ -652,34 +660,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque contact identifier (e.g. ``cont_000001``). Primary key.",
"name": "contact_id",
"type": "string"
},
{
+ "description": "FK to ``accounts.account_id`` — the buying organisation this contact belongs to.",
"name": "account_id",
"type": "string"
},
{
+ "description": "Free-text job title (fictional). Used only for narrative colour; not a feature.",
"name": "job_title",
"type": "string"
},
{
+ "description": "Functional area of the contact (e.g. ``finance``, ``ops``, ``it``, ``procurement``).",
"name": "role_function",
"type": "string"
},
{
+ "description": "Seniority band of the contact (e.g. ``c_level``, ``vp``, ``director``, ``manager``).",
"name": "seniority",
"type": "string"
},
{
+ "description": "Buyer-role classification (``economic_buyer``, ``champion``, ``technical_evaluator``, ``end_user``).",
"name": "buyer_role",
"type": "string"
},
{
+ "description": "Type of email domain (e.g. ``corporate``, ``free``); never resolves to a real domain.",
"name": "email_domain_type",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp when the contact record was first observed.",
"name": "created_at",
"type": "string"
}
@@ -692,30 +708,37 @@
"schema": {
"fields": [
{
+ "description": "Opaque lead identifier (e.g. ``lead_000001``). Primary key for the lead-scoring task.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "FK to ``contacts.contact_id`` — the primary contact attached to this lead.",
"name": "contact_id",
"type": "string"
},
{
+ "description": "FK to ``accounts.account_id`` — the buying organisation this lead belongs to.",
"name": "account_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp at which the lead was created (= snapshot anchor t=0).",
"name": "lead_created_at",
"type": "string"
},
{
+ "description": "Origination source of the lead (e.g. ``inbound_form``, ``sdr_outbound``, ``partner``).",
"name": "lead_source",
"type": "string"
},
{
+ "description": "Marketing channel responsible for the first recorded touch.",
"name": "first_touch_channel",
"type": "string"
},
{
+ "description": "Opaque sales-rep id (e.g. ``rep_000001``) owning the lead at snapshot time.",
"name": "owner_rep_id",
"type": "string"
}
@@ -728,30 +751,37 @@
"schema": {
"fields": [
{
+ "description": "Opaque touch identifier. Primary key.",
"name": "touch_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the touch. Public bundles filter to ``<= lead_created_at + snapshot_day`` per the redaction contract.",
"name": "touch_timestamp",
"type": "string"
},
{
+ "description": "Mechanism of the touch (e.g. ``email``, ``call``, ``ad_view``, ``content_download``).",
"name": "touch_type",
"type": "string"
},
{
+ "description": "Marketing/sales channel attribution (e.g. ``paid_search``, ``content``, ``cold_outreach``).",
"name": "touch_channel",
"type": "string"
},
{
+ "description": "``inbound`` (lead-initiated) or ``outbound`` (vendor-initiated).",
"name": "touch_direction",
"type": "string"
},
{
+ "description": "Opaque campaign identifier attached to the touch, or null when unattributed.",
"name": "campaign_id",
"type": "string"
}
@@ -764,34 +794,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque session identifier. Primary key.",
"name": "session_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the session start. Public bundles filter to ``<= lead_created_at + snapshot_day``.",
"name": "session_timestamp",
"type": "string"
},
{
+ "description": "Session type (e.g. ``marketing_site``, ``trial``, ``demo``).",
"name": "session_type",
"type": "string"
},
{
+ "description": "Total page views during the session.",
"name": "page_views",
"type": "integer"
},
{
+ "description": "Page views landing on a pricing URL during the session.",
"name": "pricing_page_views",
"type": "integer"
},
{
+ "description": "Page views landing on a demo URL during the session.",
"name": "demo_page_views",
"type": "integer"
},
{
+ "description": "Session duration in seconds.",
"name": "session_duration_seconds",
"type": "integer"
}
@@ -804,26 +842,32 @@
"schema": {
"fields": [
{
+ "description": "Opaque sales-activity identifier. Primary key.",
"name": "activity_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "Opaque sales-rep id performing the activity.",
"name": "rep_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the activity. Public bundles filter to ``<= lead_created_at + snapshot_day``.",
"name": "activity_timestamp",
"type": "string"
},
{
+ "description": "Activity mechanism (e.g. ``call``, ``email``, ``demo``, ``meeting``).",
"name": "activity_type",
"type": "string"
},
{
+ "description": "Logged outcome (e.g. ``connected``, ``voicemail``, ``no_answer``, ``meeting_set``).",
"name": "activity_outcome",
"type": "string"
}
@@ -836,22 +880,27 @@
"schema": {
"fields": [
{
+ "description": "Opaque opportunity identifier. Primary key.",
"name": "opportunity_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp the opportunity was created. Public bundles filter rows to ``<= lead_created_at + snapshot_day``.",
"name": "created_at",
"type": "string"
},
{
+ "description": "Current stage at snapshot time (e.g. ``prospecting``, ``demo``, ``negotiation``).",
"name": "stage",
"type": "string"
},
{
+ "description": "Estimated annual contract value at snapshot time (USD).",
"name": "estimated_acv",
"type": "integer"
}
@@ -862,6 +911,10 @@
"description": "Intro tier auto-rendered dataset card.",
"path": "intro/dataset_card.md"
},
+ {
+ "description": "Intro tier headline metrics (cross-seed medians + spreads, difficulty knobs, JSON-path back-reference to validation_report.json).",
+ "path": "intro/metrics.json"
+ },
{
"description": "Intro tier provenance manifest (recipe, seed, package version, file hashes, snapshot_day, redaction contract).",
"path": "intro/manifest.json"
@@ -1457,34 +1510,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque account identifier (e.g. ``acct_000001``). Primary key.",
"name": "account_id",
"type": "string"
},
{
+ "description": "Synthetic display name for the account (fictional). Not a feature in the snapshot.",
"name": "company_name",
"type": "string"
},
{
+ "description": "Industry vertical of the buying organisation; one of the recipe's industry vocabulary.",
"name": "industry",
"type": "string"
},
{
+ "description": "Geographic region of the account's headquarters (e.g. ``US``, ``UK``).",
"name": "region",
"type": "string"
},
{
+ "description": "Banded employee headcount of the account (e.g. ``200-500``, ``500-1000``, ``1000-2000``).",
"name": "employee_band",
"type": "string"
},
{
+ "description": "Banded estimated annual revenue of the account.",
"name": "estimated_revenue_band",
"type": "string"
},
{
+ "description": "Banded internal process-maturity score of the account (drives ICP fit).",
"name": "process_maturity_band",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp when the account was first observed (synthetic creation time).",
"name": "created_at",
"type": "string"
}
@@ -1497,34 +1558,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque contact identifier (e.g. ``cont_000001``). Primary key.",
"name": "contact_id",
"type": "string"
},
{
+ "description": "FK to ``accounts.account_id`` — the buying organisation this contact belongs to.",
"name": "account_id",
"type": "string"
},
{
+ "description": "Free-text job title (fictional). Used only for narrative colour; not a feature.",
"name": "job_title",
"type": "string"
},
{
+ "description": "Functional area of the contact (e.g. ``finance``, ``ops``, ``it``, ``procurement``).",
"name": "role_function",
"type": "string"
},
{
+ "description": "Seniority band of the contact (e.g. ``c_level``, ``vp``, ``director``, ``manager``).",
"name": "seniority",
"type": "string"
},
{
+ "description": "Buyer-role classification (``economic_buyer``, ``champion``, ``technical_evaluator``, ``end_user``).",
"name": "buyer_role",
"type": "string"
},
{
+ "description": "Type of email domain (e.g. ``corporate``, ``free``); never resolves to a real domain.",
"name": "email_domain_type",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp when the contact record was first observed.",
"name": "created_at",
"type": "string"
}
@@ -1537,30 +1606,37 @@
"schema": {
"fields": [
{
+ "description": "Opaque lead identifier (e.g. ``lead_000001``). Primary key for the lead-scoring task.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "FK to ``contacts.contact_id`` — the primary contact attached to this lead.",
"name": "contact_id",
"type": "string"
},
{
+ "description": "FK to ``accounts.account_id`` — the buying organisation this lead belongs to.",
"name": "account_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp at which the lead was created (= snapshot anchor t=0).",
"name": "lead_created_at",
"type": "string"
},
{
+ "description": "Origination source of the lead (e.g. ``inbound_form``, ``sdr_outbound``, ``partner``).",
"name": "lead_source",
"type": "string"
},
{
+ "description": "Marketing channel responsible for the first recorded touch.",
"name": "first_touch_channel",
"type": "string"
},
{
+ "description": "Opaque sales-rep id (e.g. ``rep_000001``) owning the lead at snapshot time.",
"name": "owner_rep_id",
"type": "string"
}
@@ -1573,30 +1649,37 @@
"schema": {
"fields": [
{
+ "description": "Opaque touch identifier. Primary key.",
"name": "touch_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the touch. Public bundles filter to ``<= lead_created_at + snapshot_day`` per the redaction contract.",
"name": "touch_timestamp",
"type": "string"
},
{
+ "description": "Mechanism of the touch (e.g. ``email``, ``call``, ``ad_view``, ``content_download``).",
"name": "touch_type",
"type": "string"
},
{
+ "description": "Marketing/sales channel attribution (e.g. ``paid_search``, ``content``, ``cold_outreach``).",
"name": "touch_channel",
"type": "string"
},
{
+ "description": "``inbound`` (lead-initiated) or ``outbound`` (vendor-initiated).",
"name": "touch_direction",
"type": "string"
},
{
+ "description": "Opaque campaign identifier attached to the touch, or null when unattributed.",
"name": "campaign_id",
"type": "string"
}
@@ -1609,34 +1692,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque session identifier. Primary key.",
"name": "session_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the session start. Public bundles filter to ``<= lead_created_at + snapshot_day``.",
"name": "session_timestamp",
"type": "string"
},
{
+ "description": "Session type (e.g. ``marketing_site``, ``trial``, ``demo``).",
"name": "session_type",
"type": "string"
},
{
+ "description": "Total page views during the session.",
"name": "page_views",
"type": "integer"
},
{
+ "description": "Page views landing on a pricing URL during the session.",
"name": "pricing_page_views",
"type": "integer"
},
{
+ "description": "Page views landing on a demo URL during the session.",
"name": "demo_page_views",
"type": "integer"
},
{
+ "description": "Session duration in seconds.",
"name": "session_duration_seconds",
"type": "integer"
}
@@ -1649,26 +1740,32 @@
"schema": {
"fields": [
{
+ "description": "Opaque sales-activity identifier. Primary key.",
"name": "activity_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "Opaque sales-rep id performing the activity.",
"name": "rep_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the activity. Public bundles filter to ``<= lead_created_at + snapshot_day``.",
"name": "activity_timestamp",
"type": "string"
},
{
+ "description": "Activity mechanism (e.g. ``call``, ``email``, ``demo``, ``meeting``).",
"name": "activity_type",
"type": "string"
},
{
+ "description": "Logged outcome (e.g. ``connected``, ``voicemail``, ``no_answer``, ``meeting_set``).",
"name": "activity_outcome",
"type": "string"
}
@@ -1681,22 +1778,27 @@
"schema": {
"fields": [
{
+ "description": "Opaque opportunity identifier. Primary key.",
"name": "opportunity_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp the opportunity was created. Public bundles filter rows to ``<= lead_created_at + snapshot_day``.",
"name": "created_at",
"type": "string"
},
{
+ "description": "Current stage at snapshot time (e.g. ``prospecting``, ``demo``, ``negotiation``).",
"name": "stage",
"type": "string"
},
{
+ "description": "Estimated annual contract value at snapshot time (USD).",
"name": "estimated_acv",
"type": "integer"
}
@@ -1707,6 +1809,10 @@
"description": "Intermediate tier auto-rendered dataset card.",
"path": "intermediate/dataset_card.md"
},
+ {
+ "description": "Intermediate tier headline metrics (cross-seed medians + spreads, difficulty knobs, JSON-path back-reference to validation_report.json).",
+ "path": "intermediate/metrics.json"
+ },
{
"description": "Intermediate tier provenance manifest (recipe, seed, package version, file hashes, snapshot_day, redaction contract).",
"path": "intermediate/manifest.json"
@@ -2302,34 +2408,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque account identifier (e.g. ``acct_000001``). Primary key.",
"name": "account_id",
"type": "string"
},
{
+ "description": "Synthetic display name for the account (fictional). Not a feature in the snapshot.",
"name": "company_name",
"type": "string"
},
{
+ "description": "Industry vertical of the buying organisation; one of the recipe's industry vocabulary.",
"name": "industry",
"type": "string"
},
{
+ "description": "Geographic region of the account's headquarters (e.g. ``US``, ``UK``).",
"name": "region",
"type": "string"
},
{
+ "description": "Banded employee headcount of the account (e.g. ``200-500``, ``500-1000``, ``1000-2000``).",
"name": "employee_band",
"type": "string"
},
{
+ "description": "Banded estimated annual revenue of the account.",
"name": "estimated_revenue_band",
"type": "string"
},
{
+ "description": "Banded internal process-maturity score of the account (drives ICP fit).",
"name": "process_maturity_band",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp when the account was first observed (synthetic creation time).",
"name": "created_at",
"type": "string"
}
@@ -2342,34 +2456,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque contact identifier (e.g. ``cont_000001``). Primary key.",
"name": "contact_id",
"type": "string"
},
{
+ "description": "FK to ``accounts.account_id`` — the buying organisation this contact belongs to.",
"name": "account_id",
"type": "string"
},
{
+ "description": "Free-text job title (fictional). Used only for narrative colour; not a feature.",
"name": "job_title",
"type": "string"
},
{
+ "description": "Functional area of the contact (e.g. ``finance``, ``ops``, ``it``, ``procurement``).",
"name": "role_function",
"type": "string"
},
{
+ "description": "Seniority band of the contact (e.g. ``c_level``, ``vp``, ``director``, ``manager``).",
"name": "seniority",
"type": "string"
},
{
+ "description": "Buyer-role classification (``economic_buyer``, ``champion``, ``technical_evaluator``, ``end_user``).",
"name": "buyer_role",
"type": "string"
},
{
+ "description": "Type of email domain (e.g. ``corporate``, ``free``); never resolves to a real domain.",
"name": "email_domain_type",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp when the contact record was first observed.",
"name": "created_at",
"type": "string"
}
@@ -2382,30 +2504,37 @@
"schema": {
"fields": [
{
+ "description": "Opaque lead identifier (e.g. ``lead_000001``). Primary key for the lead-scoring task.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "FK to ``contacts.contact_id`` — the primary contact attached to this lead.",
"name": "contact_id",
"type": "string"
},
{
+ "description": "FK to ``accounts.account_id`` — the buying organisation this lead belongs to.",
"name": "account_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp at which the lead was created (= snapshot anchor t=0).",
"name": "lead_created_at",
"type": "string"
},
{
+ "description": "Origination source of the lead (e.g. ``inbound_form``, ``sdr_outbound``, ``partner``).",
"name": "lead_source",
"type": "string"
},
{
+ "description": "Marketing channel responsible for the first recorded touch.",
"name": "first_touch_channel",
"type": "string"
},
{
+ "description": "Opaque sales-rep id (e.g. ``rep_000001``) owning the lead at snapshot time.",
"name": "owner_rep_id",
"type": "string"
}
@@ -2418,30 +2547,37 @@
"schema": {
"fields": [
{
+ "description": "Opaque touch identifier. Primary key.",
"name": "touch_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the touch. Public bundles filter to ``<= lead_created_at + snapshot_day`` per the redaction contract.",
"name": "touch_timestamp",
"type": "string"
},
{
+ "description": "Mechanism of the touch (e.g. ``email``, ``call``, ``ad_view``, ``content_download``).",
"name": "touch_type",
"type": "string"
},
{
+ "description": "Marketing/sales channel attribution (e.g. ``paid_search``, ``content``, ``cold_outreach``).",
"name": "touch_channel",
"type": "string"
},
{
+ "description": "``inbound`` (lead-initiated) or ``outbound`` (vendor-initiated).",
"name": "touch_direction",
"type": "string"
},
{
+ "description": "Opaque campaign identifier attached to the touch, or null when unattributed.",
"name": "campaign_id",
"type": "string"
}
@@ -2454,34 +2590,42 @@
"schema": {
"fields": [
{
+ "description": "Opaque session identifier. Primary key.",
"name": "session_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the session start. Public bundles filter to ``<= lead_created_at + snapshot_day``.",
"name": "session_timestamp",
"type": "string"
},
{
+ "description": "Session type (e.g. ``marketing_site``, ``trial``, ``demo``).",
"name": "session_type",
"type": "string"
},
{
+ "description": "Total page views during the session.",
"name": "page_views",
"type": "integer"
},
{
+ "description": "Page views landing on a pricing URL during the session.",
"name": "pricing_page_views",
"type": "integer"
},
{
+ "description": "Page views landing on a demo URL during the session.",
"name": "demo_page_views",
"type": "integer"
},
{
+ "description": "Session duration in seconds.",
"name": "session_duration_seconds",
"type": "integer"
}
@@ -2494,26 +2638,32 @@
"schema": {
"fields": [
{
+ "description": "Opaque sales-activity identifier. Primary key.",
"name": "activity_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "Opaque sales-rep id performing the activity.",
"name": "rep_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp of the activity. Public bundles filter to ``<= lead_created_at + snapshot_day``.",
"name": "activity_timestamp",
"type": "string"
},
{
+ "description": "Activity mechanism (e.g. ``call``, ``email``, ``demo``, ``meeting``).",
"name": "activity_type",
"type": "string"
},
{
+ "description": "Logged outcome (e.g. ``connected``, ``voicemail``, ``no_answer``, ``meeting_set``).",
"name": "activity_outcome",
"type": "string"
}
@@ -2526,22 +2676,27 @@
"schema": {
"fields": [
{
+ "description": "Opaque opportunity identifier. Primary key.",
"name": "opportunity_id",
"type": "string"
},
{
+ "description": "FK to ``leads.lead_id``.",
"name": "lead_id",
"type": "string"
},
{
+ "description": "ISO-8601 timestamp the opportunity was created. Public bundles filter rows to ``<= lead_created_at + snapshot_day``.",
"name": "created_at",
"type": "string"
},
{
+ "description": "Current stage at snapshot time (e.g. ``prospecting``, ``demo``, ``negotiation``).",
"name": "stage",
"type": "string"
},
{
+ "description": "Estimated annual contract value at snapshot time (USD).",
"name": "estimated_acv",
"type": "integer"
}
@@ -2552,9 +2707,57 @@
"description": "Advanced tier auto-rendered dataset card.",
"path": "advanced/dataset_card.md"
},
+ {
+ "description": "Advanced tier headline metrics (cross-seed medians + spreads, difficulty knobs, JSON-path back-reference to validation_report.json).",
+ "path": "advanced/metrics.json"
+ },
{
"description": "Advanced tier provenance manifest (recipe, seed, package version, file hashes, snapshot_day, redaction contract).",
"path": "advanced/manifest.json"
+ },
+ {
+ "description": "Top-level cross-tier headline metrics (medians + spreads + cohort-shift + cross-tier ordering booleans). Machine-readable summary backing the README's Calibration table.",
+ "path": "metrics.json"
+ },
+ {
+ "description": "Claims register (human-readable table). Rendered from `claims_register_source.yaml`.",
+ "path": "claims_register.md"
+ },
+ {
+ "description": "Claims register (machine-readable). Each numerical / structural claim in the README paired with its backing artifact and JSON / YAML path.",
+ "path": "claims_register.json"
+ },
+ {
+ "description": "Claims-register source YAML — hand-edited; `claims_register.{md,json}` are rendered from this.",
+ "path": "claims_register_source.yaml"
+ },
+ {
+ "description": "Adversarial-framing guide: nine breakage patterns (leakage, split contamination, ranking inversions, calibration drift) with worked-example detection recipes.",
+ "path": "docs/break_me_guide.md"
+ },
+ {
+ "description": "Empirical backing for the 'channel signal is weak' claim — out-of-sample univariate AUCs of `lead_source` per tier.",
+ "path": "docs/channel_signal_audit.md"
+ },
+ {
+ "description": "Long-form per-feature documentation grouped by analytical role; companion to the per-tier `feature_dictionary.csv` machine-readable spec.",
+ "path": "docs/feature_dictionary.md"
+ },
+ {
+ "description": "Generation method (DGP description) — what is and isn't modelled by the simulator.",
+ "path": "docs/generation_method.md"
+ },
+ {
+ "description": "Per-column descriptions for the 7 public relational tables (and the 2 instructor-only ones) — surfaced into the schema-section of this page.",
+ "path": "docs/relational_table_schemas.csv"
+ },
+ {
+ "description": "Operational acceptance bands per gate (G5–G8); the source-of-truth thresholds the validator checks against.",
+ "path": "docs/v1_acceptance_gates_bands.yaml"
+ },
+ {
+ "description": "Accepted-for-v2 findings register — issues flagged in v1 that are scoped to the v2 release.",
+ "path": "docs/v2_decision_log.md"
}
],
"subtitle": "Three-tier synthetic CRM funnel for leakage-aware lead scoring",
diff --git a/release/metrics.json b/release/metrics.json
new file mode 100644
index 0000000..827de9f
--- /dev/null
+++ b/release/metrics.json
@@ -0,0 +1,219 @@
+{
+ "acceptance_bands": {
+ "file": "release/docs/v1_acceptance_gates_bands.yaml",
+ "format": "yaml"
+ },
+ "cohort_shift": {
+ "advanced": {
+ "auc_degradation": 0.0098,
+ "cohort_split_auc": 0.8628,
+ "random_split_auc": 0.8726,
+ "seed": 42
+ },
+ "intermediate": {
+ "auc_degradation": -0.0155,
+ "cohort_split_auc": 0.8908,
+ "random_split_auc": 0.8754,
+ "seed": 42
+ },
+ "intro": {
+ "auc_degradation": 0.0156,
+ "cohort_split_auc": 0.8573,
+ "random_split_auc": 0.8729,
+ "seed": 42
+ }
+ },
+ "cross_tier_ordering": {
+ "average_precision_intermediate_gt_advanced": true,
+ "average_precision_intro_gt_intermediate": true,
+ "by_average_precision": [
+ "intro",
+ "intermediate",
+ "advanced"
+ ],
+ "by_conversion_rate": [
+ "intro",
+ "intermediate",
+ "advanced"
+ ],
+ "by_gbm_minus_lr": [
+ "intro",
+ "intermediate",
+ "advanced"
+ ],
+ "by_precision_at_100": [
+ "intro",
+ "intermediate",
+ "advanced"
+ ],
+ "conversion_rate_intermediate_gt_advanced": true,
+ "conversion_rate_intro_gt_intermediate": true,
+ "gbm_minus_lr_positive_in_every_tier": false,
+ "precision_at_100_intermediate_gt_advanced": true,
+ "precision_at_100_intro_gt_intermediate": true
+ },
+ "generation_timestamp": "2026-05-06T07:38:31+00:00",
+ "notes": "Headline metrics surfaced in the README are cross-seed medians over the canonical N=5 sweep (seeds 42-46). Per-seed values live under tiers.<tier>.per_seed in validation_report.json.",
+ "package_version": "1.0.0",
+ "release_id": "leadforge-lead-scoring-v1",
+ "seeds": [
+ 42,
+ 43,
+ 44,
+ 45,
+ 46
+ ],
+ "source_of_truth": {
+ "file": "release/validation/validation_report.json",
+ "regenerated_by": "scripts/validate_release_candidate.py"
+ },
+ "tiers": {
+ "advanced": {
+ "acceptance_bands": {
+ "file": "release/docs/v1_acceptance_gates_bands.yaml",
+ "yaml_path": "per_tier.advanced"
+ },
+ "difficulty_knobs": {
+ "missing_rate": 0.18,
+ "noise_scale": 0.55,
+ "signal_strength": 0.5
+ },
+ "medians": {
+ "brier_score": 0.0611,
+ "calibration_max_bin_error": 0.5234,
+ "conversion_rate_test": 0.084,
+ "gbm_auc": 0.8726,
+ "gbm_average_precision": 0.3239,
+ "gbm_minus_lr_auc": -0.0133,
+ "log_loss": 0.1947,
+ "lr_auc": 0.8861,
+ "lr_average_precision": 0.3514,
+ "precision_at_100": 0.34,
+ "top_decile_rate": 0.3333
+ },
+ "n_seeds": 5,
+ "seeds": [
+ 42,
+ 43,
+ 44,
+ 45,
+ 46
+ ],
+ "source_of_truth": {
+ "file": "release/validation/validation_report.json",
+ "json_path": "$.tiers.advanced"
+ },
+ "spreads_max_minus_min": {
+ "brier_score": 0.0152,
+ "calibration_max_bin_error": 0.4828,
+ "conversion_rate_test": 0.02,
+ "gbm_auc": 0.0171,
+ "gbm_average_precision": 0.0324,
+ "gbm_minus_lr_auc": 0.0251,
+ "log_loss": 0.0535,
+ "lr_auc": 0.0401,
+ "lr_average_precision": 0.0814,
+ "top_decile_rate": 0.0533
+ },
+ "tier": "advanced"
+ },
+ "intermediate": {
+ "acceptance_bands": {
+ "file": "release/docs/v1_acceptance_gates_bands.yaml",
+ "yaml_path": "per_tier.intermediate"
+ },
+ "difficulty_knobs": {
+ "missing_rate": 0.08,
+ "noise_scale": 0.3,
+ "signal_strength": 0.7
+ },
+ "medians": {
+ "brier_score": 0.1096,
+ "calibration_max_bin_error": 0.249,
+ "conversion_rate_test": 0.216,
+ "gbm_auc": 0.8755,
+ "gbm_average_precision": 0.5621,
+ "gbm_minus_lr_auc": -0.0072,
+ "log_loss": 0.33,
+ "lr_auc": 0.8859,
+ "lr_average_precision": 0.5752,
+ "precision_at_100": 0.59,
+ "top_decile_rate": 0.5867
+ },
+ "n_seeds": 5,
+ "seeds": [
+ 42,
+ 43,
+ 44,
+ 45,
+ 46
+ ],
+ "source_of_truth": {
+ "file": "release/validation/validation_report.json",
+ "json_path": "$.tiers.intermediate"
+ },
+ "spreads_max_minus_min": {
+ "brier_score": 0.0161,
+ "calibration_max_bin_error": 0.3215,
+ "conversion_rate_test": 0.0467,
+ "gbm_auc": 0.027,
+ "gbm_average_precision": 0.0593,
+ "gbm_minus_lr_auc": 0.0152,
+ "log_loss": 0.035,
+ "lr_auc": 0.023,
+ "lr_average_precision": 0.0863,
+ "top_decile_rate": 0.12
+ },
+ "tier": "intermediate"
+ },
+ "intro": {
+ "acceptance_bands": {
+ "file": "release/docs/v1_acceptance_gates_bands.yaml",
+ "yaml_path": "per_tier.intro"
+ },
+ "difficulty_knobs": {
+ "missing_rate": 0.02,
+ "noise_scale": 0.1,
+ "signal_strength": 0.9
+ },
+ "medians": {
+ "brier_score": 0.1301,
+ "calibration_max_bin_error": 0.2497,
+ "conversion_rate_test": 0.4267,
+ "gbm_auc": 0.8729,
+ "gbm_average_precision": 0.7527,
+ "gbm_minus_lr_auc": -0.0045,
+ "log_loss": 0.4008,
+ "lr_auc": 0.8788,
+ "lr_average_precision": 0.7608,
+ "precision_at_100": 0.8,
+ "top_decile_rate": 0.7733
+ },
+ "n_seeds": 5,
+ "seeds": [
+ 42,
+ 43,
+ 44,
+ 45,
+ 46
+ ],
+ "source_of_truth": {
+ "file": "release/validation/validation_report.json",
+ "json_path": "$.tiers.intro"
+ },
+ "spreads_max_minus_min": {
+ "brier_score": 0.0184,
+ "calibration_max_bin_error": 0.196,
+ "conversion_rate_test": 0.092,
+ "gbm_auc": 0.0232,
+ "gbm_average_precision": 0.06,
+ "gbm_minus_lr_auc": 0.0225,
+ "log_loss": 0.0557,
+ "lr_auc": 0.0272,
+ "lr_average_precision": 0.067,
+ "top_decile_rate": 0.08
+ },
+ "tier": "intro"
+ }
+ }
+}
diff --git a/scripts/_preview_common.py b/scripts/_preview_common.py
index 2338a1d..1437369 100644
--- a/scripts/_preview_common.py
+++ b/scripts/_preview_common.py
@@ -21,6 +21,7 @@
from __future__ import annotations
import http.server
+import json
import sys
import webbrowser
from pathlib import Path
@@ -61,6 +62,87 @@ def plural(n: int, singular: str, plural_form: str | None = None) -> str:
return f"{n} {word}"
+def render_jsonld_dataset(
+ *,
+ name: str,
+ description: str,
+ license_url: str,
+ keywords: list[str],
+ citation: str | None = None,
+ distribution_paths: list[str] | None = None,
+ same_as: list[str] | None = None,
+ creator: str | None = None,
+ version: str | None = None,
+) -> str:
+ """Render a schema.org ``Dataset`` JSON-LD ``'
+
+
+def _encoding_format_for(path: str) -> str:
+    """Map a filename suffix to a MIME-ish encoding-format token.
+
+    Limited to the suffixes used in the release bundle (parquet, CSV,
+    JSON, Markdown, YAML, PNG); matching is case-insensitive. Unknown
+    or missing suffixes fall back to ``application/octet-stream`` —
+    keeps the JSON-LD block well-typed without empty-string surprises.
+    """
+
+    suffix = path.rsplit(".", 1)[-1].lower() if "." in path else ""
+    return {
+        "parquet": "application/vnd.apache.parquet",
+        "csv": "text/csv",
+        "json": "application/json",
+        "md": "text/markdown",
+        "yaml": "application/x-yaml",
+        "yml": "application/x-yaml",
+        "png": "image/png",
+    }.get(suffix, "application/octet-stream")
+
+
def render_cover(filename: str) -> str:
"""Render a sibling-relative cover-image block.
diff --git a/scripts/_release_common.py b/scripts/_release_common.py
index 4d5f4d9..8dcb34b 100644
--- a/scripts/_release_common.py
+++ b/scripts/_release_common.py
@@ -95,13 +95,17 @@ class ValidationError:
release/
├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier
│ ├── manifest.json # provenance + file hashes
+│ ├── metrics.json # per-tier headline metrics (medians + spreads)
│ ├── dataset_card.md # auto-rendered per-bundle card
│ ├── feature_dictionary.csv # authoritative column spec
│ ├── lead_scoring.csv # flat convenience CSV (all splits)
│ ├── tables/*.parquet # 7 snapshot-safe relational tables
│ └── tasks/converted_within_90_days/{train,valid,test}.parquet
├── intermediate_instructor/ # research companion: full-horizon tables + metadata/
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
├── notebooks/ # 01 baseline · 02 relational · 03 leakage · 04 calibration
+├── metrics.json # top-level cross-tier metrics summary
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
└── validation/ # validation_report.{json,md} + figures
```"""
@@ -299,3 +303,64 @@ def load_manifest(path: Path) -> dict[str, Any]:
if not isinstance(payload, dict):
raise ValueError(f"manifest.json at {path} is not a JSON object")
return payload
+
+
+# ---------------------------------------------------------------------------
+# Per-table column descriptions (vendored under release/docs/)
+# ---------------------------------------------------------------------------
+
+#: Path within the release tree of the hand-authored per-table
+#: column-description CSV. Keyed by ``(table, column)``; consumed by the
+#: Kaggle packager so ``resources[].schema.fields[].description`` is
+#: populated for parquet tables (the preview's ``col__desc`` column
+#: was previously empty for relational tables — a thin spot for AI
+#: reviewers who can't open the parquet directly).
+RELATIONAL_TABLE_SCHEMAS_REL: Final[Path] = Path("docs/relational_table_schemas.csv")
+
+
+def load_relational_column_descriptions(release_dir: Path) -> dict[tuple[str, str], str]:
+    """Load per-table column descriptions keyed by ``(table, column)``.
+
+    Returns an empty dict if the CSV is missing — callers should treat
+    descriptions as optional. Rows lacking a non-blank ``table``,
+    ``column`` or ``description`` cell (short rows included) are skipped.
+    """
+
+    import csv
+
+    path = release_dir / RELATIONAL_TABLE_SCHEMAS_REL
+    if not path.is_file():
+        return {}
+    descriptions: dict[tuple[str, str], str] = {}
+    with path.open(encoding="utf-8", newline="") as f:  # csv module expects newline=""
+        for row in csv.DictReader(f):  # short rows yield None for the missing cells
+            table = (row.get("table") or "").strip()
+            column = (row.get("column") or "").strip()
+            description = (row.get("description") or "").strip()
+            if table and column and description:
+                descriptions[(table, column)] = description
+    return descriptions
+
+
+# ---------------------------------------------------------------------------
+# Agent-reviewable artifact set
+# ---------------------------------------------------------------------------
+
+#: Files at the release root that should ship in every platform's upload
+#: tree to make the bundle self-contained for agent / human review
+#: without needing GitHub access. Entries are ``(source_rel,
+#: required)`` tuples: ``required=True`` causes the packager to
+#: surface a ValidationError if the file is missing at packaging time
+#: (these are committed artifacts; their absence indicates the release
+#: was incomplete).
+AGENT_REVIEWABLE_ROOT_FILES: Final[tuple[tuple[str, bool], ...]] = (
+ ("metrics.json", True),
+ ("claims_register.md", True),
+ ("claims_register.json", True),
+ ("claims_register_source.yaml", False),
+)
+
+#: Sub-directory under the release root containing vendored docs
+#: (DGP description, leakage / acceptance bands, break-me guide, etc.).
+#: Copied wholesale into the upload tree when present.
+AGENT_REVIEWABLE_DOCS_DIR: Final[str] = "docs"
diff --git a/scripts/build_claims_register.py b/scripts/build_claims_register.py
new file mode 100644
index 0000000..5920fce
--- /dev/null
+++ b/scripts/build_claims_register.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python3
+"""Render the claims register from its YAML source.
+
+``release/claims_register_source.yaml`` is the hand-edited source of
+truth: every numerical / structural claim in the README plus a pointer
+to the artifact and path that backs it. This script renders two
+machine-friendly outputs into the release tree:
+
+* ``release/claims_register.json`` — structured payload an agent can
+ parse without YAML support. Includes the same claim metadata plus
+ a top-level ``schema`` block describing the field semantics so a
+ fresh agent doesn't have to infer them.
+* ``release/claims_register.md`` — table-rendered version of the same
+ data for humans skimming on GitHub or Kaggle.
+
+Both files are deterministic: same source YAML → byte-identical
+output. ``--check`` mode reports drift as exit-code-1 without
+overwriting (CI use).
+
+Exit codes: 0 success / 1 ``--check`` mode and outputs are stale /
+2 pre-flight error (source missing / malformed).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any, Final
+
+import yaml
+
+REPO_ROOT: Final[Path] = Path(__file__).resolve().parent.parent
+DEFAULT_RELEASE_DIR: Final[Path] = REPO_ROOT / "release"
+DEFAULT_SOURCE: Final[Path] = DEFAULT_RELEASE_DIR / "claims_register_source.yaml"
+
+#: Allowed category vocabulary; failing this is a build error.
+VALID_CATEGORIES: Final[frozenset[str]] = frozenset(
+ {
+ "composition",
+ "calibration",
+ "redaction",
+ "difficulty",
+ "limitations",
+ "splits",
+ "provenance",
+ "out_of_scope",
+ "intended_use",
+ }
+)
+
+#: Required keys on every claim entry.
+REQUIRED_CLAIM_KEYS: Final[tuple[str, ...]] = (
+ "id",
+ "text",
+ "category",
+ "backing_artifact",
+ "backing_path",
+ "verifier",
+)
+
+#: Schema description embedded in the JSON output so an agent landing
+#: on ``claims_register.json`` without other context can interpret the
+#: fields it sees.
+SCHEMA_DOC: Final[dict[str, str]] = {
+    "id": "Short stable identifier; quoted in CI failure messages.",
+    "text": "The claim as it appears in the README (verbatim, where practical).",
+    "category": (
+        "One of: composition, calibration, redaction, difficulty, limitations, "
+        "splits, provenance, out_of_scope, intended_use."
+    ),
+    "backing_artifact": (
+        "Path within the published bundle (or repo) that carries the source of "
+        "truth. ``<tier>`` is a placeholder for intro / intermediate / "
+        "advanced."
+    ),
+    "backing_path": (
+        "JSON-path / YAML-path / column reference inside the backing artifact, "
+        "or ``n/a`` for prose contracts and whole-file claims."
+    ),
+    "verifier": (
+        "Free-form name of the script / probe / test that re-derives the "
+        "claim end-to-end. ``n/a`` means the claim is a prose contract that "
+        "is not mechanically verifiable."
+    ),
+}
+
+
+def _validate(claims: list[dict[str, Any]]) -> list[str]:
+    """Return human-readable validation errors (empty = OK); required values must be strings."""
+    errors: list[str] = []
+    seen_ids: set[str] = set()
+    for idx, claim in enumerate(claims):
+        if not isinstance(claim, dict):
+            errors.append(f"claims[{idx}] is not a mapping")
+            continue
+        for key in REQUIRED_CLAIM_KEYS:
+            if claim.get(key) in (None, ""):
+                errors.append(f"claims[{idx}] missing required key {key!r}")
+            elif not isinstance(claim[key], str):
+                errors.append(f"claims[{idx}] key {key!r} must be a string")
+        cid, category = claim.get("id"), claim.get("category")
+        if isinstance(cid, str):
+            if cid in seen_ids:
+                errors.append(f"duplicate claim id {cid!r}")
+            seen_ids.add(cid)
+        if isinstance(category, str) and category not in VALID_CATEGORIES:
+            errors.append(f"claims[{idx}] category {category!r} not in {sorted(VALID_CATEGORIES)}")
+    return errors
+
+
+def load_claims(source_path: Path) -> list[dict[str, Any]]:
+    """Parse the claims YAML at ``source_path`` and return validated copies of its claims."""
+
+    if not source_path.is_file():
+        raise FileNotFoundError(f"claims source not found at {source_path}")
+    document = yaml.safe_load(source_path.read_text(encoding="utf-8"))
+    if not (isinstance(document, dict) and "claims" in document):
+        raise ValueError(f"{source_path}: expected top-level mapping with 'claims' key")
+    entries = document["claims"]
+    if not (isinstance(entries, list) and entries):
+        raise ValueError(f"{source_path}: 'claims' must be a non-empty list")
+    problems = _validate(entries)
+    if problems:
+        raise ValueError(f"{source_path} is invalid:\n - " + "\n - ".join(problems))
+    return [dict(entry) for entry in entries]
+
+
+def render_json(claims: list[dict[str, Any]]) -> str:
+    """Serialise the register (schema block, claims, notes) as deterministic JSON.
+
+    Only the six documented claim fields are emitted; keys are sorted and
+    the text ends with exactly one newline.
+    """
+
+    exported_fields = (
+        "id",
+        "text",
+        "category",
+        "backing_artifact",
+        "backing_path",
+        "verifier",
+    )
+    notes = (
+        "This register is rendered from release/claims_register_source.yaml. "
+        "Every claim in release/README.md should appear here. Agents and CI "
+        "can use the (backing_artifact, backing_path) tuple to locate the "
+        "source-of-truth value without parsing prose."
+    )
+    rows = [{field: claim[field] for field in exported_fields} for claim in claims]
+    document = {"schema": SCHEMA_DOC, "claims": rows, "notes": notes}
+    return json.dumps(document, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
+
+
+def _escape_md(text: str) -> str:
+    """Escape pipes and flatten line breaks so the cell doesn't break the table."""
+
+    return " ".join(text.splitlines()).replace("|", "\\|")
+
+
+def render_markdown(claims: list[dict[str, Any]]) -> str:
+    """Render the register as GitHub-flavoured markdown, one table per category.
+
+    Categories are emitted in sorted order; within a category, claims
+    keep their source-file order.
+    """
+
+    by_category: dict[str, list[dict[str, Any]]] = {}
+    for entry in claims:
+        by_category.setdefault(entry["category"], []).append(entry)
+
+    out = [
+        "# Claims register — `leadforge-lead-scoring-v1`",
+        "",
+        "Every numerical / structural claim made in `release/README.md` (and",
+        "copied onto the Kaggle / HuggingFace dataset pages), paired with the",
+        "artifact and path that backs it. This file is auto-rendered from",
+        "[`release/claims_register_source.yaml`](claims_register_source.yaml)",
+        "by `scripts/build_claims_register.py`. Edit the YAML, not this file.",
+        "",
+        "Tip for AI reviewers: `claims_register.json` is the machine-readable",
+        "twin of this document with the same data plus a schema block.",
+        "",
+    ]
+    for name in sorted(by_category):
+        out += [
+            f"## {name}",
+            "",
+            "| ID | Claim | Backing artifact | Path | Verifier |",
+            "|---|---|---|---|---|",
+        ]
+        for entry in by_category[name]:
+            cells = (
+                f"`{entry['id']}`",
+                _escape_md(entry["text"]),
+                f"`{_escape_md(entry['backing_artifact'])}`",
+                f"`{_escape_md(entry['backing_path'])}`",
+                f"`{_escape_md(entry['verifier'])}`",
+            )
+            out.append("| " + " | ".join(cells) + " |")
+        out.append("")
+
+    # Drop trailing blank entries so the file ends with exactly one
+    # newline and the ``end-of-file-fixer`` pre-commit hook has nothing to do.
+    while out and out[-1] == "":
+        out.pop()
+    return "\n".join(out) + "\n"
+
+
+def write_register(
+    release_dir: Path,
+    source_path: Path,
+    *,
+    check_only: bool,
+) -> list[Path]:
+    """Render both register files, write them unless ``check_only``; return the stale paths."""
+
+    claims = load_claims(source_path)
+    targets = (
+        (release_dir / "claims_register.json", render_json),
+        (release_dir / "claims_register.md", render_markdown),
+    )
+
+    stale: list[Path] = []
+    for target, render in targets:
+        content = render(claims)
+        shown = target.relative_to(REPO_ROOT) if target.is_relative_to(REPO_ROOT) else target
+        on_disk = target.read_text(encoding="utf-8") if target.is_file() else None
+        if on_disk == content:
+            continue
+        stale.append(shown)
+        if not check_only:
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_text(content, encoding="utf-8")
+    return stale
+
+
+def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    """Parse the command line for the claims-register builder.
+
+    ``argv`` of ``None`` makes argparse read ``sys.argv[1:]``.
+    """
+
+    cli = argparse.ArgumentParser(
+        prog="build_claims_register",
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    # The two path-valued options share everything but name, default and help.
+    path_options = (
+        ("--release-dir", DEFAULT_RELEASE_DIR, "release tree (default: %(default)s)"),
+        ("--source", DEFAULT_SOURCE, "path to claims_register_source.yaml (default: %(default)s)"),
+    )
+    for flag, default, help_text in path_options:
+        cli.add_argument(flag, type=Path, default=default, help=help_text)
+    cli.add_argument(
+        "--check",
+        action="store_true",
+        help="report stale outputs as exit-code-1 without overwriting (CI use)",
+    )
+    return cli.parse_args(argv)
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """CLI entry point; returns the exit code documented in the module docstring."""
+    args = parse_args(argv)
+
+    try:
+        stale = write_register(args.release_dir, args.source, check_only=args.check)
+    except (FileNotFoundError, ValueError, yaml.YAMLError) as exc:
+        # ``yaml.YAMLError`` is not a ``ValueError``: without it, malformed
+        # YAML escaped as a traceback instead of the pre-flight exit code 2.
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+
+    if args.check:
+        if stale:
+            print("error: claims register is stale:", file=sys.stderr)
+            for path in stale:
+                print(f" - {path}", file=sys.stderr)
+            print(
+                "run `python scripts/build_claims_register.py` to refresh.",
+                file=sys.stderr,
+            )
+            return 1
+        print("claims register is up to date.", file=sys.stderr)
+        return 0
+
+    if stale:
+        for path in stale:
+            print(f"wrote {path}", file=sys.stderr)
+    else:
+        print("claims register is already up to date.", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/build_release_metrics.py b/scripts/build_release_metrics.py
new file mode 100644
index 0000000..7864815
--- /dev/null
+++ b/scripts/build_release_metrics.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python3
+"""Emit machine-readable metrics summaries for agent reviewers.
+
+Headline metrics (LR AUC, AP, P@100, Brier, conversion rate, GBM-LR
+delta, cohort-shift, cross-tier ordering) currently live only in the
+README's markdown table. An AI reviewer landing on the published
+bundle would have to parse prose to verify any of them.
+
+This script reads ``release/validation/validation_report.json`` (the
+authoritative output of ``scripts/validate_release_candidate.py``) and
+writes:
+
+* ``release/metrics.json`` — top-level summary covering all three
+ tiers + cross-tier ordering + cohort-shift, with explicit JSON-path
+ back-references to the source-of-truth file. Lives at the bundle
+ root so the Kaggle and HuggingFace upload trees pick it up by
+ default.
+* ``release/<tier>/metrics.json`` (per tier, one of intro / intermediate
+ / advanced) — the per-tier slice plus difficulty knobs from the
+ recipe so each bundle is independently inspectable.
+
+Both files are deterministic: same ``validation_report.json`` →
+byte-identical output. ``--check`` mode reports drift as exit-code-1
+without overwriting (CI use).
+
+Exit codes: 0 success / 1 ``--check`` mode and metrics are stale /
+2 pre-flight error (validation_report.json missing / malformed).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any, Final
+
+REPO_ROOT: Final[Path] = Path(__file__).resolve().parent.parent
+
+DEFAULT_RELEASE_DIR: Final[Path] = REPO_ROOT / "release"
+DEFAULT_REPORT_PATH: Final[Path] = DEFAULT_RELEASE_DIR / "validation" / "validation_report.json"
+
+#: Per-tier "difficulty knobs" surfaced in the README. Sourced once
+#: here so the per-tier metrics file can include them inline; if the
+#: recipe ever changes these, update both this constant and the
+#: README's "Dataset summary" table.
+DIFFICULTY_KNOBS: Final[dict[str, dict[str, float]]] = {
+ "intro": {"signal_strength": 0.90, "noise_scale": 0.10, "missing_rate": 0.02},
+ "intermediate": {"signal_strength": 0.70, "noise_scale": 0.30, "missing_rate": 0.08},
+ "advanced": {"signal_strength": 0.50, "noise_scale": 0.55, "missing_rate": 0.18},
+}
+
+TIER_ORDER: Final[tuple[str, ...]] = ("intro", "intermediate", "advanced")
+
+#: Subset of headline metrics we surface in the metrics files. The
+#: full per-seed payload stays in ``validation_report.json``; this is
+#: the at-a-glance view an agent can verify without parsing every
+#: nested key.
+HEADLINE_KEYS: Final[tuple[str, ...]] = (
+ "lr_auc",
+ "gbm_auc",
+ "gbm_minus_lr_auc",
+ "lr_average_precision",
+ "gbm_average_precision",
+ "brier_score",
+ "log_loss",
+ "calibration_max_bin_error",
+ "conversion_rate_test",
+ "top_decile_rate",
+)
+
+
+def _round(value: Any, ndigits: int) -> Any:
+    """Round a numeric value to ``ndigits``, leaving non-numerics alone.
+
+    ``None``/NaN/±inf become JSON ``null`` (strict JSON has no literal for
+    them); bools pass through so ``True`` is not coerced to ``1.0``.
+    """
+
+    if value is None or isinstance(value, bool):
+        return value
+    if isinstance(value, float) and not math.isfinite(value):
+        return None
+    if isinstance(value, int | float):
+        return round(float(value), ndigits)
+    return value
+
+
+def _precision_at_100_median(per_seed: list[dict[str, Any]]) -> float | None:
+    """Return the cross-seed median of P@100, or ``None`` when no seed reports it.
+
+    ``per_seed[*].precision_at_k`` is a dict keyed by the stringified k
+    (``{"50": 0.84, "100": 0.80}``) in ``validation_report.json``; the
+    report's ``medians`` block has no P@100 entry, so it is derived here.
+    """
+
+    # Pull the k=100 entry out of every seed block; absent entries are ignored.
+    raw = ((block.get("precision_at_k") or {}).get("100") for block in per_seed)
+    ordered = sorted(float(v) for v in raw if v is not None)
+    if not ordered:
+        return None
+
+    # Odd count: the middle element. Even count: mean of the two central ones.
+    mid, is_odd = divmod(len(ordered), 2)
+    if is_odd:
+        return ordered[mid]
+    return 0.5 * (ordered[mid - 1] + ordered[mid])
+
+
+def _tier_summary(tier: str, tier_block: dict[str, Any]) -> dict[str, Any]:
+    """Per-tier slice for the metrics files (null / absent sub-blocks count as empty)."""
+
+    medians = tier_block.get("medians") or {}
+    spreads = tier_block.get("spreads") or {}
+    per_seed = tier_block.get("per_seed") or []
+    p100 = _precision_at_100_median(per_seed)
+
+    medians_out = {key: _round(medians.get(key), 4) for key in HEADLINE_KEYS}
+    spreads_out = {key: _round(spreads.get(key), 4) for key in HEADLINE_KEYS}
+    if p100 is not None:
+        medians_out["precision_at_100"] = _round(p100, 4)
+    # Prefer the declared seed list; else derive it, skipping blocks with no seed.
+    seeds = list(tier_block.get("seeds") or []) or sorted(
+        int(s["seed"]) for s in per_seed if s.get("seed") is not None
+    )
+    return {
+        "tier": tier,
+        "n_seeds": len(per_seed),
+        "seeds": seeds,
+        "difficulty_knobs": DIFFICULTY_KNOBS.get(tier, {}),
+        "medians": medians_out,
+        "spreads_max_minus_min": spreads_out,
+        "source_of_truth": {
+            "file": "release/validation/validation_report.json",
+            "json_path": f"$.tiers.{tier}",
+        },
+        "acceptance_bands": {
+            "file": "release/docs/v1_acceptance_gates_bands.yaml",
+            "yaml_path": f"per_tier.{tier}",
+        },
+    }
+
+
+def build_top_level_metrics(report: dict[str, Any]) -> dict[str, Any]:
+    """Assemble the top-level ``release/metrics.json`` payload."""
+    # ``or {}``: an explicit JSON null section is treated like a missing one.
+    tiers = report.get("tiers") or {}
+    cohort = report.get("cohort_shift") or {}
+    ordering = report.get("cross_tier_ordering", {})
+
+    tier_summaries = {
+        tier: _tier_summary(tier, tiers[tier]) for tier in TIER_ORDER if tier in tiers
+    }
+
+    cohort_out = {
+        tier: {
+            "random_split_auc": _round(cohort.get(tier, {}).get("random_split_auc"), 4),
+            "cohort_split_auc": _round(cohort.get(tier, {}).get("cohort_split_auc"), 4),
+            "auc_degradation": _round(cohort.get(tier, {}).get("auc_degradation"), 4),
+            "seed": cohort.get(tier, {}).get("seed"),
+        }
+        for tier in TIER_ORDER
+        if tier in cohort
+    }
+
+    return {
+        "release_id": report.get("release_id"),
+        "package_version": report.get("package_version"),
+        "generation_timestamp": report.get("generation_timestamp"),
+        "seeds": list(report.get("seeds") or []),
+        "tiers": tier_summaries,
+        "cross_tier_ordering": ordering,
+        "cohort_shift": cohort_out,
+        "source_of_truth": {
+            "file": "release/validation/validation_report.json",
+            "regenerated_by": "scripts/validate_release_candidate.py",
+        },
+        "acceptance_bands": {
+            "file": "release/docs/v1_acceptance_gates_bands.yaml",
+            "format": "yaml",
+        },
+        "notes": (
+            "Headline metrics surfaced in the README are cross-seed medians over "
+            "the canonical N=5 sweep (seeds 42-46). Per-seed values live under "
+            "tiers.<tier>.per_seed in validation_report.json."
+        ),
+    }
+
+
+def _render_json(payload: dict[str, Any]) -> str:
+    """Serialise ``payload`` as key-sorted, 2-space-indented JSON ending in one newline."""
+    serialised = json.dumps(payload, indent=2, sort_keys=True, ensure_ascii=False)
+    return f"{serialised}\n"
+
+
+def write_metrics(
+    release_dir: Path,
+    report_path: Path,
+    *,
+    check_only: bool,
+) -> tuple[list[Path], dict[str, Any]]:
+    """Write (or, with ``check_only``, just compare) the metrics files.
+
+    Returns ``(stale, top_level)``: the out-of-date paths and the root payload.
+    """
+
+    if not report_path.is_file():
+        raise FileNotFoundError(f"validation report not found at {report_path}")
+    report = json.loads(report_path.read_text(encoding="utf-8"))
+    if not isinstance(report, dict):
+        raise ValueError(f"{report_path} is not a JSON object")
+
+    top_level = build_top_level_metrics(report)
+    # Root summary first, then one file per tier. Per-tier bundle dirs are
+    # gitignored, so a tier whose directory is absent is skipped: that keeps
+    # the script safe on a fresh checkout that hasn't rebuilt the bundles.
+    # The release-day workflow regenerates bundles before running this.
+    targets = [(release_dir / "metrics.json", top_level)]
+    targets += [
+        (release_dir / tier / "metrics.json", summary)
+        for tier, summary in top_level["tiers"].items()
+        if (release_dir / tier).is_dir()
+    ]
+
+    stale: list[Path] = []
+    for target, payload in targets:
+        content = _render_json(payload)
+        shown = target.relative_to(REPO_ROOT) if target.is_relative_to(REPO_ROOT) else target
+        on_disk = target.read_text(encoding="utf-8") if target.is_file() else None
+        if on_disk != content:
+            stale.append(shown)
+            if not check_only:
+                target.parent.mkdir(parents=True, exist_ok=True)
+                target.write_text(content, encoding="utf-8")
+    return stale, top_level
+
+
+def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    """Parse the command line for the release-metrics builder.
+
+    ``argv`` of ``None`` makes argparse read ``sys.argv[1:]``.
+    """
+
+    cli = argparse.ArgumentParser(
+        prog="build_release_metrics",
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    # The two path-valued options share everything but name, default and help.
+    path_options = (
+        ("--release-dir", DEFAULT_RELEASE_DIR, "release tree (default: %(default)s)"),
+        ("--report-path", DEFAULT_REPORT_PATH, "path to validation_report.json (default: %(default)s)"),
+    )
+    for flag, default, help_text in path_options:
+        cli.add_argument(flag, type=Path, default=default, help=help_text)
+    cli.add_argument(
+        "--check",
+        action="store_true",
+        help="report stale metrics as exit-code-1 without overwriting (CI use)",
+    )
+    return cli.parse_args(argv)
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """CLI entry point: 0 = success, 1 = stale under ``--check``, 2 = pre-flight error."""
+
+    args = parse_args(argv)
+    try:
+        stale, _ = write_metrics(args.release_dir, args.report_path, check_only=args.check)
+    except (FileNotFoundError, ValueError) as exc:
+        # Missing or malformed validation_report.json.
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+
+    # ``--check`` never writes; it only reports drift.
+    if args.check and stale:
+        print("error: metrics files are stale:", file=sys.stderr)
+        for path in stale:
+            print(f" - {path}", file=sys.stderr)
+        print(
+            "run `python scripts/build_release_metrics.py` to refresh them.",
+            file=sys.stderr,
+        )
+        return 1
+    if args.check:
+        print("metrics files are up to date.", file=sys.stderr)
+        return 0
+
+    if not stale:
+        print("metrics files are already up to date.", file=sys.stderr)
+        return 0
+    for path in stale:
+        print(f"wrote {path}", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/package_hf_release.py b/scripts/package_hf_release.py
index 31ca3fd..650d7e5 100644
--- a/scripts/package_hf_release.py
+++ b/scripts/package_hf_release.py
@@ -57,6 +57,8 @@
sys.path.insert(0, str(Path(__file__).resolve().parent))
from _release_common import ( # noqa: E402,F401 — must follow sys.path insert
+ AGENT_REVIEWABLE_DOCS_DIR,
+ AGENT_REVIEWABLE_ROOT_FILES,
GITHUB_BLOB_BASE,
SOURCE_TREE_BLOCK,
ValidationError,
@@ -142,11 +144,15 @@
.
├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier
│ ├── manifest.json # provenance + file hashes
+│ ├── metrics.json # per-tier headline metrics (medians + spreads)
│ ├── dataset_card.md # auto-rendered per-bundle card
│ ├── feature_dictionary.csv # authoritative column spec
│ ├── lead_scoring.csv # flat convenience CSV (all splits)
│ ├── tables/*.parquet # 7 snapshot-safe relational tables
│ └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
+├── metrics.json # top-level cross-tier metrics summary
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
├── README.md # this file (HF dataset card)
├── dataset-cover-image.png # dataset thumbnail
└── LICENSE
@@ -161,6 +167,8 @@
│ ├── tables/*.parquet # full-horizon tables (incl. customers, subscriptions)
│ ├── tasks/converted_within_90_days/{train,valid,test}.parquet
│ └── metadata/ # world_spec, graph.{graphml,json}, latent_registry, etc.
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
├── README.md # this file (HF dataset card)
├── dataset-cover-image.png # dataset thumbnail
└── LICENSE
@@ -298,6 +306,25 @@ def _hf_public_readme_text(readme: str) -> str:
every parquet file.
- **Bundle schema version.** 5 (matches the public dataset).
+## Agent-reviewable artifacts
+
+The companion ships the same self-contained review surface as the public
+bundle so an AI reviewer (or a researcher without GitHub access) can
+verify claims locally:
+
+- ``docs/`` — vendored copies of the generation method, leakage probes
+ contract, acceptance bands, break-me guide, v2 decision log, and the
+ per-relational-table column descriptions (`relational_table_schemas.csv`).
+- ``claims_register.{{md,json}}`` — every numerical / structural claim
+ in this card paired with the artifact and path that backs it.
+- ``intermediate/manifest.json`` and ``intermediate/feature_dictionary.csv``
+ — SHA-256-hashed provenance and the authoritative column spec.
+
+The instructor companion intentionally omits the top-level
+``metrics.json`` (cross-tier medians would be misleading for a single
+tier). Use the public dataset's ``metrics.json`` when comparing tier
+behaviour.
+
## Maintenance, license
We *want* the dataset to be broken. See the
@@ -697,6 +724,37 @@ def assemble_upload_dir(
if license_src.exists():
replace_file(license_src, upload_dir / "LICENSE")
+ # Agent-reviewable root files (metrics.json, claims_register.*).
+ # The public variant ships the cross-tier ``metrics.json``; the
+ # instructor companion intentionally omits it (single-tier dataset
+ # — cross-tier numbers would mislead). Both variants ship the
+ # claims register and the vendored docs subtree so an AI reviewer
+ # never has to follow github.com/blob/main/... links to verify
+ # whatever's on the README.
+ public_root_files = {
+ "metrics.json",
+ "claims_register.md",
+ "claims_register.json",
+ "claims_register_source.yaml",
+ }
+ instructor_root_files = {
+ "claims_register.md",
+ "claims_register.json",
+ "claims_register_source.yaml",
+ }
+ allow_for_variant = public_root_files if variant == "public" else instructor_root_files
+ for rel, _required in AGENT_REVIEWABLE_ROOT_FILES:
+ if rel not in allow_for_variant:
+ continue
+ src = release_dir / rel
+ if src.is_file():
+ replace_file(src, upload_dir / rel)
+
+ # Vendored docs subtree.
+ docs_src = release_dir / AGENT_REVIEWABLE_DOCS_DIR
+ if docs_src.is_dir():
+ replace_dir(docs_src, upload_dir / AGENT_REVIEWABLE_DOCS_DIR)
+
# Per-tier bundles — full directory copies. The instructor variant
# flattens its source dir name.
if variant == "public":
diff --git a/scripts/package_kaggle_release.py b/scripts/package_kaggle_release.py
index 2de9401..6fda4cd 100644
--- a/scripts/package_kaggle_release.py
+++ b/scripts/package_kaggle_release.py
@@ -66,10 +66,13 @@
# rewritten README content; its presence here is a public-symbol
# contract, not a local consumer.
from _release_common import ( # noqa: E402,F401 — must follow sys.path insert
+ AGENT_REVIEWABLE_DOCS_DIR,
+ AGENT_REVIEWABLE_ROOT_FILES,
GITHUB_BLOB_BASE,
SOURCE_TREE_BLOCK,
ValidationError,
load_manifest,
+ load_relational_column_descriptions,
replace_dir,
replace_file,
resolve_cover_image_path,
@@ -223,11 +226,15 @@ class UserSource:
.
├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier
│ ├── manifest.json # provenance + file hashes
+│ ├── metrics.json # per-tier headline metrics (medians + spreads)
│ ├── dataset_card.md # auto-rendered per-bundle card
│ ├── feature_dictionary.csv # authoritative column spec
│ ├── lead_scoring.csv # flat convenience CSV (all splits)
│ ├── tables/*.parquet # 7 snapshot-safe relational tables
│ └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── docs/ # vendored DGP / leakage / break-me docs (agent-readable)
+├── metrics.json # top-level cross-tier metrics summary
+├── claims_register.{md,json} # claims → backing-artifact map (agent-readable)
├── dataset-metadata.json # Kaggle dataset metadata
├── dataset-cover-image.png # Kaggle cover image
├── README.md # Kaggle package README
@@ -519,18 +526,41 @@ def _kaggle_type_from_arrow(dtype: pa.DataType) -> str:
return "string"
-def fields_from_parquet(path: Path) -> tuple[FieldDescriptor, ...]:
+def fields_from_parquet(
+ path: Path,
+ *,
+ column_descriptions: dict[tuple[str, str], str] | None = None,
+ table_name: str | None = None,
+) -> tuple[FieldDescriptor, ...]:
"""Read parquet schema from ``path`` and return ``FieldDescriptor`` rows.
Kaggle accepts Frictionless schemas on parquet resources too; the
parquet file's own Arrow metadata is the ground truth for column
order and types, so we read directly rather than mirroring a CSV
- header. ``description`` is omitted for parquet fields — relational
- tables don't have per-column docs in the bundle.
+ header. When the caller passes ``column_descriptions`` (loaded
+ from ``release/docs/relational_table_schemas.csv``) and a
+ ``table_name``, descriptions are attached to each field — the
+ earlier behaviour shipped empty ``col__desc`` cells in the preview
+ HTML for every relational table, which left agent reviewers without
+ per-column documentation for ``touches.touch_timestamp`` etc.
+ Tables not present in the descriptions map fall back to the prior
+ no-description shape.
"""
schema = pq.read_schema(path)
- return tuple(FieldDescriptor(name=f.name, type=_kaggle_type_from_arrow(f.type)) for f in schema)
+ fields: list[FieldDescriptor] = []
+ for f in schema:
+ description: str | None = None
+ if column_descriptions is not None and table_name is not None:
+ description = column_descriptions.get((table_name, f.name))
+ fields.append(
+ FieldDescriptor(
+ name=f.name,
+ type=_kaggle_type_from_arrow(f.type),
+ description=description,
+ )
+ )
+ return tuple(fields)
# ``_load_manifest`` is now ``load_manifest`` in ``_release_common``.
@@ -546,8 +576,10 @@ def build_tier_resources(
Order: flat CSV (with full ``schema.fields``) → feature dictionary
→ task splits (parquet, schema from Arrow) → relational tables
- (parquet, schema from Arrow) → dataset card → manifest. Kaggle
- renders this list in declared order on the dataset page.
+ (parquet, schema from Arrow, per-column descriptions from
+ ``release/docs/relational_table_schemas.csv`` when present) →
+ dataset card → per-tier metrics.json → manifest. Kaggle renders
+ this list in declared order on the dataset page.
"""
tier_dir = release_dir / tier
@@ -563,6 +595,8 @@ def build_tier_resources(
table_inventory = manifest.get("tables", {})
snapshot_day = manifest.get("snapshot_day")
+ column_descriptions = load_relational_column_descriptions(release_dir)
+
resources: list[Resource] = []
resources.append(
@@ -609,7 +643,13 @@ def build_tier_resources(
description=(
f"{tier.capitalize()} tier `{table}` relational table{suffix} — snapshot-safe."
),
- schema=ResourceSchema(fields=fields_from_parquet(table_path)),
+ schema=ResourceSchema(
+ fields=fields_from_parquet(
+ table_path,
+ column_descriptions=column_descriptions,
+ table_name=table,
+ )
+ ),
)
)
@@ -619,6 +659,16 @@ def build_tier_resources(
description=f"{tier.capitalize()} tier auto-rendered dataset card.",
)
)
+ if (tier_dir / "metrics.json").is_file():
+ resources.append(
+ Resource(
+ path=f"{tier}/metrics.json",
+ description=(
+ f"{tier.capitalize()} tier headline metrics (cross-seed medians + spreads, "
+ f"difficulty knobs, JSON-path back-reference to validation_report.json)."
+ ),
+ )
+ )
resources.append(
Resource(
path=f"{tier}/manifest.json",
@@ -631,6 +681,103 @@ def build_tier_resources(
return tuple(resources)
+# ---------------------------------------------------------------------------
+# Agent-reviewable root-level resources (docs/, claims register, metrics)
+# ---------------------------------------------------------------------------
+
+
+#: Per-vendored-doc description used in the Kaggle resources list.
+#: Same map used by both the metadata builder and the upload-tree
+#: assembler so the list of agent-reviewable files is single-sourced.
+_AGENT_DOC_DESCRIPTIONS: Final[dict[str, str]] = {
+ "docs/generation_method.md": (
+ "Generation method (DGP description) — what is and isn't modelled by the simulator."
+ ),
+ "docs/channel_signal_audit.md": (
+ "Empirical backing for the 'channel signal is weak' claim — out-of-sample univariate "
+ "AUCs of `lead_source` per tier."
+ ),
+ "docs/break_me_guide.md": (
+ "Adversarial-framing guide: nine breakage patterns (leakage, split contamination, "
+ "ranking inversions, calibration drift) with worked-example detection recipes."
+ ),
+ "docs/feature_dictionary.md": (
+ "Long-form per-feature documentation grouped by analytical role; companion to the "
+ "per-tier `feature_dictionary.csv` machine-readable spec."
+ ),
+ "docs/v1_acceptance_gates_bands.yaml": (
+ "Operational acceptance bands per gate (G5–G8); the source-of-truth thresholds the "
+ "validator checks against."
+ ),
+ "docs/v2_decision_log.md": (
+ "Accepted-for-v2 findings register — issues flagged in v1 that are scoped to the v2 "
+ "release."
+ ),
+ "docs/relational_table_schemas.csv": (
+ "Per-column descriptions for the 7 public relational tables (and the 2 "
+ "instructor-only ones) — surfaced into the schema-section of this page."
+ ),
+}
+
+
+def _agent_reviewable_resources(release_dir: Path) -> list[Resource]:
+ """Resources for the top-level agent-reviewable artifacts.
+
+ These describe the files the assembler will copy into the upload
+ root: ``metrics.json``, ``claims_register.{md,json}``,
+ ``claims_register_source.yaml`` (when present), and every file
+ under ``docs/``. Skipping files that don't exist on disk keeps
+ the metadata in sync with whatever the maintainer actually
+ assembled — running the script on a freshly-cloned checkout
+ won't pretend that ungenerated files will appear in the upload.
+ """
+
+ resources: list[Resource] = []
+
+ if (release_dir / "metrics.json").is_file():
+ resources.append(
+ Resource(
+ path="metrics.json",
+ description=(
+ "Top-level cross-tier headline metrics (medians + spreads + cohort-shift "
+ "+ cross-tier ordering booleans). Machine-readable summary backing the "
+ "README's Calibration table."
+ ),
+ )
+ )
+
+ for filename in ("claims_register.md", "claims_register.json", "claims_register_source.yaml"):
+ if (release_dir / filename).is_file():
+ if filename.endswith(".json"):
+ desc = (
+ "Claims register (machine-readable). Each numerical / structural claim in "
+ "the README paired with its backing artifact and JSON / YAML path."
+ )
+ elif filename.endswith(".md"):
+ desc = (
+ "Claims register (human-readable table). Rendered from "
+ "`claims_register_source.yaml`."
+ )
+ else:
+ desc = (
+ "Claims-register source YAML — hand-edited; `claims_register.{md,json}` "
+ "are rendered from this."
+ )
+ resources.append(Resource(path=filename, description=desc))
+
+ docs_dir = release_dir / AGENT_REVIEWABLE_DOCS_DIR
+ if docs_dir.is_dir():
+ for filename in sorted(p.name for p in docs_dir.iterdir() if p.is_file()):
+ rel = f"{AGENT_REVIEWABLE_DOCS_DIR}/{filename}"
+ description = _AGENT_DOC_DESCRIPTIONS.get(
+ rel,
+ f"Vendored release doc ({filename}).",
+ )
+ resources.append(Resource(path=rel, description=description))
+
+ return resources
+
+
def build_metadata(
release_dir: Path,
*,
@@ -662,6 +809,7 @@ def build_metadata(
resources: list[Resource] = []
for tier in tiers:
resources.extend(build_tier_resources(release_dir, tier, task=task))
+ resources.extend(_agent_reviewable_resources(release_dir))
return DatasetMetadata(
title=title,
@@ -817,6 +965,20 @@ def assemble_upload_dir(
encoding="utf-8",
)
+ # Agent-reviewable root files (metrics.json, claims_register.{md,json},
+ # claims_register_source.yaml) — straight copies of committed artifacts
+ # that ride along so the published bundle is self-verifiable without GitHub access.
+ for rel, _required in AGENT_REVIEWABLE_ROOT_FILES:
+ src = release_dir / rel
+ if src.is_file():
+ replace_file(src, kaggle_dir / rel)
+
+ # Vendored docs (release/docs/) — full directory copy, mirrors how
+ # we treat per-tier bundle dirs.
+ docs_src = release_dir / AGENT_REVIEWABLE_DOCS_DIR
+ if docs_src.is_dir():
+ replace_dir(docs_src, kaggle_dir / AGENT_REVIEWABLE_DOCS_DIR)
+
# Per-tier bundles — full directory copies.
for tier in tiers:
tier_src = release_dir / tier
diff --git a/scripts/preview_hf_page.py b/scripts/preview_hf_page.py
index 91b5448..5f55f0a 100644
--- a/scripts/preview_hf_page.py
+++ b/scripts/preview_hf_page.py
@@ -45,6 +45,7 @@
escape,
plural,
render_cover,
+ render_jsonld_dataset,
serve,
)
from _release_common import replace_file # noqa: E402
@@ -255,13 +256,14 @@ def _render_footer(frontmatter: dict[str, Any], variant: str) -> str:
"""
-def _wrap_html(*, title: str, body: str) -> str:
+def _wrap_html(*, title: str, body: str, jsonld: str) -> str:
return f"""
HF preview — {escape(title)}
+ {jsonld}
@@ -272,6 +274,57 @@ def _wrap_html(*, title: str, body: str) -> str:
"""
+#: SPDX-style URL for MIT — matches the constant on the Kaggle preview.
+_LICENSE_URL_MIT: Final[str] = "https://opensource.org/licenses/MIT"
+
+
+def _jsonld_for_hf(frontmatter: dict[str, Any], variant: str) -> str:
+    """Render the schema.org ``Dataset`` JSON-LD block for an HF preview.
+
+    ``pretty_name``, ``tags`` and ``configs`` come from the YAML
+    frontmatter; the license is always ``_LICENSE_URL_MIT``.
+    ``distribution`` holds at most the first 12 truthy ``path`` values
+    found in the ``data_files`` entries under ``configs``, in
+    declaration order. *variant* is accepted but not read.
+    """
+
+    data_file_paths = [
+        str(entry.get("path"))
+        for config in frontmatter.get("configs", []) or []
+        for entry in config.get("data_files", []) or []
+        if entry.get("path")
+    ]
+
+    # Description is variant-agnostic on purpose — including the
+    # variant token here would diverge the JSON-LD between public /
+    # instructor renderings, breaking the variant-localisation
+    # invariant the regression suite asserts. Variant is implied by
+    # the distribution_paths and the page footer.
+    return render_jsonld_dataset(
+        name=str(frontmatter.get("pretty_name", "")),
+        description="Hugging Face preview of leadforge-lead-scoring-v1.",
+        license_url=_LICENSE_URL_MIT,
+        keywords=list(frontmatter.get("tags", []) or []),
+        citation=(
+            "Generated by leadforge (https://github.com/leadforge-dev/leadforge); "
+            "recipe b2b_saas_procurement_v1, seed 42."
+        ),
+        distribution_paths=data_file_paths[:12],
+        same_as=[
+            "https://github.com/leadforge-dev/leadforge",
+            "https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1",
+        ],
+        creator="leadforge",
+        version="v1",
+    )
+
+
# ---------------------------------------------------------------------------
# Top-level renderer
# ---------------------------------------------------------------------------
@@ -311,6 +364,7 @@ def render_hf_html(
return _wrap_html(
title=str(doc.frontmatter.get("pretty_name", "")),
body="\n".join(p for p in body_parts if p),
+ jsonld=_jsonld_for_hf(doc.frontmatter, variant),
)
diff --git a/scripts/preview_kaggle_page.py b/scripts/preview_kaggle_page.py
index de5a61b..349ceb2 100644
--- a/scripts/preview_kaggle_page.py
+++ b/scripts/preview_kaggle_page.py
@@ -41,6 +41,7 @@
escape,
plural,
render_cover,
+ render_jsonld_dataset,
serve,
)
from _release_common import replace_file # noqa: E402
@@ -284,13 +285,14 @@ def _render_footer(metadata: dict[str, Any]) -> str:
"""
-def _wrap_html(*, title: str, body: str) -> str:
+def _wrap_html(*, title: str, body: str, jsonld: str) -> str:
return f"""
Kaggle preview — {escape(title)}
+ {jsonld}
@@ -301,6 +303,48 @@ def _wrap_html(*, title: str, body: str) -> str:
"""
+#: SPDX-style URL for MIT (schema.org ``license`` is a URL, not the
+#: SPDX short name). Kept here so a future relicensing PR only has
+#: to flip one constant per preview script.
+_LICENSE_URL_MIT: Final[str] = "https://opensource.org/licenses/MIT"
+
+
+def _jsonld_for_kaggle(metadata: dict[str, Any]) -> str:
+    """Build the schema.org ``Dataset`` JSON-LD block for Kaggle.
+
+    Reads ``title``, ``subtitle``, ``keywords``, ``userSpecifiedSources``
+    and ``resources`` from the Kaggle metadata. The license URL is pinned
+    to ``_LICENSE_URL_MIT`` rather than taken from the metadata, because
+    schema.org wants a URL there. ``distribution`` is a short
+    representative list (at most the first 12 resource paths) so an agent
+    can see the bundle's shape without enumerating every file.
+
+    A key that is absent or explicitly ``null`` is treated as empty, so
+    partially filled metadata still renders instead of raising.
+    """
+
+    # ``or []`` (not a ``.get`` default) so an explicit ``null`` value is
+    # handled the same way as a missing key for all three list fields.
+    keywords = list(metadata.get("keywords") or [])
+    sources = metadata.get("userSpecifiedSources") or []
+    same_as = [s["url"] for s in sources if isinstance(s, dict) and s.get("url")]
+
+    resources = metadata.get("resources") or []
+    representative_paths = [
+        r["path"] for r in resources if isinstance(r, dict) and r.get("path")
+    ][:12]
+
+    return render_jsonld_dataset(
+        name=str(metadata.get("title", "")),
+        description=str(metadata.get("subtitle", "")),
+        license_url=_LICENSE_URL_MIT,
+        keywords=keywords,
+        citation=(
+            "Generated by leadforge (https://github.com/leadforge-dev/leadforge); "
+            "recipe b2b_saas_procurement_v1, seed 42."
+        ),
+        distribution_paths=representative_paths,
+        same_as=same_as,
+        creator="leadforge",
+        version="v1",
+    )
+
+
# ---------------------------------------------------------------------------
# Top-level renderer
# ---------------------------------------------------------------------------
@@ -322,7 +366,11 @@ def render_kaggle_html(metadata: dict[str, Any], cover_image_filename: str) -> s
_render_sources(metadata),
_render_footer(metadata),
]
- return _wrap_html(title=metadata.get("title", ""), body="\n".join(p for p in body_parts if p))
+ return _wrap_html(
+ title=metadata.get("title", ""),
+ body="\n".join(p for p in body_parts if p),
+ jsonld=_jsonld_for_kaggle(metadata),
+ )
# ---------------------------------------------------------------------------
diff --git a/scripts/sync_release_docs.py b/scripts/sync_release_docs.py
new file mode 100644
index 0000000..9321c22
--- /dev/null
+++ b/scripts/sync_release_docs.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""Sync the agent-reviewable docs vendored under ``release/docs/``.
+
+The Kaggle and HuggingFace mock pages link to documentation that lives
+under ``docs/release/`` in the source repo. An AI agent that lands on
+the published bundle (or the mock preview) without web access cannot
+follow those ``github.com/blob/main/...`` links, so the release-time
+claims become unverifiable.
+
+This script copies the canonical set of supporting docs into
+``release/docs/`` so the published bundle is self-contained and the
+mock previews render against the same files an agent would read on
+Kaggle / HuggingFace. The sync is idempotent: same inputs produce
+byte-identical outputs. CI runs ``--check`` to fail when the source
+docs drift from the vendored copies.
+
+Inputs (all under ``docs/release/``):
+
+* ``generation_method.md`` — what is / isn't modelled by the DGP.
+* ``channel_signal_audit.md`` — backing data for the "channel signal
+ is weak" claim in the README.
+* ``break_me_guide.md`` — nine adversarial patterns + how to detect
+ them.
+* ``feature_dictionary.md`` — long-form per-feature documentation.
+* ``v1_acceptance_gates_bands.yaml`` — operational band thresholds.
+* ``v2_decision_log.md`` — accepted-for-v2 findings register.
+
+``release/docs/relational_table_schemas.csv`` is hand-authored (per
+column docs for relational tables); validated against the live parquet
+schemas, not copied from a source doc.
+
+Exit codes: 0 success / 1 ``--check`` mode and copies are stale /
+2 pre-flight error (source doc missing).
+"""
+
+from __future__ import annotations
+
+import argparse
+import shutil
+import sys
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Final
+
+REPO_ROOT: Final[Path] = Path(__file__).resolve().parent.parent
+
+#: ``(source, destination)`` pairs, both relative to the repo root.
+#: Every doc keeps its basename: it is read from ``docs/release/`` and
+#: vendored into ``release/docs/``. Basenames are listed alphabetically
+#: so stderr output is deterministic.
+VENDORED_DOCS: Final[tuple[tuple[Path, Path], ...]] = tuple(
+    (Path("docs/release") / doc_name, Path("release/docs") / doc_name)
+    for doc_name in (
+        "break_me_guide.md",
+        "channel_signal_audit.md",
+        "feature_dictionary.md",
+        "generation_method.md",
+        "v1_acceptance_gates_bands.yaml",
+        "v2_decision_log.md",
+    )
+)
+
+
+def _bytes(path: Path) -> bytes:
+    """Return the raw byte content of *path*."""
+    with path.open("rb") as handle:
+        return handle.read()
+
+
+def sync_docs(repo_root: Path, *, check_only: bool) -> tuple[list[Path], list[Path]]:
+    """Sync the vendored docs, writing nothing if any source is missing.
+
+    Args:
+        repo_root: Directory the ``VENDORED_DOCS`` paths are relative to.
+        check_only: When true, only report; never touch the destinations.
+
+    Returns:
+        ``(stale, missing_sources)``. ``stale`` lists the destination
+        paths that are absent or whose bytes differ from their source.
+        ``missing_sources`` lists the declared source paths that do not
+        exist. Both lists hold repo-relative paths in ``VENDORED_DOCS``
+        order.
+
+    Stale destinations are overwritten only when ``check_only`` is false
+    and ``missing_sources`` is empty.
+    """
+
+    stale: list[Path] = []
+    missing_sources: list[Path] = []
+    pending: list[tuple[Path, Path]] = []
+    for src_rel, dst_rel in VENDORED_DOCS:
+        src = repo_root / src_rel
+        dst = repo_root / dst_rel
+        if not src.is_file():
+            missing_sources.append(src_rel)
+            continue
+        if not dst.is_file() or _bytes(dst) != _bytes(src):
+            stale.append(dst_rel)
+            pending.append((src, dst))
+
+    # A missing source is reported by ``main`` as a pre-flight error, so it
+    # must not leave release/docs/ half-refreshed: decide first, copy after.
+    if check_only or missing_sources:
+        return stale, missing_sources
+
+    for src, dst in pending:
+        dst.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(src, dst)
+    return stale, missing_sources
+
+
+def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    """Parse the command line; ``--check`` is the only option defined."""
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__,
+        prog="sync_release_docs",
+    )
+    check_help = "report stale copies as an exit-code-1 failure without overwriting (CI use)"
+    cli.add_argument("--check", action="store_true", help=check_help)
+    return cli.parse_args(argv)
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Run the sync and return the process exit code.
+
+    Returns 2 when a source doc is missing, 1 when ``--check`` finds
+    stale copies, and 0 otherwise. All messages go to stderr.
+    """
+
+    def report(message: str) -> None:
+        print(message, file=sys.stderr)
+
+    args = parse_args(argv)
+    stale, missing = sync_docs(REPO_ROOT, check_only=args.check)
+
+    if missing:
+        report("error: source docs missing:")
+        for path in missing:
+            report(f" - {path}")
+        return 2
+
+    if not args.check:
+        if not stale:
+            report("release/docs/ is already up to date.")
+        for path in stale:
+            report(f"updated {path}")
+        return 0
+
+    if not stale:
+        report("release/docs/ is up to date.")
+        return 0
+
+    report("error: release/docs/ is stale:")
+    for path in stale:
+        report(f" - {path}")
+    report("run `python scripts/sync_release_docs.py` to refresh them.")
+    return 1
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/tests/scripts/test_build_claims_register.py b/tests/scripts/test_build_claims_register.py
new file mode 100644
index 0000000..1db0443
--- /dev/null
+++ b/tests/scripts/test_build_claims_register.py
@@ -0,0 +1,180 @@
+"""Tests for ``scripts/build_claims_register.py``."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+from pathlib import Path
+from types import ModuleType
+
+import pytest
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+_SCRIPT = _REPO_ROOT / "scripts" / "build_claims_register.py"
+
+
+def _load_module() -> ModuleType:
+    """Import ``build_claims_register`` from ``_SCRIPT`` as a fresh module object."""
+    spec = importlib.util.spec_from_file_location("build_claims_register", _SCRIPT)
+    assert spec is not None
+    loader = spec.loader
+    assert loader is not None
+    loaded = importlib.util.module_from_spec(spec)
+    loader.exec_module(loaded)
+    return loaded
+
+
+def _minimal_claims_yaml() -> str:
+ # Default fixture: a claims source with two claims, ``a01`` (category
+ # ``composition``) and ``a02`` (category ``calibration``).
+ return """\
+claims:
+ - id: a01
+ text: Composition claim.
+ category: composition
+ backing_artifact: release//manifest.json
+ backing_path: $.n_leads
+ verifier: leadforge validate
+ - id: a02
+ text: Calibration claim.
+ category: calibration
+ backing_artifact: release/metrics.json
+ backing_path: $.tiers..medians.lr_auc
+ verifier: scripts/validate_release_candidate.py
+"""
+
+
+def _write_source(tmp_path: Path, text: str | None = None) -> tuple[Path, Path]:
+    """Create ``tmp_path / "release"`` and write a claims source file into it.
+
+    *text* is written verbatim. Only ``None`` selects the default
+    fixture from ``_minimal_claims_yaml``, so an explicit empty string
+    produces an empty source file rather than the default.
+
+    Returns ``(release_dir, source_path)``.
+    """
+    release_dir = tmp_path / "release"
+    release_dir.mkdir()
+    source = release_dir / "claims_register_source.yaml"
+    content = _minimal_claims_yaml() if text is None else text
+    source.write_text(content, encoding="utf-8")
+    return release_dir, source
+
+
+def test_renders_both_files(tmp_path: Path) -> None:
+    """A non-check run writes both the JSON and the Markdown register."""
+    builder = _load_module()
+    release_dir, source = _write_source(tmp_path)
+    builder.write_register(release_dir, source, check_only=False)
+    for rendered in ("claims_register.json", "claims_register.md"):
+        assert (release_dir / rendered).is_file()
+
+
+def test_json_payload_includes_schema_block(tmp_path: Path) -> None:
+    """The JSON register has ``schema`` and ``claims`` keys, claims in source order."""
+    builder = _load_module()
+    release_dir, source = _write_source(tmp_path)
+    builder.write_register(release_dir, source, check_only=False)
+    raw = (release_dir / "claims_register.json").read_text(encoding="utf-8")
+    payload = json.loads(raw)
+    assert "schema" in payload
+    assert "claims" in payload
+    claims = payload["claims"]
+    assert len(claims) == 2
+    assert claims[0]["id"] == "a01"
+
+
+def test_markdown_groups_claims_by_category(tmp_path: Path) -> None:
+    """The Markdown register has one heading per category plus the claim text."""
+    builder = _load_module()
+    release_dir, source = _write_source(tmp_path)
+    builder.write_register(release_dir, source, check_only=False)
+    rendered = (release_dir / "claims_register.md").read_text(encoding="utf-8")
+    for heading in ("## calibration", "## composition"):
+        assert heading in rendered
+    # Claim text is present, escaped or not.
+    assert "Composition claim." in rendered
+
+
+def test_idempotent_writes(tmp_path: Path) -> None:
+    """A second render over unchanged input reports nothing stale."""
+    builder = _load_module()
+    release_dir, source = _write_source(tmp_path)
+    builder.write_register(release_dir, source, check_only=False)
+    second_pass = builder.write_register(release_dir, source, check_only=False)
+    assert second_pass == []
+
+
+def test_check_mode_flags_drift(tmp_path: Path) -> None:
+    """Check mode reports drift for unrendered outputs and writes nothing."""
+    builder = _load_module()
+    release_dir, source = _write_source(tmp_path)
+    drift = builder.write_register(release_dir, source, check_only=True)
+    assert drift
+    assert not (release_dir / "claims_register.json").is_file()
+
+
+def test_missing_required_keys_rejected(tmp_path: Path) -> None:
+ # A claim with no ``text`` key must make ``write_register`` raise
+ # ``ValueError`` mentioning "missing required key".
+ mod = _load_module()
+ bad_yaml = """\
+claims:
+ - id: missing_text
+ category: composition
+ backing_artifact: x
+ backing_path: y
+ verifier: z
+"""
+ release_dir, source = _write_source(tmp_path, bad_yaml)
+ with pytest.raises(ValueError, match="missing required key"):
+ mod.write_register(release_dir, source, check_only=False)
+
+
+def test_duplicate_ids_rejected(tmp_path: Path) -> None:
+ # Two claims sharing the id ``dup`` must make ``write_register`` raise
+ # ``ValueError`` mentioning "duplicate claim id".
+ mod = _load_module()
+ bad_yaml = """\
+claims:
+ - id: dup
+ text: a
+ category: composition
+ backing_artifact: x
+ backing_path: y
+ verifier: z
+ - id: dup
+ text: b
+ category: composition
+ backing_artifact: x
+ backing_path: y
+ verifier: z
+"""
+ release_dir, source = _write_source(tmp_path, bad_yaml)
+ with pytest.raises(ValueError, match="duplicate claim id"):
+ mod.write_register(release_dir, source, check_only=False)
+
+
+def test_invalid_category_rejected(tmp_path: Path) -> None:
+ # A claim whose category is ``not_in_vocab`` must make
+ # ``write_register`` raise ``ValueError`` whose message matches "not in".
+ mod = _load_module()
+ bad_yaml = """\
+claims:
+ - id: x01
+ text: bad category
+ category: not_in_vocab
+ backing_artifact: x
+ backing_path: y
+ verifier: z
+"""
+ release_dir, source = _write_source(tmp_path, bad_yaml)
+ with pytest.raises(ValueError, match="not in"):
+ mod.write_register(release_dir, source, check_only=False)
+
+
+def test_missing_source_raises(tmp_path: Path) -> None:
+    """Pointing at a source file that does not exist raises ``FileNotFoundError``."""
+    builder = _load_module()
+    absent_source = tmp_path / "nope.yaml"
+    with pytest.raises(FileNotFoundError):
+        builder.write_register(tmp_path, absent_source, check_only=False)
+
+
+def test_committed_claims_register_is_in_sync() -> None:
+    """The repo's committed ``release/claims_register.{md,json}`` match
+    what ``claims_register_source.yaml`` renders to (skipped when the
+    source file is not on this checkout)."""
+
+    builder = _load_module()
+    release_dir = _REPO_ROOT / "release"
+    source = release_dir / "claims_register_source.yaml"
+    if not source.is_file():
+        pytest.skip("claims_register_source.yaml missing on this checkout")
+    drift = builder.write_register(release_dir, source, check_only=True)
+    assert drift == [], f"claims register drift: {drift}"
+
+
+def test_every_categories_token_is_in_valid_set() -> None:
+    """Every category used in the committed source file belongs to
+    ``VALID_CATEGORIES`` (guards against a contributor inventing a
+    category without registering it)."""
+
+    builder = _load_module()
+    source = _REPO_ROOT / "release" / "claims_register_source.yaml"
+    if not source.is_file():
+        pytest.skip("claims_register_source.yaml missing on this checkout")
+    for claim in builder.load_claims(source):
+        assert claim["category"] in builder.VALID_CATEGORIES
diff --git a/tests/scripts/test_build_release_metrics.py b/tests/scripts/test_build_release_metrics.py
new file mode 100644
index 0000000..1b2cc72
--- /dev/null
+++ b/tests/scripts/test_build_release_metrics.py
@@ -0,0 +1,180 @@
+"""Tests for ``scripts/build_release_metrics.py``."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+from pathlib import Path
+from types import ModuleType
+
+import pytest
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+_SCRIPT = _REPO_ROOT / "scripts" / "build_release_metrics.py"
+
+
+def _load_module() -> ModuleType:
+    """Import ``build_release_metrics`` from ``_SCRIPT`` as a fresh module object."""
+    spec = importlib.util.spec_from_file_location("build_release_metrics", _SCRIPT)
+    assert spec is not None
+    loader = spec.loader
+    assert loader is not None
+    loaded = importlib.util.module_from_spec(spec)
+    loader.exec_module(loaded)
+    return loaded
+
+
+def _minimal_report() -> dict:
+ """Hand-rolled validation_report.json with the keys the script reads."""
+
+ return {
+ "release_id": "leadforge-lead-scoring-v1",
+ "package_version": "1.0.0",
+ "generation_timestamp": "2026-05-06T07:38:31+00:00",
+ "seeds": [42, 43, 44, 45, 46],
+ "tiers": {
+ # Only ``intro`` carries the long list of medians; the other two
+ # tiers provide just ``lr_auc`` and ``lr_average_precision``.
+ "intro": {
+ "medians": {
+ "lr_auc": 0.879,
+ "lr_average_precision": 0.761,
+ "brier_score": 0.130,
+ "conversion_rate_test": 0.427,
+ "gbm_auc": 0.873,
+ "gbm_minus_lr_auc": -0.0045,
+ "log_loss": 0.4,
+ "calibration_max_bin_error": 0.25,
+ "gbm_average_precision": 0.75,
+ "top_decile_rate": 0.77,
+ },
+ "spreads": {
+ "lr_auc": 0.027,
+ "conversion_rate_test": 0.092,
+ },
+ "seeds": [42, 43, 44, 45, 46],
+ # One row per seed 42..46, all with the same precision-at-100 value.
+ "per_seed": [{"seed": s, "precision_at_k": {"100": 0.80}} for s in range(42, 47)],
+ },
+ "intermediate": {
+ "medians": {"lr_auc": 0.886, "lr_average_precision": 0.575},
+ "spreads": {"lr_auc": 0.023},
+ "seeds": [42, 43, 44, 45, 46],
+ "per_seed": [{"seed": s, "precision_at_k": {"100": 0.59}} for s in range(42, 47)],
+ },
+ "advanced": {
+ "medians": {"lr_auc": 0.886, "lr_average_precision": 0.351},
+ "spreads": {"lr_auc": 0.040},
+ "seeds": [42, 43, 44, 45, 46],
+ "per_seed": [{"seed": s, "precision_at_k": {"100": 0.34}} for s in range(42, 47)],
+ },
+ },
+ # Cohort-shift figures are supplied for the ``intro`` tier only.
+ "cohort_shift": {
+ "intro": {
+ "random_split_auc": 0.873,
+ "cohort_split_auc": 0.857,
+ "auc_degradation": 0.016,
+ "seed": 42,
+ },
+ },
+ "cross_tier_ordering": {
+ "by_conversion_rate": ["intro", "intermediate", "advanced"],
+ "by_average_precision": ["intro", "intermediate", "advanced"],
+ },
+ }
+
+
+def _write_minimal_release(tmp_path: Path) -> tuple[Path, Path]:
+    """Lay out a throwaway release tree under *tmp_path*.
+
+    Creates ``release/validation/validation_report.json`` (holding the
+    minimal report) and one empty directory per tier. Returns
+    ``(release_dir, report_path)``.
+    """
+    release_dir = tmp_path / "release"
+    validation_dir = release_dir / "validation"
+    validation_dir.mkdir(parents=True)
+    report_path = validation_dir / "validation_report.json"
+    report_path.write_text(json.dumps(_minimal_report()), encoding="utf-8")
+    for tier in ("intro", "intermediate", "advanced"):
+        (release_dir / tier).mkdir()
+    return release_dir, report_path
+
+
+def test_top_level_payload_contains_expected_keys(tmp_path: Path) -> None:
+    """The returned top-level payload mirrors the report's headline fields."""
+    builder = _load_module()
+    release_dir, report_path = _write_minimal_release(tmp_path)
+    _stale, payload = builder.write_metrics(release_dir, report_path, check_only=False)
+    assert "tiers" in payload
+    assert set(payload["tiers"]) == {"intro", "intermediate", "advanced"}
+    assert payload["release_id"] == "leadforge-lead-scoring-v1"
+    assert payload["seeds"] == [42, 43, 44, 45, 46]
+    assert payload["cohort_shift"]["intro"]["auc_degradation"] == 0.016
+
+
+def test_per_tier_files_written_when_dir_exists(tmp_path: Path) -> None:
+    """Each existing tier directory receives its own ``metrics.json``."""
+    builder = _load_module()
+    release_dir, report_path = _write_minimal_release(tmp_path)
+    builder.write_metrics(release_dir, report_path, check_only=False)
+    for tier in ("intro", "intermediate", "advanced"):
+        tier_metrics = release_dir / tier / "metrics.json"
+        assert tier_metrics.is_file()
+        payload = json.loads(tier_metrics.read_text(encoding="utf-8"))
+        assert payload["tier"] == tier
+        assert payload["medians"]["lr_auc"] is not None
+        assert payload["source_of_truth"]["file"] == "release/validation/validation_report.json"
+
+
+def test_precision_at_100_median_attached_to_per_tier_metrics(tmp_path: Path) -> None:
+    """The intro tier's medians gain a ``precision_at_100`` entry."""
+    builder = _load_module()
+    release_dir, report_path = _write_minimal_release(tmp_path)
+    builder.write_metrics(release_dir, report_path, check_only=False)
+    intro_text = (release_dir / "intro" / "metrics.json").read_text(encoding="utf-8")
+    intro_payload = json.loads(intro_text)
+    assert intro_payload["medians"]["precision_at_100"] == 0.80
+
+
+def test_idempotent_writes(tmp_path: Path) -> None:
+    """A second write over unchanged input reports nothing stale."""
+    builder = _load_module()
+    release_dir, report_path = _write_minimal_release(tmp_path)
+    builder.write_metrics(release_dir, report_path, check_only=False)
+    second_pass, _payload = builder.write_metrics(release_dir, report_path, check_only=False)
+    assert second_pass == []
+
+
+def test_check_mode_flags_drift_on_missing_files(tmp_path: Path) -> None:
+    """Check mode reports drift for unwritten outputs and creates no file."""
+    builder = _load_module()
+    release_dir, report_path = _write_minimal_release(tmp_path)
+    drift, _payload = builder.write_metrics(release_dir, report_path, check_only=True)
+    assert drift  # nothing written yet
+    assert not (release_dir / "metrics.json").is_file()
+
+
+def test_skips_tier_dir_when_absent(tmp_path: Path) -> None:
+    """With no per-tier bundle directories present, only the top-level
+    ``metrics.json`` is written and the call does not raise."""
+
+    tiers = ("intro", "intermediate", "advanced")
+    builder = _load_module()
+    release_dir, report_path = _write_minimal_release(tmp_path)
+    # Remove the bundle dirs so only the top-level path can be written.
+    for tier in tiers:
+        (release_dir / tier).rmdir()
+    _stale, _payload = builder.write_metrics(release_dir, report_path, check_only=False)
+    assert (release_dir / "metrics.json").is_file()
+    for tier in tiers:
+        assert not (release_dir / tier / "metrics.json").is_file()
+
+
+def test_missing_report_raises(tmp_path: Path) -> None:
+    """Pointing at a report that does not exist raises ``FileNotFoundError``."""
+    builder = _load_module()
+    absent_report = tmp_path / "no.json"
+    with pytest.raises(FileNotFoundError):
+        builder.write_metrics(tmp_path, absent_report, check_only=False)
+
+
+def test_non_object_report_raises(tmp_path: Path) -> None:
+    """A report whose top-level JSON value is a list is rejected with ``ValueError``."""
+    builder = _load_module()
+    list_report = tmp_path / "validation_report.json"
+    list_report.write_text("[]", encoding="utf-8")
+    with pytest.raises(ValueError, match="not a JSON object"):
+        builder.write_metrics(tmp_path, list_report, check_only=False)
+
+
+def test_committed_release_metrics_match_validation_report() -> None:
+    """The repo's committed ``release/metrics.json`` matches what
+    ``release/validation/validation_report.json`` renders to (skipped
+    when the report is not on this checkout)."""
+
+    builder = _load_module()
+    release_dir = _REPO_ROOT / "release"
+    report_path = release_dir / "validation" / "validation_report.json"
+    if not report_path.is_file():
+        pytest.skip("validation_report.json missing on this checkout")
+    drift, _payload = builder.write_metrics(release_dir, report_path, check_only=True)
+    assert drift == [], f"metrics drift: {drift}"
diff --git a/tests/scripts/test_package_kaggle_release.py b/tests/scripts/test_package_kaggle_release.py
index 8140c23..ec45f0b 100644
--- a/tests/scripts/test_package_kaggle_release.py
+++ b/tests/scripts/test_package_kaggle_release.py
@@ -609,3 +609,26 @@ def test_committed_kaggle_metadata_matches_fresh_regeneration(tmp_path: Path) ->
for r in flat_csvs:
assert r["schema"]["fields"][0]["name"] == "split"
assert r["schema"]["fields"][-1]["name"] == "converted_within_90_days"
+
+ # Per-relational-table parquet resources now carry per-column
+ # descriptions sourced from release/docs/relational_table_schemas.csv
+ # — the preview's col__desc cells were previously empty for these.
+ touches_resources = [
+ r for r in parsed["resources"] if r["path"].endswith("/tables/touches.parquet")
+ ]
+ assert len(touches_resources) == len(packager.DEFAULT_TIERS)
+ for r in touches_resources:
+ for fd in r["schema"]["fields"]:
+ assert fd.get("description"), f"touches.{fd['name']} missing description"
+
+ # Agent-reviewable root resources land on the published file list.
+ paths = {r["path"] for r in parsed["resources"]}
+ assert "metrics.json" in paths
+ assert "claims_register.md" in paths
+ assert "claims_register.json" in paths
+ assert "docs/break_me_guide.md" in paths
+ assert "docs/v1_acceptance_gates_bands.yaml" in paths
+ assert "docs/relational_table_schemas.csv" in paths
+ # Per-tier metrics.json is also enumerated.
+ for tier in packager.DEFAULT_TIERS:
+ assert f"{tier}/metrics.json" in paths
diff --git a/tests/scripts/test_preview_hf_page.py b/tests/scripts/test_preview_hf_page.py
index 7369e9b..4f7301a 100644
--- a/tests/scripts/test_preview_hf_page.py
+++ b/tests/scripts/test_preview_hf_page.py
@@ -258,6 +258,33 @@ def test_render_escapes_html_in_field_values() -> None:
assert "<script>x</script>" in html
+def test_render_emits_jsonld_dataset_block() -> None:
+ """schema.org Dataset JSON-LD lands in the ``<head>`` for agent ingestion."""
+
+ html = preview.render_hf_html(_minimal_doc(), variant="public")
+ assert '', re.DOTALL)
+ pub_match = block_re.search(public)
+ inst_match = block_re.search(instructor)
+ assert pub_match is not None
+ assert inst_match is not None
+ assert pub_match.group(1) == inst_match.group(1)
+
+
# ---------------------------------------------------------------------------
# Markdown link resolution (the leakage / link-rewrite regression guard)
# ---------------------------------------------------------------------------
diff --git a/tests/scripts/test_preview_kaggle_page.py b/tests/scripts/test_preview_kaggle_page.py
index e056a7b..428fd6d 100644
--- a/tests/scripts/test_preview_kaggle_page.py
+++ b/tests/scripts/test_preview_kaggle_page.py
@@ -184,6 +184,26 @@ def test_render_escapes_html_in_field_values() -> None:
assert "<script>" in html
+def test_render_emits_jsonld_dataset_block() -> None:
+ """A schema.org ``Dataset`` JSON-LD block lands in the ``<head>``
+ so agent reviewers can read structured metadata without parsing the
+ bespoke tables further down the page."""
+
+ html = preview.render_kaggle_html(_minimal_metadata(), "dataset-cover-image.png")
+ assert '