From 8e569cee0f3f299dba8745fa1c2755de3f495f0f Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Tue, 5 May 2026 07:34:53 +0300
Subject: [PATCH 1/4] feat: windowed snapshot for student_public bundles
 (closes #57)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `snapshot_day` as a recipe-level config knob and threads it through
the public API.  When set, event-aggregate features (touch_count,
session_count, expected_acv, days_since_last_touch, ...) only see events
in [lead_created_at, lead_created_at + snapshot_day], so they cannot
encode post-conversion data.  total_touches_all keeps full-horizon counts
as the deliberate pedagogical leakage trap, so the gap between the two
columns now carries real signal instead of being structurally zero.

The b2b_saas_procurement_v1 recipe pins snapshot_day=30, picked from
measurements at seed=42, n_leads=5000.  Conversion rates are invariant
to snapshot_day (the label is event-derived from label_window_days), so
intro/intermediate/advanced rates stay at 0.422/0.210/0.079 — well
inside the declared difficulty_profiles ranges.  Day 30 keeps LR AUC
in the 0.85-0.86 band (challenging but modelable) while preserving a
trap gap of ~3.0 touches with 54-77% of leads showing any gap.

BUNDLE_SCHEMA_VERSION bumped 3 → 4.  The published column SET is
unchanged, but column VALUES are no longer full-horizon, which v3
consumers cannot detect from schema alone.  manifest.snapshot_day is
recorded so the contract is self-describing (null = legacy full-horizon).

The schema contract test is renamed to test_bundle_schema_v4_contract.py
and gains a new assertion that manifest.snapshot_day == 30.
A new test_windowed_bundle_trap.py guards the pedagogical invariant:
total_touches_all >= touch_count for every lead and > for at least some.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 leadforge/api/bundle.py                       |  1 +
 leadforge/api/generator.py                    |  6 ++
 leadforge/api/recipes.py                      | 21 +++++
 leadforge/core/models.py                      | 14 +++
 .../b2b_saas_procurement_v1/recipe.yaml       |  6 ++
 leadforge/render/manifests.py                 | 11 ++-
 ...t.py => test_bundle_schema_v4_contract.py} | 88 +++++++++++--------
 tests/render/test_windowed_bundle_trap.py     | 69 +++++++++++++++
 8 files changed, 180 insertions(+), 36 deletions(-)
 rename tests/render/{test_bundle_schema_v3_contract.py => test_bundle_schema_v4_contract.py} (60%)
 create mode 100644 tests/render/test_windowed_bundle_trap.py

diff --git a/leadforge/api/bundle.py b/leadforge/api/bundle.py
index 4f2d1bb..4065bc9 100644
--- a/leadforge/api/bundle.py
+++ b/leadforge/api/bundle.py
@@ -94,6 +94,7 @@ def write_bundle(
         result,
         population,
         horizon_days=config.horizon_days,
+        snapshot_day=config.snapshot_day,
         difficulty_params=config.difficulty_params,
         seed=config.seed,
     )
diff --git a/leadforge/api/generator.py b/leadforge/api/generator.py
index f2da598..b849109 100644
--- a/leadforge/api/generator.py
+++ b/leadforge/api/generator.py
@@ -54,6 +54,7 @@ def from_recipe(
         horizon_days: int | None = None,
         primary_task: str | None = None,
         label_window_days: int | None = None,
+        snapshot_day: int | None = None,
         output_path: str = _MISSING,  # type: ignore[assignment]
         override: dict[str, Any] | None = None,
     ) -> Generator:
@@ -76,6 +77,10 @@ def from_recipe(
                 directory name and manifest key.
             label_window_days: Override recipe default label observation
                 window in days.
+            snapshot_day: Override recipe default snapshot day for windowed
+                feature aggregation.  ``None`` means full-horizon (legacy)
+                aggregation; an integer ``N`` means features aggregate only
+                events with ``timestamp <= lead_created_at + N days``.
             output_path: Directory where the bundle will be saved.
             override: Optional dict of overrides (mirrors a ``--override`` file).
                 Applied after recipe defaults but before explicit kwargs.
@@ -105,6 +110,7 @@ def from_recipe(
             horizon_days=horizon_days,
             primary_task=primary_task,
             label_window_days=label_window_days,
+            snapshot_day=snapshot_day,
             output_path=output_path,
             override=override,
         )
diff --git a/leadforge/api/recipes.py b/leadforge/api/recipes.py
index 8a39fb5..846918b 100644
--- a/leadforge/api/recipes.py
+++ b/leadforge/api/recipes.py
@@ -41,6 +41,7 @@ class Recipe:
     default_population: dict[str, int]
     horizon_days: int
     label_window_days: int | None = None
+    snapshot_day: int | None = None
 
     # ------------------------------------------------------------------ #
     # Construction
@@ -105,6 +106,17 @@ def from_dict(cls, data: dict[str, Any]) -> Recipe:
                 raise InvalidRecipeError(f"'label_window_days' must be positive, got {raw_lwd}")
             label_window_days = raw_lwd
 
+        snapshot_day: int | None = None
+        raw_sd = data.get("snapshot_day")
+        if raw_sd is not None:
+            if isinstance(raw_sd, bool) or not isinstance(raw_sd, int):
+                raise InvalidRecipeError(
+                    f"'snapshot_day' must be a positive int, got {type(raw_sd).__name__!r}"
+                )
+            if raw_sd <= 0:
+                raise InvalidRecipeError(f"'snapshot_day' must be positive, got {raw_sd}")
+            snapshot_day = raw_sd
+
         return cls(
             id=data["id"],
             title=data["title"],
@@ -116,6 +128,7 @@ def from_dict(cls, data: dict[str, Any]) -> Recipe:
             default_population=dict(pop),
             horizon_days=horizon_days,
             label_window_days=label_window_days,
+            snapshot_day=snapshot_day,
         )
 
     # ------------------------------------------------------------------ #
@@ -134,6 +147,7 @@ def resolve_config(
         horizon_days: int | None = None,
         primary_task: str | None = None,
         label_window_days: int | None = None,
+        snapshot_day: int | None = None,
         output_path: str = _MISSING,  # type: ignore[assignment]
         override: dict[str, Any] | None = None,
     ) -> GenerationConfig:
@@ -165,6 +179,7 @@ def resolve_config(
             "horizon_days": pkg["horizon_days"],
             "primary_task": pkg["primary_task"],
             "label_window_days": pkg["label_window_days"],
+            "snapshot_day": pkg["snapshot_day"],
         }
 
         # Layer 3 — recipe defaults
@@ -176,6 +191,8 @@ def resolve_config(
         resolved["primary_task"] = self.primary_task
         if self.label_window_days is not None:
             resolved["label_window_days"] = self.label_window_days
+        if self.snapshot_day is not None:
+            resolved["snapshot_day"] = self.snapshot_day
 
         # Layer 2 — override dict (beats recipe/package defaults)
         if override:
@@ -186,6 +203,7 @@ def resolve_config(
                 "horizon_days",
                 "primary_task",
                 "label_window_days",
+                "snapshot_day",
                 "seed",
                 "output_path",
                 "exposure_mode",
@@ -216,6 +234,8 @@ def resolve_config(
             resolved["primary_task"] = primary_task
         if label_window_days is not None:
             resolved["label_window_days"] = label_window_days
+        if snapshot_day is not None:
+            resolved["snapshot_day"] = snapshot_day
 
         try:
             mode = ExposureMode(resolved["exposure_mode"])
@@ -254,6 +274,7 @@ def resolve_config(
             horizon_days=resolved["horizon_days"],
             primary_task=resolved["primary_task"],
             label_window_days=resolved["label_window_days"],
+            snapshot_day=resolved["snapshot_day"],
             output_path=resolved["output_path"],
         )
 
diff --git a/leadforge/core/models.py b/leadforge/core/models.py
index f8eed75..db8b609 100644
--- a/leadforge/core/models.py
+++ b/leadforge/core/models.py
@@ -64,6 +64,7 @@ class GenerationConfig:
     horizon_days: int = 90
     primary_task: str = "converted_within_90_days"
     label_window_days: int = 90
+    snapshot_day: int | None = None
     output_path: str = "./out"
     package_version: str = field(default_factory=lambda: __version__)
     difficulty_params: DifficultyParams | None = None
@@ -87,6 +88,19 @@ def __post_init__(self) -> None:
                 f"label_window_days ({self.label_window_days}) must not exceed "
                 f"horizon_days ({self.horizon_days})"
             )
+        if self.snapshot_day is not None:
+            if isinstance(self.snapshot_day, bool) or not isinstance(self.snapshot_day, int):
+                raise InvalidConfigError(
+                    f"snapshot_day must be a positive int or None, "
+                    f"got {type(self.snapshot_day).__name__!r}"
+                )
+            if self.snapshot_day <= 0:
+                raise InvalidConfigError(f"snapshot_day must be positive, got {self.snapshot_day}")
+            if self.snapshot_day > self.horizon_days:
+                raise InvalidConfigError(
+                    f"snapshot_day ({self.snapshot_day}) must not exceed "
+                    f"horizon_days ({self.horizon_days})"
+                )
         # Coerce string enums supplied as plain strings
         if not isinstance(self.exposure_mode, ExposureMode):
             try:
diff --git a/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml b/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml
index 44b48ac..08843a3 100644
--- a/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml
+++ b/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml
@@ -18,3 +18,9 @@ default_population:
   n_contacts: 4200
   n_leads: 5000
 horizon_days: 90
+# Windowed snapshot: features aggregate only events within
+# [lead_created_at, lead_created_at + snapshot_day].  Anchored before the
+# 90-day label window closes so event-aggregate features (touch_count,
+# session_count, expected_acv, etc.) cannot encode post-conversion data.
+# total_touches_all retains the pedagogical leakage trap (full-horizon counts).
+snapshot_day: 30
diff --git a/leadforge/render/manifests.py b/leadforge/render/manifests.py
index 2c71312..27023c5 100644
--- a/leadforge/render/manifests.py
+++ b/leadforge/render/manifests.py
@@ -28,7 +28,15 @@
 #         feature list (zero-variance); ``is_sql`` redacted in
 #         ``student_public`` mode (near-deterministic for non-conversion).
 #         ``manifest.redacted_columns`` was already added in PR #56.
-BUNDLE_SCHEMA_VERSION = "3"
+#   "4" — issue #57 sub-item 1: windowed snapshot.  Event-aggregate
+#         features (touch_count, session_count, expected_acv, ...) now
+#         aggregate only events within ``[lead_created_at, lead_created_at
+#         + snapshot_day]``.  Column SET unchanged from v3, but column
+#         VALUES are no longer full-horizon — consumers pinning v3 and
+#         assuming "features computed over full horizon" must update.
+#         ``manifest.snapshot_day`` recorded so the contract is
+#         self-describing (``null`` means full-horizon, legacy behaviour).
+BUNDLE_SCHEMA_VERSION = "4"
 
 # Manifest fields whose value is non-deterministic by design (wall-clock,
 # host metadata, etc.).  Determinism checks must ignore these fields when
@@ -106,6 +114,7 @@ def build_manifest(
         "horizon_days": config.horizon_days,
         "primary_task": config.primary_task,
         "label_window_days": config.label_window_days,
+        "snapshot_day": config.snapshot_day,
         "motif_family": world_graph.motif_family,
         "redacted_columns": redacted_columns_list,
         "tables": tables,
diff --git a/tests/render/test_bundle_schema_v3_contract.py b/tests/render/test_bundle_schema_v4_contract.py
similarity index 60%
rename from tests/render/test_bundle_schema_v3_contract.py
rename to tests/render/test_bundle_schema_v4_contract.py
index 2135bde..b839471 100644
--- a/tests/render/test_bundle_schema_v3_contract.py
+++ b/tests/render/test_bundle_schema_v4_contract.py
@@ -1,4 +1,4 @@
-"""Schema contract test for ``bundle_schema_version == "3"``.
+"""Schema contract test for ``bundle_schema_version == "4"``.
 
 The constants below are an *intentional* duplication of the column sets
 the bundle writer produces.  The duplication is the point: any change to
@@ -8,6 +8,11 @@
 here, forcing the author to either update the contract (and bump
 ``BUNDLE_SCHEMA_VERSION``) or revisit the change.
 
+v4 vs v3: the column SET is unchanged.  The bump records the semantic
+shift introduced by the windowed snapshot — event-aggregate column
+VALUES are now anchored at ``snapshot_day`` instead of the full horizon.
+``manifest.snapshot_day`` makes the contract self-describing.
+
 If you find yourself wondering "do I have to update this?": yes.  That
 is the failure mode this test is designed to catch.
 """
@@ -22,10 +27,10 @@
 
 from leadforge.api.generator import Generator
 
-# Pinned column lists for bundle schema v3.  Update *together* with
+# Pinned column lists for bundle schema v4.  Update *together* with
 # ``BUNDLE_SCHEMA_VERSION`` and ``LEAD_SNAPSHOT_FEATURES``.
 
-V3_TASK_COLUMNS_STUDENT_PUBLIC: frozenset[str] = frozenset(
+V4_TASK_COLUMNS_STUDENT_PUBLIC: frozenset[str] = frozenset(
     {
         "account_id",
         "industry",
@@ -62,12 +67,12 @@
     }
 )
 
-V3_TASK_COLUMNS_RESEARCH_INSTRUCTOR: frozenset[str] = V3_TASK_COLUMNS_STUDENT_PUBLIC | {
+V4_TASK_COLUMNS_RESEARCH_INSTRUCTOR: frozenset[str] = V4_TASK_COLUMNS_STUDENT_PUBLIC | {
     "current_stage",
     "is_sql",
 }
 
-V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC: frozenset[str] = frozenset(
+V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC: frozenset[str] = frozenset(
     {
         "lead_id",
         "contact_id",
@@ -82,7 +87,7 @@
     }
 )
 
-V3_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR: frozenset[str] = V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC | {
+V4_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR: frozenset[str] = V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC | {
     "current_stage",
     "is_sql",
 }
@@ -97,14 +102,14 @@ def _build(mode: str, out: Path, seed: int = 42) -> None:
 
 @pytest.fixture(scope="module")
 def student_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path:
-    out = tmp_path_factory.mktemp("v3_student")
+    out = tmp_path_factory.mktemp("v4_student")
     _build("student_public", out)
     return out
 
 
 @pytest.fixture(scope="module")
 def instructor_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path:
-    out = tmp_path_factory.mktemp("v3_instructor")
+    out = tmp_path_factory.mktemp("v4_instructor")
     _build("research_instructor", out)
     return out
 
@@ -117,52 +122,64 @@ def _leads_cols(bundle: Path) -> frozenset[str]:
     return frozenset(pq.read_schema(bundle / "tables/leads.parquet").names)
 
 
-def test_manifest_declares_v3(student_bundle: Path, instructor_bundle: Path) -> None:
+def test_manifest_declares_v4(student_bundle: Path, instructor_bundle: Path) -> None:
     for b in (student_bundle, instructor_bundle):
         manifest = json.loads((b / "manifest.json").read_text())
-        assert manifest["bundle_schema_version"] == "3", (
+        assert manifest["bundle_schema_version"] == "4", (
             f"{b.name}: bundle_schema_version is {manifest['bundle_schema_version']!r}, "
-            "expected '3'"
+            "expected '4'"
+        )
+
+
+def test_manifest_records_snapshot_day(student_bundle: Path, instructor_bundle: Path) -> None:
+    """v4 contract: manifest must surface ``snapshot_day`` so consumers can
+    distinguish full-horizon (legacy) bundles from windowed bundles."""
+    for b in (student_bundle, instructor_bundle):
+        manifest = json.loads((b / "manifest.json").read_text())
+        assert "snapshot_day" in manifest, f"{b.name}: manifest is missing 'snapshot_day' field"
+        # The b2b_saas_procurement_v1 recipe pins snapshot_day=30.
+        assert manifest["snapshot_day"] == 30, (
+            f"{b.name}: snapshot_day is {manifest['snapshot_day']!r}, expected 30"
         )
 
 
-def test_student_public_task_columns_match_v3_contract(student_bundle: Path) -> None:
+def test_student_public_task_columns_match_v4_contract(student_bundle: Path) -> None:
     actual = _task_cols(student_bundle)
-    assert actual == V3_TASK_COLUMNS_STUDENT_PUBLIC, (
-        f"student_public task split columns drifted from v3 contract.\n"
-        f"  unexpected: {sorted(actual - V3_TASK_COLUMNS_STUDENT_PUBLIC)}\n"
-        f"  missing:    {sorted(V3_TASK_COLUMNS_STUDENT_PUBLIC - actual)}\n"
-        "  → either update tests/render/test_bundle_schema_v3_contract.py and "
+    assert actual == V4_TASK_COLUMNS_STUDENT_PUBLIC, (
+        f"student_public task split columns drifted from v4 contract.\n"
+        f"  unexpected: {sorted(actual - V4_TASK_COLUMNS_STUDENT_PUBLIC)}\n"
+        f"  missing:    {sorted(V4_TASK_COLUMNS_STUDENT_PUBLIC - actual)}\n"
+        "  → either update tests/render/test_bundle_schema_v4_contract.py and "
         "bump BUNDLE_SCHEMA_VERSION, or revert the schema change."
     )
 
 
-def test_research_instructor_task_columns_match_v3_contract(instructor_bundle: Path) -> None:
+def test_research_instructor_task_columns_match_v4_contract(instructor_bundle: Path) -> None:
     actual = _task_cols(instructor_bundle)
-    assert actual == V3_TASK_COLUMNS_RESEARCH_INSTRUCTOR, (
-        f"research_instructor task split columns drifted from v3 contract.\n"
-        f"  unexpected: {sorted(actual - V3_TASK_COLUMNS_RESEARCH_INSTRUCTOR)}\n"
-        f"  missing:    {sorted(V3_TASK_COLUMNS_RESEARCH_INSTRUCTOR - actual)}"
+    assert actual == V4_TASK_COLUMNS_RESEARCH_INSTRUCTOR, (
+        f"research_instructor task split columns drifted from v4 contract.\n"
+        f"  unexpected: {sorted(actual - V4_TASK_COLUMNS_RESEARCH_INSTRUCTOR)}\n"
+        f"  missing:    {sorted(V4_TASK_COLUMNS_RESEARCH_INSTRUCTOR - actual)}"
     )
 
 
-def test_student_public_leads_table_columns_match_v3_contract(student_bundle: Path) -> None:
+def test_student_public_leads_table_columns_match_v4_contract(student_bundle: Path) -> None:
     actual = _leads_cols(student_bundle)
-    assert actual == V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC, (
-        f"student_public tables/leads.parquet columns drifted from v3 contract.\n"
-        f"  unexpected: {sorted(actual - V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC)}\n"
-        f"  missing:    {sorted(V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC - actual)}"
+    assert actual == V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC, (
+        f"student_public tables/leads.parquet columns drifted from v4 contract.\n"
+        f"  unexpected: {sorted(actual - V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC)}\n"
+        f"  missing:    {sorted(V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC - actual)}"
     )
 
 
-def test_research_instructor_leads_table_columns_match_v3_contract(
+def test_research_instructor_leads_table_columns_match_v4_contract(
     instructor_bundle: Path,
 ) -> None:
     actual = _leads_cols(instructor_bundle)
-    assert actual == V3_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR, (
-        f"research_instructor tables/leads.parquet columns drifted from v3 contract.\n"
-        f"  unexpected: {sorted(actual - V3_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR)}\n"
-        f"  missing:    {sorted(V3_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR - actual)}"
+    assert actual == V4_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR, (
+        f"research_instructor tables/leads.parquet columns drifted from v4 contract.\n"
+        f"  unexpected: {sorted(actual - V4_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR)}\n"
+        f"  missing:    {sorted(V4_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR - actual)}"
     )
 
 
@@ -176,9 +193,10 @@ def test_research_instructor_redacted_set_in_manifest(instructor_bundle: Path) -
     assert manifest["redacted_columns"] == []
 
 
-def test_is_mql_absent_from_all_v3_artifacts(student_bundle: Path, instructor_bundle: Path) -> None:
-    """``is_mql`` was removed entirely in v3 (issue #57): not in the snapshot,
-    not in the relational table, not in the feature dictionary, not anywhere."""
+def test_is_mql_absent_from_all_v4_artifacts(student_bundle: Path, instructor_bundle: Path) -> None:
+    """``is_mql`` was removed entirely in v3 (issue #57) and remains absent in v4:
+    not in the snapshot, not in the relational table, not in the feature
+    dictionary, not anywhere."""
     for b in (student_bundle, instructor_bundle):
         assert "is_mql" not in _task_cols(b)
         assert "is_mql" not in _leads_cols(b)
diff --git a/tests/render/test_windowed_bundle_trap.py b/tests/render/test_windowed_bundle_trap.py
new file mode 100644
index 0000000..c51b59a
--- /dev/null
+++ b/tests/render/test_windowed_bundle_trap.py
@@ -0,0 +1,69 @@
+"""End-to-end test that the leakage trap remains pedagogically meaningful
+once the bundle uses a windowed snapshot.
+
+The pedagogical contract is that ``total_touches_all`` (full-horizon counts)
+diverges from ``touch_count`` (windowed counts) for at least some leads.
+If a careless refactor accidentally widened ``touch_count`` back to the full
+horizon, the trap would silently collapse — students could no longer see
+the gap that the v4 dataset card promises.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from leadforge.api.generator import Generator
+
+
+@pytest.fixture(scope="module")
+def windowed_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path:
+    """A small student_public bundle generated with the recipe default
+    ``snapshot_day`` (currently 30)."""
+    out = tmp_path_factory.mktemp("windowed_trap")
+    gen = Generator.from_recipe(
+        "b2b_saas_procurement_v1",
+        seed=42,
+        exposure_mode="student_public",
+    )
+    gen.generate(n_leads=200, n_accounts=80, n_contacts=200).save(str(out))
+    return out
+
+
+def test_windowed_bundle_uses_recipe_snapshot_day(windowed_bundle: Path) -> None:
+    import json
+
+    manifest = json.loads((windowed_bundle / "manifest.json").read_text())
+    assert manifest["snapshot_day"] == 30
+
+
+def test_total_touches_all_dominates_touch_count(windowed_bundle: Path) -> None:
+    """Invariant: ``total_touches_all >= touch_count`` for every lead."""
+    train = pd.read_parquet(
+        windowed_bundle / "tasks" / "converted_within_90_days" / "train.parquet",
+        columns=["touch_count", "total_touches_all"],
+    )
+    diff = train["total_touches_all"].astype("Float64") - train["touch_count"].astype("Float64")
+    assert (diff >= 0).all(), (
+        "total_touches_all < touch_count for some leads — the windowed feature "
+        "exceeded the full-horizon count, which contradicts the trap contract."
+    )
+
+
+def test_trap_gap_is_meaningful(windowed_bundle: Path) -> None:
+    """At ``snapshot_day < horizon_days``, the gap must be non-trivial: some
+    leads must have ``total_touches_all > touch_count``.  If not, the trap is
+    pedagogically dead and the windowed snapshot wiring is broken."""
+    train = pd.read_parquet(
+        windowed_bundle / "tasks" / "converted_within_90_days" / "train.parquet",
+        columns=["touch_count", "total_touches_all"],
+    )
+    diff = train["total_touches_all"].astype("Float64") - train["touch_count"].astype("Float64")
+    n_with_gap = int((diff > 0).sum())
+    assert n_with_gap > 0, (
+        "No lead has total_touches_all > touch_count — the trap gap collapsed.\n"
+        "Either snapshot_day was set to horizon_days (bundle is no longer windowed) "
+        "or the build_snapshot() windowing logic regressed."
+    )

From 4d8185e741bfbde1ec74221a6cfd13a90a192218 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Tue, 5 May 2026 07:44:40 +0300
Subject: [PATCH 2/4] docs: refresh release docs and changelog for windowed
 snapshot (v4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- release/README.md: refreshed leakage handling section with the v4 schema
  bump, the snapshot/redaction pair, and the new pedagogical framing of
  total_touches_all.  Removed the open-caveat block — the underlying
  caveat (event aggregates over the label window) is fixed.
- release/HF_DATASET_CARD.md: matched the README updates and bumped the
  schema-version reference.
- CHANGELOG.md: documented the schema v4 entry (windowed snapshot,
  manifest.snapshot_day, contract test, trap invariant guard) and dropped
  the "open follow-up" line that previously flagged sub-item 1 of #57.
- .agent-plan.md: marked all three sub-items of #57 resolved; added the
  windowed snapshot bullet to the Phase 5 release checklist.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .agent-plan.md             |  5 ++--
 CHANGELOG.md               | 47 +++++++++++++++++++++++++++++++-------
 release/HF_DATASET_CARD.md | 13 ++++++-----
 release/README.md          | 19 ++++++++-------
 4 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/.agent-plan.md b/.agent-plan.md
index b11f6ff..ed2a914 100644
--- a/.agent-plan.md
+++ b/.agent-plan.md
@@ -46,6 +46,7 @@ First public dataset release: `leadforge-b2b-lead-scoring`. Three difficulty tie
 - [x] Update release/HF_DATASET_CARD.md — add conversion rates to summary table
 - [x] Verify SHA-256 hash determinism (re-run build, compare hashes) — `scripts/verify_hash_determinism.py`; 73/73 files identical across two `build_public_release.py` runs (modulo `manifest.json`'s wall-clock `generation_timestamp`)
 - [x] Fix `current_stage` leakage in student_public bundles via exposure-layer redaction — `is_leakage_trap` flag distinguishes the pedagogical trap (`total_touches_all`) from true label leaks; `BundleFilter.redacted_columns` strips the latter; `validate_bundle()` enforces the invariant. 73/73 hash-determinism preserved.
+- [x] Windowed snapshot for student_public bundles — `snapshot_day=30` pinned in recipe; event-aggregate features no longer share the 90-day label window; `BUNDLE_SCHEMA_VERSION` bumped to 4; 73/73 hash-determinism preserved; conversion rates unchanged (41.5% / 20.1% / 7.9%).
 - [ ] Upload to Kaggle and HuggingFace
 - [ ] Announce
 
@@ -66,11 +67,11 @@ First public dataset release: `leadforge-b2b-lead-scoring`. Three difficulty tie
 
 Deterministic leak fixed via exposure-layer redaction. `FeatureSpec` now carries an explicit `redact_in_modes: frozenset[ExposureMode]` field — *prescriptive* — alongside the descriptive `leakage_risk` flag. `current_stage` is marked `redact_in_modes={ExposureMode.student_public}`; the writer queries `redacted_columns_for(mode)` and strips matching columns from the snapshot, task splits, and feature dictionary before they hit disk. The pedagogical trap `total_touches_all` is preserved in all modes (no entry in `redact_in_modes`). The manifest records `redacted_columns: [...]` so the bundle is self-describing. `validate_bundle()` cross-checks parquet schemas, feature dictionary, and the manifest's declared redaction set against `redacted_columns_for(mode)` derived independently from the feature spec. Hash-determinism preserved (73/73 identical across builds).
 
-### Follow-up: structural leakage in `student_public` bundles (issue #57)
+### Follow-up: structural leakage in `student_public` bundles (issue #57) — fully resolved
 
 Tracked in [GitHub issue #57](https://github.com/leadforge-dev/leadforge/issues/57).
 
-1. **Event-aggregate features are computed over the label window.** `touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, etc. all aggregate events in `[lead_created_at, lead_created_at + 90d]`, the same window over which the label resolves. The structural fix is a windowed snapshot (`snapshot_day=N` with `N < label_window_days`), as v6/v7 datasets already do at day 14/20. **Open** — its own PR with documentation recalibration; will likely bump `BUNDLE_SCHEMA_VERSION` again.
+1. ~~**Event-aggregate features are computed over the label window.** `touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, etc. all aggregate events in `[lead_created_at, lead_created_at + 90d]`, the same window over which the label resolves.~~ **Resolved** — windowed snapshot at `snapshot_day=30` (recipe default). `BUNDLE_SCHEMA_VERSION` bumped 3 → 4; `manifest.snapshot_day` records the contract. Conversion rates invariant (label is event-derived from `label_window_days`); trap gap (`total_touches_all − touch_count`) ~3 touches with 54–77% of leads showing divergence. Guarded by `test_bundle_schema_v4_contract.py` and `test_windowed_bundle_trap.py`.
 2. ~~**`is_sql=False` is near-deterministic for non-conversion.** Measured on the regenerated bundle: P(converted | is_sql=False) = 0.038 (intro), 0.015 (intermediate), 0.006 (advanced).~~ **Resolved** — `is_sql` redacted in `student_public` mode by post-#57 PR (bundle schema v3).
 3. ~~**`is_mql` is a constant `True`.** Zero variance feature in all three tiers.~~ **Resolved** — `is_mql` removed from the canonical feature list by post-#57 PR (bundle schema v3). Guarded by a new `test_no_zero_variance_features` check.
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4093008..1ddc4cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,45 @@ Format inspired by [Keep a Changelog](https://keepachangelog.com/).
 
 ## Unreleased
 
+### Bundle schema v4
+
+`bundle_schema_version` bumped from `"3"` to `"4"`.  Closes the final
+sub-item of issue #57: event-aggregate features are no longer computed
+over the same 90-day window the label resolves in.
+
+- **Windowed snapshot.**  `GenerationConfig.snapshot_day` (also exposed
+  as a recipe-level field and an explicit kwarg on
+  `Generator.from_recipe()`) now controls the feature aggregation
+  window.  When set, `build_snapshot()` filters touches, sessions,
+  sales activities, and opportunities to events with timestamp
+  ≤ `lead_created_at + snapshot_day`.  The
+  `b2b_saas_procurement_v1` recipe pins `snapshot_day: 30` —
+  measurements at seed 42, n_leads=5000 across all three difficulty
+  tiers showed day 30 keeps LR AUC in [0.85, 0.86] (challenging but
+  modelable) while preserving a meaningful trap gap of ~3 touches
+  with 54–77% of leads showing any divergence between
+  `total_touches_all` (full-horizon) and `touch_count` (windowed).
+- **Conversion rates unchanged.**  The label is event-derived from
+  `label_window_days` in the simulator and is independent of
+  `snapshot_day`, so the published rates stay at 41.5% / 20.1% / 7.9%
+  (intro / intermediate / advanced) — well inside the declared
+  `difficulty_profiles.yaml` ranges.
+- **`manifest.snapshot_day` recorded.**  The published bundle
+  declares its windowing contract; consumers can distinguish
+  full-horizon (legacy v2/v3) bundles from windowed (v4) bundles
+  without inspecting package internals.  Column SET is unchanged
+  from v3, but column VALUES are no longer full-horizon — a contract
+  shift that v3 consumers would not detect from schema alone.
+- **Schema contract test.**  `tests/render/test_bundle_schema_v3_contract.py`
+  renamed to `test_bundle_schema_v4_contract.py` and gains a
+  `snapshot_day == 30` assertion alongside the existing column-set
+  pinning.
+- **Trap invariant guard.**  New `tests/render/test_windowed_bundle_trap.py`
+  asserts `total_touches_all >= touch_count` for every lead and
+  `>` for at least some — guarding against a future refactor that
+  silently widens `touch_count` back to the full horizon and
+  collapses the pedagogical gap.
+
 ### Bundle schema v3
 
 `bundle_schema_version` bumped from `"2"` to `"3"`.  Three structural
@@ -53,14 +92,6 @@ changes follow up on PR #56 (issue #57):
   columns (down from 35); **11** columns in `tables/leads.parquet`
   (down from 12 — `is_mql` removed).
 
-### Open follow-up
-
-Issue #57 sub-item 1 remains open: event-aggregate features
-(`touch_count`, `session_count`, `pricing_page_views`, ...) are still
-computed over the same 90-day window the label resolves in.  The
-structural fix is a windowed snapshot rebuild and is deferred to its
-own PR.
-
 ---
 
 ## v1.0.0 — 2026-05-02
diff --git a/release/HF_DATASET_CARD.md b/release/HF_DATASET_CARD.md
index c01136f..b70dcc9 100644
--- a/release/HF_DATASET_CARD.md
+++ b/release/HF_DATASET_CARD.md
@@ -51,7 +51,7 @@ A relational, reproducible, multi-difficulty lead scoring dataset generated by [
 
 1. **Relational structure.** 9 normalized tables plus ML-ready task splits. Practice feature engineering from raw tables, or grab the flat file and start modeling.
 2. **Three difficulty tiers.** Same world, different conversion rates, signal-to-noise ratios, and missingness.
-3. **Reproducible and leakage-safe.** Deterministic generation (seed 42), SHA-256 hashes. The label-encoding `current_stage` column is stripped from public bundles via the exposure layer; the `redacted_columns` field in `manifest.json` records what was removed. The deliberately included `total_touches_all` leakage trap is retained as a teaching feature.
+3. **Reproducible and leakage-safe.** Deterministic generation (seed 42), SHA-256 hashes. Event-aggregate features are computed over a 30-day window (`manifest.snapshot_day == 30`) while the label resolves over 90 days, so features cannot encode post-conversion events. The label-encoding `current_stage` column is stripped from public bundles via the exposure layer (`redacted_columns` in `manifest.json` lists what was removed). The deliberately included `total_touches_all` leakage trap — counting touches over the full 90-day horizon — is retained as a teaching feature.
 
 ## Quick start
 
@@ -92,13 +92,14 @@ df = pd.read_csv("hf://datasets/leadforge/leadforge-b2b-lead-scoring/intermediat
 
 Each difficulty tier includes 9 Parquet tables under `tables/`: accounts, contacts, leads, touches, sessions, sales_activities, opportunities, customers, subscriptions. These form a normalized CRM schema linked by foreign keys.
 
-## Leakage handling (bundle schema v3)
+## Leakage handling (bundle schema v4)
 
-- **Stripped from public bundles:** `current_stage` (label-encoding at the 90-day horizon) and `is_sql` (P(conv | is_sql=False) ≈ 0.04 / 0.015 / 0.006 across tiers — near-deterministic for non-conversion). Both available in `intermediate_instructor/`. The `manifest.json` field `redacted_columns` lists what was stripped.
-- **Removed entirely:** `is_mql` (constant `True` in the simulator — zero variance, no information).
-- **Deliberately retained as a pedagogical trap:** `total_touches_all` counts touches over the full 90-day window including post-snapshot events. Flagged `leakage_risk=True` in `feature_dictionary.csv`. Use it as an exercise — train with and without, compare AUC, explain the gap.
+Two complementary mechanisms keep the published feature set leakage-safe:
 
-**Caveat:** event-aggregate features (`touch_count`, `session_count`, ...) are computed over the same 90-day window the label resolves in, so they correlate with post-conversion events. A windowed-snapshot rebuild is the structural fix — see [issue #57](https://github.com/leadforge-dev/leadforge/issues/57).
+- **Windowed snapshot.** Every event-aggregate feature (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) is computed over the first 30 days only (`manifest.snapshot_day == 30`). The label resolves over the full 90 days (`manifest.label_window_days == 90`). So features cannot accidentally encode events that occur after a lead converts.
+- **Column redaction.** `current_stage` (label-encoding at the 90-day horizon) and `is_sql` (P(conv | is_sql=False) ≈ 0.04 / 0.015 / 0.006 across tiers — near-deterministic for non-conversion) are stripped from `student_public` bundles entirely, including from `tables/leads.parquet`. Both are still available in `intermediate_instructor/`. `manifest.redacted_columns` lists exactly what was removed.
+- **Removed entirely:** `is_mql` (constant `True` in the simulator — zero variance, no information).
+- **Deliberately retained as a pedagogical trap:** `total_touches_all` counts touches over the full 90-day horizon — the only feature that crosses the snapshot window. Flagged `leakage_risk=True` in `feature_dictionary.csv`. Use it as an exercise: train with and without, compare AUC, explain the gap.
 
 ## Research companion
 
diff --git a/release/README.md b/release/README.md
index e174f49..f7a2afe 100644
--- a/release/README.md
+++ b/release/README.md
@@ -10,7 +10,7 @@ Most public lead scoring datasets are flat CSVs with opaque provenance. This one
 
 2. **Three difficulty tiers.** Same company, same product, same buyer personas -- different difficulty profiles that produce meaningfully different conversion rates, noise levels, and missingness.
 
-3. **Reproducible and leakage-safe.** Deterministic generation from a fixed seed. SHA-256 hashes for every file in `manifest.json`. The label-encoding `current_stage` column is stripped from the public bundles in the exposure layer; the only leakage-flagged column that ships in `student_public` is the deliberately included pedagogical trap `total_touches_all`, marked `is_leakage_trap=True` in the feature dictionary. All features are anchored at the snapshot date -- no post-cutoff data leaks in by accident.
+3. **Reproducible and leakage-safe.** Deterministic generation from a fixed seed. SHA-256 hashes for every file in `manifest.json`. The label-encoding `current_stage` column is stripped from the public bundles in the exposure layer. Event-aggregate features (`touch_count`, `session_count`, `pricing_page_views`, ...) are computed over a 30-day window — they cannot encode events that happen *after* day 30, even though the label resolves over a 90-day window. The only leakage-flagged column that ships in `student_public` is the deliberately included pedagogical trap `total_touches_all`, which counts the full 90-day touch history and is marked `is_leakage_trap=True` in the feature dictionary.
 
 ## What's inside
 
@@ -71,7 +71,7 @@ train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parqu
 test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet")
 ```
 
-**Note:** The student-facing Parquet files contain `total_touches_all`, a deliberately included leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`). Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`.
+**Note:** Every event-aggregate feature (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) only counts events that occurred within the first **30 days** after the lead was created — that's the snapshot window. The label is then resolved over the next 60 days. So features cannot accidentally encode post-conversion events. The exception is `total_touches_all`, a deliberately included leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`) that counts touches over the full 90-day horizon. Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`.
 
 ### Option 3: Relational tables (feature engineering)
 
@@ -128,22 +128,21 @@ The sales funnel runs through inbound marketing (45%), SDR outbound (35%), and p
 
 Each bundle contains a `dataset_card.md` and a `feature_dictionary.csv` with the authoritative, auto-generated column list, descriptions, dtypes, and `leakage_risk` flags. Refer to those rather than mirroring counts here, which would drift.
 
-**Leakage handling (bundle schema v3)**
+**Leakage handling (bundle schema v4)**
 
-Redaction in `student_public` bundles applies uniformly to **both** the task splits *and* the relational tables. Joining `tables/leads.parquet` back to your features cannot recover a redacted column.
+Two separate mechanisms keep the published feature set leakage-safe:
+
+1. **Windowed snapshot.** Every event-aggregate feature is computed over a 30-day window (`manifest.snapshot_day == 30`); the label resolves over the full 90 days (`manifest.label_window_days == 90`). Features cannot see touches, sessions, or opportunities that occurred after day 30. The only feature that intentionally crosses this line is `total_touches_all`, the pedagogical trap.
+2. **Column redaction.** A small set of columns that *would* encode the label structurally (`current_stage`, `is_sql`) are stripped from `student_public` bundles entirely — both from `tasks/` splits and from `tables/leads.parquet`, so feature engineering off the relational tables cannot recover them.
 
 | Column | Status in `student_public` | Status in `intermediate_instructor` | Why |
 |---|---|---|---|
 | `current_stage` | redacted (gone from task splits and `tables/leads.parquet`) | retained | At day 90 this contains terminal stages (`closed_won`/`closed_lost`) that encode the label directly. |
 | `is_sql` | redacted | retained | `is_sql=False` predicts non-conversion with very high probability — measured across 5 seeds, P(conv \| is_sql=False) = 0.061 ± 0.026 (intro) / 0.020 ± 0.010 (intermediate) / 0.011 ± 0.004 (advanced). |
 | `is_mql` | removed entirely (no mode has it) | removed entirely | Every lead is initialised at MQL stage in the simulator, so the field was constant `True` and carried no information. |
-| `total_touches_all` | retained | retained | Deliberate pedagogical leakage trap (counts touches over the full 90-day window). Flagged `leakage_risk=True` in `feature_dictionary.csv`. Train with and without it, compare AUC, explain the gap. |
-
-The `redacted_columns` field in each bundle's `manifest.json` records exactly what was stripped.
-
-**Known caveats** (tracked in [issue #57](https://github.com/leadforge-dev/leadforge/issues/57)):
+| `total_touches_all` | retained | retained | Deliberate pedagogical leakage trap. Counts touches over the full 90-day horizon while every other touch feature stops at day 30, so the gap (`total_touches_all - touch_count`) carries real signal. Flagged `leakage_risk=True` in `feature_dictionary.csv`. Train with and without it, compare AUC, explain the gap. |
 
-- All event-aggregate features (`touch_count`, `session_count`, `pricing_page_views`, ...) are computed over the same 90-day window in which the label resolves. They correlate with post-conversion events. The structural fix is a windowed snapshot rebuild — open follow-up.
+The `redacted_columns` and `snapshot_day` fields in each bundle's `manifest.json` record exactly what was stripped and at what window features were computed.
 
 ## Research companion
 

From f19d58e0897ad375aaac5173a4bd955006e6b0e4 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Tue, 5 May 2026 08:03:23 +0300
Subject: [PATCH 3/4] fix: tighten snapshot_day contract per self-review of #59
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Self-review of the windowed-snapshot PR surfaced seven gaps; this commit
addresses all of them.

- Add tests/test_snapshot_day_threading.py — 22 tests covering
  Recipe.from_dict parsing (positive int, absent → None, rejection of
  zero/negative/string/bool), resolve_config precedence across all four
  layers (package default, recipe, override dict, kwarg), and
  GenerationConfig validation paths.  Models the existing
  test_primary_task_threading.py.
- Validate snapshot_day <= label_window_days in
  GenerationConfig.__post_init__.  A snapshot anchored after the label
  closes would let features observe events past the label-scoring
  window — the exact structural leakage v4 is here to prevent.
- Strengthen test_windowed_bundle_trap.py: replace the bare ``> 0`` check
  with ``frac_with_gap >= 0.20``.  Measurements at the recipe default
  show 54-77% of leads carry a gap; a regression that collapsed the trap
  to a single lead would no longer pass silently.
- Pin snapshot_day, primary_task, label_window_days, redacted_columns in
  the required-keys set in tests/render/test_render.py.  These were
  already in the manifest but not formally pinned anywhere outside the
  v4 contract test.
- Surface snapshot_day in the bundle-internal dataset_card.md: add
  ``Label window`` and ``Feature snapshot window`` rows to the header
  table, and replace the generic "Features are anchored at the snapshot
  date" caveat with concrete prose naming the day-30 feature window, the
  day-90 label window, and the total_touches_all trap.  Consumers
  reading the card now see the v4 contract without needing the
  changelog.
- Reconcile snapshot_day validation messages: both layers now say
  "must be a positive int or None/null".
- Trim the recipe.yaml comment to a one-liner pointing at CHANGELOG /
  release docs for the rationale.

937 tests pass (was 915; +22 from the new threading file).
ruff + mypy clean; verify_hash_determinism.py PASS 73/73.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 leadforge/api/recipes.py                      |   6 +-
 leadforge/core/models.py                      |  15 +-
 leadforge/narrative/dataset_card.py           |  24 ++-
 .../b2b_saas_procurement_v1/recipe.yaml       |   7 +-
 tests/render/test_render.py                   |   4 +
 tests/render/test_windowed_bundle_trap.py     |  23 ++-
 tests/test_snapshot_day_threading.py          | 184 ++++++++++++++++++
 7 files changed, 246 insertions(+), 17 deletions(-)
 create mode 100644 tests/test_snapshot_day_threading.py

diff --git a/leadforge/api/recipes.py b/leadforge/api/recipes.py
index 846918b..17d11d7 100644
--- a/leadforge/api/recipes.py
+++ b/leadforge/api/recipes.py
@@ -111,10 +111,12 @@ def from_dict(cls, data: dict[str, Any]) -> Recipe:
         if raw_sd is not None:
             if isinstance(raw_sd, bool) or not isinstance(raw_sd, int):
                 raise InvalidRecipeError(
-                    f"'snapshot_day' must be a positive int, got {type(raw_sd).__name__!r}"
+                    f"'snapshot_day' must be a positive int or null, got {type(raw_sd).__name__!r}"
                 )
             if raw_sd <= 0:
-                raise InvalidRecipeError(f"'snapshot_day' must be positive, got {raw_sd}")
+                raise InvalidRecipeError(
+                    f"'snapshot_day' must be a positive int or null, got {raw_sd}"
+                )
             snapshot_day = raw_sd
 
         return cls(
diff --git a/leadforge/core/models.py b/leadforge/core/models.py
index db8b609..b78b9ac 100644
--- a/leadforge/core/models.py
+++ b/leadforge/core/models.py
@@ -95,12 +95,25 @@ def __post_init__(self) -> None:
                     f"got {type(self.snapshot_day).__name__!r}"
                 )
             if self.snapshot_day <= 0:
-                raise InvalidConfigError(f"snapshot_day must be positive, got {self.snapshot_day}")
+                raise InvalidConfigError(
+                    f"snapshot_day must be a positive int or None, got {self.snapshot_day}"
+                )
             if self.snapshot_day > self.horizon_days:
                 raise InvalidConfigError(
                     f"snapshot_day ({self.snapshot_day}) must not exceed "
                     f"horizon_days ({self.horizon_days})"
                 )
+            # A snapshot anchored after the label closes would let features
+            # observe events that occur beyond the label-scoring window —
+            # exactly the structural leakage the windowed snapshot is here
+            # to prevent.  Reject at config time.
+            if self.snapshot_day > self.label_window_days:
+                raise InvalidConfigError(
+                    f"snapshot_day ({self.snapshot_day}) must not exceed "
+                    f"label_window_days ({self.label_window_days}); a snapshot "
+                    f"anchored after the label closes would re-introduce "
+                    f"structural leakage."
+                )
         # Coerce string enums supplied as plain strings
         if not isinstance(self.exposure_mode, ExposureMode):
             try:
diff --git a/leadforge/narrative/dataset_card.py b/leadforge/narrative/dataset_card.py
index 01e687d..8f049ff 100644
--- a/leadforge/narrative/dataset_card.py
+++ b/leadforge/narrative/dataset_card.py
@@ -54,6 +54,11 @@ def render_dataset_card(
     # ------------------------------------------------------------------
     # Header
     # ------------------------------------------------------------------
+    snapshot_label = (
+        f"{cfg.snapshot_day} days (windowed)"
+        if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days
+        else f"{cfg.horizon_days} days (full horizon)"
+    )
     lines += [
         "# leadforge dataset card",
         "",
@@ -65,6 +70,8 @@ def render_dataset_card(
         f"| Exposure mode | `{cfg.exposure_mode}` |",
         f"| Difficulty | `{cfg.difficulty}` |",
         f"| Horizon | {cfg.horizon_days} days |",
+        f"| Label window | {cfg.label_window_days} days |",
+        f"| Feature snapshot window | {snapshot_label} |",
         "",
     ]
 
@@ -188,14 +195,27 @@ def render_dataset_card(
     # ------------------------------------------------------------------
     # Caveats
     # ------------------------------------------------------------------
+    if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days:
+        feature_window_caveat = (
+            f"- Event-aggregate features (e.g. `touch_count`, `session_count`, "
+            f"`expected_acv`) are computed over the first {cfg.snapshot_day} days "
+            f"after lead creation; the label resolves over the next "
+            f"{cfg.label_window_days - cfg.snapshot_day} days. The deliberate "
+            f"exception is `total_touches_all`, which counts touches over the "
+            f"full {cfg.horizon_days}-day horizon as a pedagogical leakage trap."
+        )
+    else:
+        feature_window_caveat = (
+            "- Features are anchored at the snapshot date. No post-anchor data is "
+            "included (leakage-free by construction)."
+        )
     lines += [
         "## Caveats",
         "",
         "- This is **synthetic** data. It does not represent any real company, product, or market.",
         "- The hidden world structure varies by motif family and stochastic rewiring; "
         "no two seeds produce the same DGP.",
-        "- Features are anchored at the snapshot date. No post-anchor data is "
-        "included (leakage-free by construction).",
+        feature_window_caveat,
         "- In `student_public` mode, the latent world graph, mechanism summary, "
         "and full world spec are withheld.",
         "",
diff --git a/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml b/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml
index 08843a3..3fc36d6 100644
--- a/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml
+++ b/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml
@@ -18,9 +18,6 @@ default_population:
   n_contacts: 4200
   n_leads: 5000
 horizon_days: 90
-# Windowed snapshot: features aggregate only events within
-# [lead_created_at, lead_created_at + snapshot_day].  Anchored before the
-# 90-day label window closes so event-aggregate features (touch_count,
-# session_count, expected_acv, etc.) cannot encode post-conversion data.
-# total_touches_all retains the pedagogical leakage trap (full-horizon counts).
+# Feature aggregation window in days; see CHANGELOG (bundle schema v4) and
+# release/README.md for the rationale.  Must satisfy snapshot_day <= label_window_days.
 snapshot_day: 30
diff --git a/tests/render/test_render.py b/tests/render/test_render.py
index fa4da40..aad4db1 100644
--- a/tests/render/test_render.py
+++ b/tests/render/test_render.py
@@ -375,7 +375,11 @@ def test_required_top_level_keys(self, sim_outputs, tmp_path):
             "n_contacts",
             "n_leads",
             "horizon_days",
+            "primary_task",
+            "label_window_days",
+            "snapshot_day",
             "motif_family",
+            "redacted_columns",
             "tables",
             "tasks",
         }
diff --git a/tests/render/test_windowed_bundle_trap.py b/tests/render/test_windowed_bundle_trap.py
index c51b59a..89c805a 100644
--- a/tests/render/test_windowed_bundle_trap.py
+++ b/tests/render/test_windowed_bundle_trap.py
@@ -53,17 +53,26 @@ def test_total_touches_all_dominates_touch_count(windowed_bundle: Path) -> None:
 
 
 def test_trap_gap_is_meaningful(windowed_bundle: Path) -> None:
-    """At ``snapshot_day < horizon_days``, the gap must be non-trivial: some
-    leads must have ``total_touches_all > touch_count``.  If not, the trap is
-    pedagogically dead and the windowed snapshot wiring is broken."""
+    """At ``snapshot_day < horizon_days``, the gap must be non-trivial.
+
+    A bare ``> 0`` check would pass even if the trap collapsed to a single
+    lead.  Measurements at the recipe default (``snapshot_day=30``) show
+    54-77% of leads carry a gap across all three difficulty tiers; the
+    intermediate bundle this test uses sits in that window, so a
+    ``>= 20%`` floor is well below the natural rate yet high enough that
+    a regression collapsing the trap would trip it.
+    """
     train = pd.read_parquet(
         windowed_bundle / "tasks" / "converted_within_90_days" / "train.parquet",
         columns=["touch_count", "total_touches_all"],
     )
     diff = train["total_touches_all"].astype("Float64") - train["touch_count"].astype("Float64")
     n_with_gap = int((diff > 0).sum())
-    assert n_with_gap > 0, (
-        "No lead has total_touches_all > touch_count — the trap gap collapsed.\n"
-        "Either snapshot_day was set to horizon_days (bundle is no longer windowed) "
-        "or the build_snapshot() windowing logic regressed."
+    frac_with_gap = n_with_gap / len(train)
+
+    assert frac_with_gap >= 0.20, (
+        f"Only {frac_with_gap:.1%} of leads carry a gap between total_touches_all "
+        f"and touch_count (n={n_with_gap}/{len(train)}).  Expected >=20%.\n"
+        "Either snapshot_day was set close to horizon_days (bundle is barely "
+        "windowed) or the build_snapshot() windowing logic regressed."
     )
diff --git a/tests/test_snapshot_day_threading.py b/tests/test_snapshot_day_threading.py
new file mode 100644
index 0000000..56c0fc5
--- /dev/null
+++ b/tests/test_snapshot_day_threading.py
@@ -0,0 +1,184 @@
+"""Tests for ``snapshot_day`` threading through the generation pipeline.
+
+Verifies all four precedence layers (package default → recipe → override
+dict → explicit kwarg) and the validation paths in
+:meth:`GenerationConfig.__post_init__`.
+
+Companion to :mod:`tests.test_primary_task_threading`, which covers the
+sibling fields ``primary_task`` and ``label_window_days`` introduced in
+PR #36 / PR #43.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from leadforge.api.generator import Generator
+from leadforge.api.recipes import Recipe
+from leadforge.core.exceptions import InvalidConfigError, InvalidRecipeError
+from leadforge.core.models import GenerationConfig
+from leadforge.core.serialization import load_json
+
+_SMALL = {"n_leads": 30, "n_accounts": 15, "n_contacts": 45}
+
+
+# ---------------------------------------------------------------------------
+# Recipe.from_dict parsing
+# ---------------------------------------------------------------------------
+
+
+def _base_recipe_dict(**extra) -> dict:
+    """Minimal valid recipe payload for from_dict tests."""
+    payload = {
+        "id": "test_recipe",
+        "title": "Test recipe",
+        "vertical": "test_vertical",
+        "description": "test",
+        "primary_task": "converted_within_90_days",
+        "supported_modes": ["student_public", "research_instructor"],
+        "supported_difficulty": ["intro", "intermediate", "advanced"],
+        "default_population": {"n_accounts": 1, "n_contacts": 1, "n_leads": 1},
+        "horizon_days": 90,
+    }
+    payload.update(extra)
+    return payload
+
+
+class TestRecipeFromDictSnapshotDay:
+    def test_absent_key_yields_none(self) -> None:
+        recipe = Recipe.from_dict(_base_recipe_dict())
+        assert recipe.snapshot_day is None
+
+    def test_positive_int_round_trips(self) -> None:
+        recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=30))
+        assert recipe.snapshot_day == 30
+
+    def test_zero_rejected(self) -> None:
+        with pytest.raises(InvalidRecipeError, match="snapshot_day"):
+            Recipe.from_dict(_base_recipe_dict(snapshot_day=0))
+
+    def test_negative_rejected(self) -> None:
+        with pytest.raises(InvalidRecipeError, match="snapshot_day"):
+            Recipe.from_dict(_base_recipe_dict(snapshot_day=-5))
+
+    def test_string_rejected(self) -> None:
+        with pytest.raises(InvalidRecipeError, match="snapshot_day"):
+            Recipe.from_dict(_base_recipe_dict(snapshot_day="30"))
+
+    def test_bool_rejected(self) -> None:
+        # bool is an int subclass; the validator must reject it explicitly.
+        with pytest.raises(InvalidRecipeError, match="snapshot_day"):
+            Recipe.from_dict(_base_recipe_dict(snapshot_day=True))
+
+    def test_explicit_null_treated_as_none(self) -> None:
+        recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=None))
+        assert recipe.snapshot_day is None
+
+
+# ---------------------------------------------------------------------------
+# resolve_config precedence
+# ---------------------------------------------------------------------------
+
+
+class TestResolveConfigSnapshotDayPrecedence:
+    def test_package_default_when_recipe_silent(self) -> None:
+        recipe = Recipe.from_dict(_base_recipe_dict())  # no snapshot_day
+        cfg = recipe.resolve_config()
+        assert cfg.snapshot_day is None  # package default
+
+    def test_recipe_default_overrides_package(self) -> None:
+        recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=20))
+        cfg = recipe.resolve_config()
+        assert cfg.snapshot_day == 20
+
+    def test_override_dict_beats_recipe(self) -> None:
+        recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=20))
+        cfg = recipe.resolve_config(override={"snapshot_day": 45})
+        assert cfg.snapshot_day == 45
+
+    def test_kwarg_beats_override_and_recipe(self) -> None:
+        recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=20))
+        cfg = recipe.resolve_config(override={"snapshot_day": 45}, snapshot_day=60)
+        assert cfg.snapshot_day == 60
+
+    def test_b2b_recipe_pins_thirty(self) -> None:
+        """Sanity check that the shipped recipe applies snapshot_day=30."""
+        from leadforge.recipes.registry import load_recipe
+
+        recipe = Recipe.from_dict(load_recipe("b2b_saas_procurement_v1"))
+        cfg = recipe.resolve_config()
+        assert cfg.snapshot_day == 30
+
+
+# ---------------------------------------------------------------------------
+# GenerationConfig validation
+# ---------------------------------------------------------------------------
+
+
+class TestGenerationConfigValidation:
+    def test_none_is_valid(self) -> None:
+        cfg = GenerationConfig(snapshot_day=None)
+        assert cfg.snapshot_day is None
+
+    def test_positive_int_is_valid(self) -> None:
+        cfg = GenerationConfig(snapshot_day=30)
+        assert cfg.snapshot_day == 30
+
+    def test_zero_rejected(self) -> None:
+        with pytest.raises(InvalidConfigError, match="snapshot_day"):
+            GenerationConfig(snapshot_day=0)
+
+    def test_negative_rejected(self) -> None:
+        with pytest.raises(InvalidConfigError, match="snapshot_day"):
+            GenerationConfig(snapshot_day=-1)
+
+    def test_string_rejected(self) -> None:
+        with pytest.raises(InvalidConfigError, match="snapshot_day"):
+            GenerationConfig(snapshot_day="30")  # type: ignore[arg-type]
+
+    def test_bool_rejected(self) -> None:
+        with pytest.raises(InvalidConfigError, match="snapshot_day"):
+            GenerationConfig(snapshot_day=True)  # type: ignore[arg-type]
+
+    def test_exceeds_horizon_rejected(self) -> None:
+        with pytest.raises(InvalidConfigError, match="horizon_days"):
+            GenerationConfig(horizon_days=20, label_window_days=20, snapshot_day=21)
+
+    def test_exceeds_label_window_rejected(self) -> None:
+        """A snapshot anchored after the label closes is nonsensical: the
+        feature window would extend past events the label was scored on.
+        Catch this at config time, not at the consumer end."""
+        with pytest.raises(InvalidConfigError, match="label_window_days"):
+            GenerationConfig(horizon_days=120, label_window_days=90, snapshot_day=100)
+
+    def test_equal_to_horizon_is_valid(self) -> None:
+        # Equal is the boundary case — this is "full-horizon" expressed
+        # explicitly rather than via None.  Allowed.
+        cfg = GenerationConfig(horizon_days=90, label_window_days=90, snapshot_day=90)
+        assert cfg.snapshot_day == 90
+
+
+# ---------------------------------------------------------------------------
+# End-to-end: kwarg → manifest field
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def custom_snapshot_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path:
+    """Bundle generated with snapshot_day overridden via the explicit kwarg."""
+    out = tmp_path_factory.mktemp("custom_snapshot")
+    Generator.from_recipe(
+        "b2b_saas_procurement_v1",
+        seed=42,
+        exposure_mode="student_public",
+        snapshot_day=20,
+    ).generate(**_SMALL).save(str(out))
+    return out
+
+
+class TestEndToEnd:
+    def test_kwarg_overrides_recipe_in_manifest(self, custom_snapshot_bundle: Path) -> None:
+        manifest = load_json(custom_snapshot_bundle / "manifest.json")
+        assert manifest["snapshot_day"] == 20  # not 30 (the recipe default)

From 650c3a7535e20ddda250bee96e5414e9ae7339b1 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Tue, 5 May 2026 08:14:45 +0300
Subject: [PATCH 4/4] fix: correct label-window phrasing in release docs
 (Copilot review)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous wording — "the label is then resolved over the next 60
days" — implied the label was scored over [day 30, day 90].  In fact
``converted_within_90_days`` is evaluated over the full 90 days from
lead creation; only the *features* are restricted to the first 30 days.
Same misleading phrasing existed in the auto-generated dataset_card.md
caveat and is corrected too, so consumers reading the in-bundle card
get the same accurate description.

Resolves Copilot review comment on release/README.md:74.
The matching review comment on leadforge/core/models.py:104 was
already addressed in f19d58e (snapshot_day <= label_window_days
validation block); resolved as already-treated.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 leadforge/narrative/dataset_card.py | 8 ++++----
 release/README.md                   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/leadforge/narrative/dataset_card.py b/leadforge/narrative/dataset_card.py
index 8f049ff..ea265cd 100644
--- a/leadforge/narrative/dataset_card.py
+++ b/leadforge/narrative/dataset_card.py
@@ -197,10 +197,10 @@ def render_dataset_card(
     # ------------------------------------------------------------------
     if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days:
         feature_window_caveat = (
-            f"- Event-aggregate features (e.g. `touch_count`, `session_count`, "
-            f"`expected_acv`) are computed over the first {cfg.snapshot_day} days "
-            f"after lead creation; the label resolves over the next "
-            f"{cfg.label_window_days - cfg.snapshot_day} days. The deliberate "
+            f"- The label is evaluated over the full {cfg.label_window_days}-day "
+            f"window from lead creation; event-aggregate features (e.g. "
+            f"`touch_count`, `session_count`, `expected_acv`) observe only the "
+            f"first {cfg.snapshot_day} days of that window. The deliberate "
             f"exception is `total_touches_all`, which counts touches over the "
             f"full {cfg.horizon_days}-day horizon as a pedagogical leakage trap."
         )
diff --git a/release/README.md b/release/README.md
index f7a2afe..cb85b61 100644
--- a/release/README.md
+++ b/release/README.md
@@ -71,7 +71,7 @@ train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parqu
 test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet")
 ```
 
-**Note:** Every event-aggregate feature (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) only counts events that occurred within the first **30 days** after the lead was created — that's the snapshot window. The label is then resolved over the next 60 days. So features cannot accidentally encode post-conversion events. The exception is `total_touches_all`, a deliberately included leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`) that counts touches over the full 90-day horizon. Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`.
+**Note:** The label `converted_within_90_days` is evaluated over the full **90 days** from lead creation. Event-aggregate features (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) observe **only the first 30 days** of that window — so even when a lead converts on day 50, the features are frozen at day 30 and cannot encode the conversion event. The deliberate exception is `total_touches_all`, a leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`) that counts touches over the full 90-day horizon. Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`.
 
 ### Option 3: Relational tables (feature engineering)