From 8e569cee0f3f299dba8745fa1c2755de3f495f0f Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 5 May 2026 07:34:53 +0300 Subject: [PATCH 1/4] feat: windowed snapshot for student_public bundles (closes #57) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `snapshot_day` as a recipe-level config knob and threads it through the public API. When set, event-aggregate features (touch_count, session_count, expected_acv, days_since_last_touch, ...) only see events in [lead_created_at, lead_created_at + snapshot_day], so they cannot encode post-conversion data. total_touches_all keeps full-horizon counts as the deliberate pedagogical leakage trap, so the gap between the two columns now carries real signal instead of being structurally zero. The b2b_saas_procurement_v1 recipe pins snapshot_day=30, picked from measurements at seed=42, n_leads=5000. Conversion rates are invariant to snapshot_day (the label is event-derived from label_window_days), so intro/intermediate/advanced rates stay at 0.422/0.210/0.079 — well inside the declared difficulty_profiles ranges. Day 30 keeps LR AUC in the 0.85-0.86 band (challenging but modelable) while preserving a trap gap of ~3.0 touches with 54-77% of leads showing any gap. BUNDLE_SCHEMA_VERSION bumped 3 → 4. The published column SET is unchanged, but column VALUES are no longer full-horizon, which v3 consumers cannot detect from schema alone. manifest.snapshot_day is recorded so the contract is self-describing (null = legacy full-horizon). The schema contract test is renamed to test_bundle_schema_v4_contract.py and gains a new assertion that manifest.snapshot_day == 30. A new test_windowed_bundle_trap.py guards the pedagogical invariant: total_touches_all >= touch_count for every lead and > for at least some. Co-Authored-By: Claude Opus 4.7 --- leadforge/api/bundle.py | 1 + leadforge/api/generator.py | 6 ++ leadforge/api/recipes.py | 21 +++++ leadforge/core/models.py | 14 +++ .../b2b_saas_procurement_v1/recipe.yaml | 6 ++ leadforge/render/manifests.py | 11 ++- ...t.py => test_bundle_schema_v4_contract.py} | 88 +++++++++++-------- tests/render/test_windowed_bundle_trap.py | 69 +++++++++++++++ 8 files changed, 180 insertions(+), 36 deletions(-) rename tests/render/{test_bundle_schema_v3_contract.py => test_bundle_schema_v4_contract.py} (60%) create mode 100644 tests/render/test_windowed_bundle_trap.py diff --git a/leadforge/api/bundle.py b/leadforge/api/bundle.py index 4f2d1bb..4065bc9 100644 --- a/leadforge/api/bundle.py +++ b/leadforge/api/bundle.py @@ -94,6 +94,7 @@ def write_bundle( result, population, horizon_days=config.horizon_days, + snapshot_day=config.snapshot_day, difficulty_params=config.difficulty_params, seed=config.seed, ) diff --git a/leadforge/api/generator.py b/leadforge/api/generator.py index f2da598..b849109 100644 --- a/leadforge/api/generator.py +++ b/leadforge/api/generator.py @@ -54,6 +54,7 @@ def from_recipe( horizon_days: int | None = None, primary_task: str | None = None, label_window_days: int | None = None, + snapshot_day: int | None = None, output_path: str = _MISSING, # type: ignore[assignment] override: dict[str, Any] | None = None, ) -> Generator: @@ -76,6 +77,10 @@ def from_recipe( directory name and manifest key. label_window_days: Override recipe default label observation window in days. + snapshot_day: Override recipe default snapshot day for windowed + feature aggregation. ``None`` means full-horizon (legacy) + aggregation; an integer ``N`` means features aggregate only + events with ``timestamp <= lead_created_at + N days``. output_path: Directory where the bundle will be saved. override: Optional dict of overrides (mirrors a ``--override`` file). Applied after recipe defaults but before explicit kwargs. @@ -105,6 +110,7 @@ def from_recipe( horizon_days=horizon_days, primary_task=primary_task, label_window_days=label_window_days, + snapshot_day=snapshot_day, output_path=output_path, override=override, ) diff --git a/leadforge/api/recipes.py b/leadforge/api/recipes.py index 8a39fb5..846918b 100644 --- a/leadforge/api/recipes.py +++ b/leadforge/api/recipes.py @@ -41,6 +41,7 @@ class Recipe: default_population: dict[str, int] horizon_days: int label_window_days: int | None = None + snapshot_day: int | None = None # ------------------------------------------------------------------ # # Construction @@ -105,6 +106,17 @@ def from_dict(cls, data: dict[str, Any]) -> Recipe: raise InvalidRecipeError(f"'label_window_days' must be positive, got {raw_lwd}") label_window_days = raw_lwd + snapshot_day: int | None = None + raw_sd = data.get("snapshot_day") + if raw_sd is not None: + if isinstance(raw_sd, bool) or not isinstance(raw_sd, int): + raise InvalidRecipeError( + f"'snapshot_day' must be a positive int, got {type(raw_sd).__name__!r}" + ) + if raw_sd <= 0: + raise InvalidRecipeError(f"'snapshot_day' must be positive, got {raw_sd}") + snapshot_day = raw_sd + return cls( id=data["id"], title=data["title"], @@ -116,6 +128,7 @@ def from_dict(cls, data: dict[str, Any]) -> Recipe: default_population=dict(pop), horizon_days=horizon_days, label_window_days=label_window_days, + snapshot_day=snapshot_day, ) # ------------------------------------------------------------------ # @@ -134,6 +147,7 @@ def resolve_config( horizon_days: int | None = None, primary_task: str | None = None, label_window_days: int | None = None, + snapshot_day: int | None = None, output_path: str = _MISSING, # type: ignore[assignment] override: dict[str, Any] | None = None, ) -> GenerationConfig: @@ -165,6 +179,7 @@ def resolve_config( "horizon_days": pkg["horizon_days"], "primary_task": pkg["primary_task"], "label_window_days": pkg["label_window_days"], + "snapshot_day": pkg["snapshot_day"], } # Layer 3 — recipe defaults @@ -176,6 +191,8 @@ def resolve_config( resolved["primary_task"] = self.primary_task if self.label_window_days is not None: resolved["label_window_days"] = self.label_window_days + if self.snapshot_day is not None: + resolved["snapshot_day"] = self.snapshot_day # Layer 2 — override dict (beats recipe/package defaults) if override: @@ -186,6 +203,7 @@ def resolve_config( "horizon_days", "primary_task", "label_window_days", + "snapshot_day", "seed", "output_path", "exposure_mode", @@ -216,6 +234,8 @@ def resolve_config( resolved["primary_task"] = primary_task if label_window_days is not None: resolved["label_window_days"] = label_window_days + if snapshot_day is not None: + resolved["snapshot_day"] = snapshot_day try: mode = ExposureMode(resolved["exposure_mode"]) @@ -254,6 +274,7 @@ def resolve_config( horizon_days=resolved["horizon_days"], primary_task=resolved["primary_task"], label_window_days=resolved["label_window_days"], + snapshot_day=resolved["snapshot_day"], output_path=resolved["output_path"], ) diff --git a/leadforge/core/models.py b/leadforge/core/models.py index f8eed75..db8b609 100644 --- a/leadforge/core/models.py +++ b/leadforge/core/models.py @@ -64,6 +64,7 @@ class GenerationConfig: horizon_days: int = 90 primary_task: str = "converted_within_90_days" label_window_days: int = 90 + snapshot_day: int | None = None output_path: str = "./out" package_version: str = field(default_factory=lambda: __version__) difficulty_params: DifficultyParams | None = None @@ -87,6 +88,19 @@ def __post_init__(self) -> None: f"label_window_days ({self.label_window_days}) must not exceed " f"horizon_days ({self.horizon_days})" ) + if self.snapshot_day is not None: + if isinstance(self.snapshot_day, bool) or not isinstance(self.snapshot_day, int): + raise InvalidConfigError( + f"snapshot_day must be a positive int or None, " + f"got {type(self.snapshot_day).__name__!r}" + ) + if self.snapshot_day <= 0: + raise InvalidConfigError(f"snapshot_day must be positive, got {self.snapshot_day}") + if self.snapshot_day > self.horizon_days: + raise InvalidConfigError( + f"snapshot_day ({self.snapshot_day}) must not exceed " + f"horizon_days ({self.horizon_days})" + ) # Coerce string enums supplied as plain strings if not isinstance(self.exposure_mode, ExposureMode): try: diff --git a/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml b/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml index 44b48ac..08843a3 100644 --- a/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml +++ b/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml @@ -18,3 +18,9 @@ default_population: n_contacts: 4200 n_leads: 5000 horizon_days: 90 +# Windowed snapshot: features aggregate only events within +# [lead_created_at, lead_created_at + snapshot_day]. Anchored before the +# 90-day label window closes so event-aggregate features (touch_count, +# session_count, expected_acv, etc.) cannot encode post-conversion data. +# total_touches_all retains the pedagogical leakage trap (full-horizon counts). +snapshot_day: 30 diff --git a/leadforge/render/manifests.py b/leadforge/render/manifests.py index 2c71312..27023c5 100644 --- a/leadforge/render/manifests.py +++ b/leadforge/render/manifests.py @@ -28,7 +28,15 @@ # feature list (zero-variance); ``is_sql`` redacted in # ``student_public`` mode (near-deterministic for non-conversion). # ``manifest.redacted_columns`` was already added in PR #56. -BUNDLE_SCHEMA_VERSION = "3" +# "4" — issue #57 sub-item 1: windowed snapshot. Event-aggregate +# features (touch_count, session_count, expected_acv, ...) now +# aggregate only events within ``[lead_created_at, lead_created_at +# + snapshot_day]``. Column SET unchanged from v3, but column +# VALUES are no longer full-horizon — consumers pinning v3 and +# assuming "features computed over full horizon" must update. +# ``manifest.snapshot_day`` recorded so the contract is +# self-describing (``null`` means full-horizon, legacy behaviour). +BUNDLE_SCHEMA_VERSION = "4" # Manifest fields whose value is non-deterministic by design (wall-clock, # host metadata, etc.). Determinism checks must ignore these fields when @@ -106,6 +114,7 @@ def build_manifest( "horizon_days": config.horizon_days, "primary_task": config.primary_task, "label_window_days": config.label_window_days, + "snapshot_day": config.snapshot_day, "motif_family": world_graph.motif_family, "redacted_columns": redacted_columns_list, "tables": tables, diff --git a/tests/render/test_bundle_schema_v3_contract.py b/tests/render/test_bundle_schema_v4_contract.py similarity index 60% rename from tests/render/test_bundle_schema_v3_contract.py rename to tests/render/test_bundle_schema_v4_contract.py index 2135bde..b839471 100644 --- a/tests/render/test_bundle_schema_v3_contract.py +++ b/tests/render/test_bundle_schema_v4_contract.py @@ -1,4 +1,4 @@ -"""Schema contract test for ``bundle_schema_version == "3"``. +"""Schema contract test for ``bundle_schema_version == "4"``. The constants below are an *intentional* duplication of the column sets the bundle writer produces. The duplication is the point: any change to @@ -8,6 +8,11 @@ here, forcing the author to either update the contract (and bump ``BUNDLE_SCHEMA_VERSION``) or revisit the change. +v4 vs v3: the column SET is unchanged. The bump records the semantic +shift introduced by the windowed snapshot — event-aggregate column +VALUES are now anchored at ``snapshot_day`` instead of the full horizon. +``manifest.snapshot_day`` makes the contract self-describing. + If you find yourself wondering "do I have to update this?": yes. That is the failure mode this test is designed to catch. """ @@ -22,10 +27,10 @@ from leadforge.api.generator import Generator -# Pinned column lists for bundle schema v3. Update *together* with +# Pinned column lists for bundle schema v4. Update *together* with # ``BUNDLE_SCHEMA_VERSION`` and ``LEAD_SNAPSHOT_FEATURES``. -V3_TASK_COLUMNS_STUDENT_PUBLIC: frozenset[str] = frozenset( +V4_TASK_COLUMNS_STUDENT_PUBLIC: frozenset[str] = frozenset( { "account_id", "industry", @@ -62,12 +67,12 @@ } ) -V3_TASK_COLUMNS_RESEARCH_INSTRUCTOR: frozenset[str] = V3_TASK_COLUMNS_STUDENT_PUBLIC | { +V4_TASK_COLUMNS_RESEARCH_INSTRUCTOR: frozenset[str] = V4_TASK_COLUMNS_STUDENT_PUBLIC | { "current_stage", "is_sql", } -V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC: frozenset[str] = frozenset( +V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC: frozenset[str] = frozenset( { "lead_id", "contact_id", @@ -82,7 +87,7 @@ } ) -V3_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR: frozenset[str] = V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC | { +V4_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR: frozenset[str] = V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC | { "current_stage", "is_sql", } @@ -97,14 +102,14 @@ def _build(mode: str, out: Path, seed: int = 42) -> None: @pytest.fixture(scope="module") def student_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path: - out = tmp_path_factory.mktemp("v3_student") + out = tmp_path_factory.mktemp("v4_student") _build("student_public", out) return out @pytest.fixture(scope="module") def instructor_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path: - out = tmp_path_factory.mktemp("v3_instructor") + out = tmp_path_factory.mktemp("v4_instructor") _build("research_instructor", out) return out @@ -117,52 +122,64 @@ def _leads_cols(bundle: Path) -> frozenset[str]: return frozenset(pq.read_schema(bundle / "tables/leads.parquet").names) -def test_manifest_declares_v3(student_bundle: Path, instructor_bundle: Path) -> None: +def test_manifest_declares_v4(student_bundle: Path, instructor_bundle: Path) -> None: for b in (student_bundle, instructor_bundle): manifest = json.loads((b / "manifest.json").read_text()) - assert manifest["bundle_schema_version"] == "3", ( + assert manifest["bundle_schema_version"] == "4", ( f"{b.name}: bundle_schema_version is {manifest['bundle_schema_version']!r}, " - "expected '3'" + "expected '4'" + ) + + +def test_manifest_records_snapshot_day(student_bundle: Path, instructor_bundle: Path) -> None: + """v4 contract: manifest must surface ``snapshot_day`` so consumers can + distinguish full-horizon (legacy) bundles from windowed bundles.""" + for b in (student_bundle, instructor_bundle): + manifest = json.loads((b / "manifest.json").read_text()) + assert "snapshot_day" in manifest, f"{b.name}: manifest is missing 'snapshot_day' field" + # The b2b_saas_procurement_v1 recipe pins snapshot_day=30. + assert manifest["snapshot_day"] == 30, ( + f"{b.name}: snapshot_day is {manifest['snapshot_day']!r}, expected 30" ) -def test_student_public_task_columns_match_v3_contract(student_bundle: Path) -> None: +def test_student_public_task_columns_match_v4_contract(student_bundle: Path) -> None: actual = _task_cols(student_bundle) - assert actual == V3_TASK_COLUMNS_STUDENT_PUBLIC, ( - f"student_public task split columns drifted from v3 contract.\n" - f" unexpected: {sorted(actual - V3_TASK_COLUMNS_STUDENT_PUBLIC)}\n" - f" missing: {sorted(V3_TASK_COLUMNS_STUDENT_PUBLIC - actual)}\n" - " → either update tests/render/test_bundle_schema_v3_contract.py and " + assert actual == V4_TASK_COLUMNS_STUDENT_PUBLIC, ( + f"student_public task split columns drifted from v4 contract.\n" + f" unexpected: {sorted(actual - V4_TASK_COLUMNS_STUDENT_PUBLIC)}\n" + f" missing: {sorted(V4_TASK_COLUMNS_STUDENT_PUBLIC - actual)}\n" + " → either update tests/render/test_bundle_schema_v4_contract.py and " "bump BUNDLE_SCHEMA_VERSION, or revert the schema change." ) -def test_research_instructor_task_columns_match_v3_contract(instructor_bundle: Path) -> None: +def test_research_instructor_task_columns_match_v4_contract(instructor_bundle: Path) -> None: actual = _task_cols(instructor_bundle) - assert actual == V3_TASK_COLUMNS_RESEARCH_INSTRUCTOR, ( - f"research_instructor task split columns drifted from v3 contract.\n" - f" unexpected: {sorted(actual - V3_TASK_COLUMNS_RESEARCH_INSTRUCTOR)}\n" - f" missing: {sorted(V3_TASK_COLUMNS_RESEARCH_INSTRUCTOR - actual)}" + assert actual == V4_TASK_COLUMNS_RESEARCH_INSTRUCTOR, ( + f"research_instructor task split columns drifted from v4 contract.\n" + f" unexpected: {sorted(actual - V4_TASK_COLUMNS_RESEARCH_INSTRUCTOR)}\n" + f" missing: {sorted(V4_TASK_COLUMNS_RESEARCH_INSTRUCTOR - actual)}" ) -def test_student_public_leads_table_columns_match_v3_contract(student_bundle: Path) -> None: +def test_student_public_leads_table_columns_match_v4_contract(student_bundle: Path) -> None: actual = _leads_cols(student_bundle) - assert actual == V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC, ( - f"student_public tables/leads.parquet columns drifted from v3 contract.\n" - f" unexpected: {sorted(actual - V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC)}\n" - f" missing: {sorted(V3_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC - actual)}" + assert actual == V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC, ( + f"student_public tables/leads.parquet columns drifted from v4 contract.\n" + f" unexpected: {sorted(actual - V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC)}\n" + f" missing: {sorted(V4_LEAD_TABLE_COLUMNS_STUDENT_PUBLIC - actual)}" ) -def test_research_instructor_leads_table_columns_match_v3_contract( +def test_research_instructor_leads_table_columns_match_v4_contract( instructor_bundle: Path, ) -> None: actual = _leads_cols(instructor_bundle) - assert actual == V3_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR, ( - f"research_instructor tables/leads.parquet columns drifted from v3 contract.\n" - f" unexpected: {sorted(actual - V3_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR)}\n" - f" missing: {sorted(V3_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR - actual)}" + assert actual == V4_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR, ( + f"research_instructor tables/leads.parquet columns drifted from v4 contract.\n" + f" unexpected: {sorted(actual - V4_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR)}\n" + f" missing: {sorted(V4_LEAD_TABLE_COLUMNS_RESEARCH_INSTRUCTOR - actual)}" ) @@ -176,9 +193,10 @@ def test_research_instructor_redacted_set_in_manifest(instructor_bundle: Path) - assert manifest["redacted_columns"] == [] -def test_is_mql_absent_from_all_v3_artifacts(student_bundle: Path, instructor_bundle: Path) -> None: - """``is_mql`` was removed entirely in v3 (issue #57): not in the snapshot, - not in the relational table, not in the feature dictionary, not anywhere.""" +def test_is_mql_absent_from_all_v4_artifacts(student_bundle: Path, instructor_bundle: Path) -> None: + """``is_mql`` was removed entirely in v3 (issue #57) and remains absent in v4: + not in the snapshot, not in the relational table, not in the feature + dictionary, not anywhere.""" for b in (student_bundle, instructor_bundle): assert "is_mql" not in _task_cols(b) assert "is_mql" not in _leads_cols(b) diff --git a/tests/render/test_windowed_bundle_trap.py b/tests/render/test_windowed_bundle_trap.py new file mode 100644 index 0000000..c51b59a --- /dev/null +++ b/tests/render/test_windowed_bundle_trap.py @@ -0,0 +1,69 @@ +"""End-to-end test that the leakage trap remains pedagogically meaningful +once the bundle uses a windowed snapshot. + +The pedagogical contract is that ``total_touches_all`` (full-horizon counts) +diverges from ``touch_count`` (windowed counts) for at least some leads. +If a careless refactor accidentally widened ``touch_count`` back to the full +horizon, the trap would silently collapse — students could no longer see +the gap that the v4 dataset card promises. +""" + +from __future__ import annotations + +from pathlib import Path + +import pandas as pd +import pytest + +from leadforge.api.generator import Generator + + +@pytest.fixture(scope="module") +def windowed_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path: + """A small student_public bundle generated with the recipe default + ``snapshot_day`` (currently 30).""" + out = tmp_path_factory.mktemp("windowed_trap") + gen = Generator.from_recipe( + "b2b_saas_procurement_v1", + seed=42, + exposure_mode="student_public", + ) + gen.generate(n_leads=200, n_accounts=80, n_contacts=200).save(str(out)) + return out + + +def test_windowed_bundle_uses_recipe_snapshot_day(windowed_bundle: Path) -> None: + import json + + manifest = json.loads((windowed_bundle / "manifest.json").read_text()) + assert manifest["snapshot_day"] == 30 + + +def test_total_touches_all_dominates_touch_count(windowed_bundle: Path) -> None: + """Invariant: ``total_touches_all >= touch_count`` for every lead.""" + train = pd.read_parquet( + windowed_bundle / "tasks" / "converted_within_90_days" / "train.parquet", + columns=["touch_count", "total_touches_all"], + ) + diff = train["total_touches_all"].astype("Float64") - train["touch_count"].astype("Float64") + assert (diff >= 0).all(), ( + "total_touches_all < touch_count for some leads — the windowed feature " + "exceeded the full-horizon count, which contradicts the trap contract." + ) + + +def test_trap_gap_is_meaningful(windowed_bundle: Path) -> None: + """At ``snapshot_day < horizon_days``, the gap must be non-trivial: some + leads must have ``total_touches_all > touch_count``. If not, the trap is + pedagogically dead and the windowed snapshot wiring is broken.""" + train = pd.read_parquet( + windowed_bundle / "tasks" / "converted_within_90_days" / "train.parquet", + columns=["touch_count", "total_touches_all"], + ) + diff = train["total_touches_all"].astype("Float64") - train["touch_count"].astype("Float64") + n_with_gap = int((diff > 0).sum()) + assert n_with_gap > 0, ( + "No lead has total_touches_all > touch_count — the trap gap collapsed.\n" + "Either snapshot_day was set to horizon_days (bundle is no longer windowed) " + "or the build_snapshot() windowing logic regressed." + ) From 4d8185e741bfbde1ec74221a6cfd13a90a192218 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 5 May 2026 07:44:40 +0300 Subject: [PATCH 2/4] docs: refresh release docs and changelog for windowed snapshot (v4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - release/README.md: refreshed leakage handling section with the v4 schema bump, the snapshot/redaction pair, and the new pedagogical framing of total_touches_all. Removed the open-caveat block — the underlying caveat (event aggregates over the label window) is fixed. - release/HF_DATASET_CARD.md: matched the README updates and bumped the schema-version reference. - CHANGELOG.md: documented the schema v4 entry (windowed snapshot, manifest.snapshot_day, contract test, trap invariant guard) and dropped the "open follow-up" line that previously flagged sub-item 1 of #57. - .agent-plan.md: marked all three sub-items of #57 resolved; added the windowed snapshot bullet to the Phase 5 release checklist. Co-Authored-By: Claude Opus 4.7 --- .agent-plan.md | 5 ++-- CHANGELOG.md | 47 +++++++++++++++++++++++++++++++------- release/HF_DATASET_CARD.md | 13 ++++++----- release/README.md | 19 ++++++++------- 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index b11f6ff..ed2a914 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -46,6 +46,7 @@ First public dataset release: `leadforge-b2b-lead-scoring`. Three difficulty tie - [x] Update release/HF_DATASET_CARD.md — add conversion rates to summary table - [x] Verify SHA-256 hash determinism (re-run build, compare hashes) — `scripts/verify_hash_determinism.py`; 73/73 files identical across two `build_public_release.py` runs (modulo `manifest.json`'s wall-clock `generation_timestamp`) - [x] Fix `current_stage` leakage in student_public bundles via exposure-layer redaction — `is_leakage_trap` flag distinguishes the pedagogical trap (`total_touches_all`) from true label leaks; `BundleFilter.redacted_columns` strips the latter; `validate_bundle()` enforces the invariant. 73/73 hash-determinism preserved. +- [x] Windowed snapshot for student_public bundles — `snapshot_day=30` pinned in recipe; event-aggregate features no longer share the 90-day label window; `BUNDLE_SCHEMA_VERSION` bumped to 4; 73/73 hash-determinism preserved; conversion rates unchanged (41.5% / 20.1% / 7.9%). - [ ] Upload to Kaggle and HuggingFace - [ ] Announce @@ -66,11 +67,11 @@ First public dataset release: `leadforge-b2b-lead-scoring`. Three difficulty tie Deterministic leak fixed via exposure-layer redaction. `FeatureSpec` now carries an explicit `redact_in_modes: frozenset[ExposureMode]` field — *prescriptive* — alongside the descriptive `leakage_risk` flag. `current_stage` is marked `redact_in_modes={ExposureMode.student_public}`; the writer queries `redacted_columns_for(mode)` and strips matching columns from the snapshot, task splits, and feature dictionary before they hit disk. The pedagogical trap `total_touches_all` is preserved in all modes (no entry in `redact_in_modes`). The manifest records `redacted_columns: [...]` so the bundle is self-describing. `validate_bundle()` cross-checks parquet schemas, feature dictionary, and the manifest's declared redaction set against `redacted_columns_for(mode)` derived independently from the feature spec. Hash-determinism preserved (73/73 identical across builds). -### Follow-up: structural leakage in `student_public` bundles (issue #57) +### Follow-up: structural leakage in `student_public` bundles (issue #57) — fully resolved Tracked in [GitHub issue #57](https://github.com/leadforge-dev/leadforge/issues/57). -1. **Event-aggregate features are computed over the label window.** `touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, etc. all aggregate events in `[lead_created_at, lead_created_at + 90d]`, the same window over which the label resolves. The structural fix is a windowed snapshot (`snapshot_day=N` with `N < label_window_days`), as v6/v7 datasets already do at day 14/20. **Open** — its own PR with documentation recalibration; will likely bump `BUNDLE_SCHEMA_VERSION` again. +1. ~~**Event-aggregate features are computed over the label window.** `touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, etc. all aggregate events in `[lead_created_at, lead_created_at + 90d]`, the same window over which the label resolves.~~ **Resolved** — windowed snapshot at `snapshot_day=30` (recipe default). `BUNDLE_SCHEMA_VERSION` bumped 3 → 4; `manifest.snapshot_day` records the contract. Conversion rates invariant (label is event-derived from `label_window_days`); trap gap (`total_touches_all − touch_count`) ~3 touches with 54–77% of leads showing divergence. Guarded by `test_bundle_schema_v4_contract.py` and `test_windowed_bundle_trap.py`. 2. ~~**`is_sql=False` is near-deterministic for non-conversion.** Measured on the regenerated bundle: P(converted | is_sql=False) = 0.038 (intro), 0.015 (intermediate), 0.006 (advanced).~~ **Resolved** — `is_sql` redacted in `student_public` mode by post-#57 PR (bundle schema v3). 3. ~~**`is_mql` is a constant `True`.** Zero variance feature in all three tiers.~~ **Resolved** — `is_mql` removed from the canonical feature list by post-#57 PR (bundle schema v3). Guarded by a new `test_no_zero_variance_features` check. diff --git a/CHANGELOG.md b/CHANGELOG.md index 4093008..1ddc4cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,45 @@ Format inspired by [Keep a Changelog](https://keepachangelog.com/). ## Unreleased +### Bundle schema v4 + +`bundle_schema_version` bumped from `"3"` to `"4"`. Closes the final +sub-item of issue #57: event-aggregate features are no longer computed +over the same 90-day window the label resolves in. + +- **Windowed snapshot.** `GenerationConfig.snapshot_day` (also exposed + as a recipe-level field and an explicit kwarg on + `Generator.from_recipe()`) now controls the feature aggregation + window. When set, `build_snapshot()` filters touches, sessions, + sales activities, and opportunities to events with timestamp + ≤ `lead_created_at + snapshot_day`. The + `b2b_saas_procurement_v1` recipe pins `snapshot_day: 30` — + measurements at seed 42, n_leads=5000 across all three difficulty + tiers showed day 30 keeps LR AUC in [0.85, 0.86] (challenging but + modelable) while preserving a meaningful trap gap of ~3 touches + with 54–77% of leads showing any divergence between + `total_touches_all` (full-horizon) and `touch_count` (windowed). +- **Conversion rates unchanged.** The label is event-derived from + `label_window_days` in the simulator and is independent of + `snapshot_day`, so the published rates stay at 41.5% / 20.1% / 7.9% + (intro / intermediate / advanced) — well inside the declared + `difficulty_profiles.yaml` ranges. +- **`manifest.snapshot_day` recorded.** The published bundle + declares its windowing contract; consumers can distinguish + full-horizon (legacy v2/v3) bundles from windowed (v4) bundles + without inspecting package internals. Column SET is unchanged + from v3, but column VALUES are no longer full-horizon — a contract + shift that v3 consumers would not detect from schema alone. +- **Schema contract test.** `tests/render/test_bundle_schema_v3_contract.py` + renamed to `test_bundle_schema_v4_contract.py` and gains a + `snapshot_day == 30` assertion alongside the existing column-set + pinning. +- **Trap invariant guard.** New `tests/render/test_windowed_bundle_trap.py` + asserts `total_touches_all >= touch_count` for every lead and + `>` for at least some — guarding against a future refactor that + silently widens `touch_count` back to the full horizon and + collapses the pedagogical gap. + ### Bundle schema v3 `bundle_schema_version` bumped from `"2"` to `"3"`. Three structural @@ -53,14 +92,6 @@ changes follow up on PR #56 (issue #57): columns (down from 35); **11** columns in `tables/leads.parquet` (down from 12 — `is_mql` removed). -### Open follow-up - -Issue #57 sub-item 1 remains open: event-aggregate features -(`touch_count`, `session_count`, `pricing_page_views`, ...) are still -computed over the same 90-day window the label resolves in. The -structural fix is a windowed snapshot rebuild and is deferred to its -own PR. - --- ## v1.0.0 — 2026-05-02 diff --git a/release/HF_DATASET_CARD.md b/release/HF_DATASET_CARD.md index c01136f..b70dcc9 100644 --- a/release/HF_DATASET_CARD.md +++ b/release/HF_DATASET_CARD.md @@ -51,7 +51,7 @@ A relational, reproducible, multi-difficulty lead scoring dataset generated by [ 1. **Relational structure.** 9 normalized tables plus ML-ready task splits. Practice feature engineering from raw tables, or grab the flat file and start modeling. 2. **Three difficulty tiers.** Same world, different conversion rates, signal-to-noise ratios, and missingness. -3. **Reproducible and leakage-safe.** Deterministic generation (seed 42), SHA-256 hashes. The label-encoding `current_stage` column is stripped from public bundles via the exposure layer; the `redacted_columns` field in `manifest.json` records what was removed. The deliberately included `total_touches_all` leakage trap is retained as a teaching feature. +3. **Reproducible and leakage-safe.** Deterministic generation (seed 42), SHA-256 hashes. Event-aggregate features are computed over a 30-day window (`manifest.snapshot_day == 30`) while the label resolves over 90 days, so features cannot encode post-conversion events. The label-encoding `current_stage` column is stripped from public bundles via the exposure layer (`redacted_columns` in `manifest.json` lists what was removed). The deliberately included `total_touches_all` leakage trap — counting touches over the full 90-day horizon — is retained as a teaching feature. ## Quick start @@ -92,13 +92,14 @@ df = pd.read_csv("hf://datasets/leadforge/leadforge-b2b-lead-scoring/intermediat Each difficulty tier includes 9 Parquet tables under `tables/`: accounts, contacts, leads, touches, sessions, sales_activities, opportunities, customers, subscriptions. These form a normalized CRM schema linked by foreign keys. -## Leakage handling (bundle schema v3) +## Leakage handling (bundle schema v4) -- **Stripped from public bundles:** `current_stage` (label-encoding at the 90-day horizon) and `is_sql` (P(conv | is_sql=False) ≈ 0.04 / 0.015 / 0.006 across tiers — near-deterministic for non-conversion). Both available in `intermediate_instructor/`. The `manifest.json` field `redacted_columns` lists what was stripped. -- **Removed entirely:** `is_mql` (constant `True` in the simulator — zero variance, no information). -- **Deliberately retained as a pedagogical trap:** `total_touches_all` counts touches over the full 90-day window including post-snapshot events. Flagged `leakage_risk=True` in `feature_dictionary.csv`. Use it as an exercise — train with and without, compare AUC, explain the gap. +Two complementary mechanisms keep the published feature set leakage-safe: -**Caveat:** event-aggregate features (`touch_count`, `session_count`, ...) are computed over the same 90-day window the label resolves in, so they correlate with post-conversion events. A windowed-snapshot rebuild is the structural fix — see [issue #57](https://github.com/leadforge-dev/leadforge/issues/57). +- **Windowed snapshot.** Every event-aggregate feature (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) is computed over the first 30 days only (`manifest.snapshot_day == 30`). The label resolves over the full 90 days (`manifest.label_window_days == 90`). So features cannot accidentally encode events that occur after a lead converts. +- **Column redaction.** `current_stage` (label-encoding at the 90-day horizon) and `is_sql` (P(conv | is_sql=False) ≈ 0.04 / 0.015 / 0.006 across tiers — near-deterministic for non-conversion) are stripped from `student_public` bundles entirely, including from `tables/leads.parquet`. Both are still available in `intermediate_instructor/`. `manifest.redacted_columns` lists exactly what was removed. +- **Removed entirely:** `is_mql` (constant `True` in the simulator — zero variance, no information). +- **Deliberately retained as a pedagogical trap:** `total_touches_all` counts touches over the full 90-day horizon — the only feature that crosses the snapshot window. Flagged `leakage_risk=True` in `feature_dictionary.csv`. Use it as an exercise: train with and without, compare AUC, explain the gap. ## Research companion diff --git a/release/README.md b/release/README.md index e174f49..f7a2afe 100644 --- a/release/README.md +++ b/release/README.md @@ -10,7 +10,7 @@ Most public lead scoring datasets are flat CSVs with opaque provenance. This one 2. **Three difficulty tiers.** Same company, same product, same buyer personas -- different difficulty profiles that produce meaningfully different conversion rates, noise levels, and missingness. -3. **Reproducible and leakage-safe.** Deterministic generation from a fixed seed. SHA-256 hashes for every file in `manifest.json`. The label-encoding `current_stage` column is stripped from the public bundles in the exposure layer; the only leakage-flagged column that ships in `student_public` is the deliberately included pedagogical trap `total_touches_all`, marked `is_leakage_trap=True` in the feature dictionary. All features are anchored at the snapshot date -- no post-cutoff data leaks in by accident. +3. **Reproducible and leakage-safe.** Deterministic generation from a fixed seed. SHA-256 hashes for every file in `manifest.json`. The label-encoding `current_stage` column is stripped from the public bundles in the exposure layer. Event-aggregate features (`touch_count`, `session_count`, `pricing_page_views`, ...) are computed over a 30-day window — they cannot encode events that happen *after* day 30, even though the label resolves over a 90-day window. The only leakage-flagged column that ships in `student_public` is the deliberately included pedagogical trap `total_touches_all`, which counts the full 90-day touch history and is marked `is_leakage_trap=True` in the feature dictionary. ## What's inside @@ -71,7 +71,7 @@ train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parqu test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet") ``` -**Note:** The student-facing Parquet files contain `total_touches_all`, a deliberately included leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`). Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`. +**Note:** Every event-aggregate feature (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) only counts events that occurred within the first **30 days** after the lead was created — that's the snapshot window. The label is then resolved over the next 60 days. So features cannot accidentally encode post-conversion events. The exception is `total_touches_all`, a deliberately included leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`) that counts touches over the full 90-day horizon. Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`. ### Option 3: Relational tables (feature engineering) @@ -128,22 +128,21 @@ The sales funnel runs through inbound marketing (45%), SDR outbound (35%), and p Each bundle contains a `dataset_card.md` and a `feature_dictionary.csv` with the authoritative, auto-generated column list, descriptions, dtypes, and `leakage_risk` flags. Refer to those rather than mirroring counts here, which would drift. -**Leakage handling (bundle schema v3)** +**Leakage handling (bundle schema v4)** -Redaction in `student_public` bundles applies uniformly to **both** the task splits *and* the relational tables. Joining `tables/leads.parquet` back to your features cannot recover a redacted column. +Two separate mechanisms keep the published feature set leakage-safe: + +1. **Windowed snapshot.** Every event-aggregate feature is computed over a 30-day window (`manifest.snapshot_day == 30`); the label resolves over the full 90 days (`manifest.label_window_days == 90`). Features cannot see touches, sessions, or opportunities that occurred after day 30. The only feature that intentionally crosses this line is `total_touches_all`, the pedagogical trap. +2. **Column redaction.** A small set of columns that *would* encode the label structurally (`current_stage`, `is_sql`) are stripped from `student_public` bundles entirely — both from `tasks/` splits and from `tables/leads.parquet`, so feature engineering off the relational tables cannot recover them. | Column | Status in `student_public` | Status in `intermediate_instructor` | Why | |---|---|---|---| | `current_stage` | redacted (gone from task splits and `tables/leads.parquet`) | retained | At day 90 this contains terminal stages (`closed_won`/`closed_lost`) that encode the label directly. | | `is_sql` | redacted | retained | `is_sql=False` predicts non-conversion with very high probability — measured across 5 seeds, P(conv \| is_sql=False) = 0.061 ± 0.026 (intro) / 0.020 ± 0.010 (intermediate) / 0.011 ± 0.004 (advanced). | | `is_mql` | removed entirely (no mode has it) | removed entirely | Every lead is initialised at MQL stage in the simulator, so the field was constant `True` and carried no information. | -| `total_touches_all` | retained | retained | Deliberate pedagogical leakage trap (counts touches over the full 90-day window). Flagged `leakage_risk=True` in `feature_dictionary.csv`. Train with and without it, compare AUC, explain the gap. | - -The `redacted_columns` field in each bundle's `manifest.json` records exactly what was stripped. - -**Known caveats** (tracked in [issue #57](https://github.com/leadforge-dev/leadforge/issues/57)): +| `total_touches_all` | retained | retained | Deliberate pedagogical leakage trap. Counts touches over the full 90-day horizon while every other touch feature stops at day 30, so the gap (`total_touches_all - touch_count`) carries real signal. Flagged `leakage_risk=True` in `feature_dictionary.csv`. Train with and without it, compare AUC, explain the gap. | -- All event-aggregate features (`touch_count`, `session_count`, `pricing_page_views`, ...) are computed over the same 90-day window in which the label resolves. They correlate with post-conversion events. The structural fix is a windowed snapshot rebuild — open follow-up. +The `redacted_columns` and `snapshot_day` fields in each bundle's `manifest.json` record exactly what was stripped and at what window features were computed. ## Research companion From f19d58e0897ad375aaac5173a4bd955006e6b0e4 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 5 May 2026 08:03:23 +0300 Subject: [PATCH 3/4] fix: tighten snapshot_day contract per self-review of #59 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review of the windowed-snapshot PR surfaced seven gaps; this commit addresses all of them. - Add tests/test_snapshot_day_threading.py — 22 tests covering Recipe.from_dict parsing (positive int, absent → None, rejection of zero/negative/string/bool), resolve_config precedence across all four layers (package default, recipe, override dict, kwarg), and GenerationConfig validation paths. Models the existing test_primary_task_threading.py. - Validate snapshot_day <= label_window_days in GenerationConfig.__post_init__. A snapshot anchored after the label closes would let features observe events past the label-scoring window — the exact structural leakage v4 is here to prevent. - Strengthen test_windowed_bundle_trap.py: replace the bare ``> 0`` check with ``frac_with_gap >= 0.20``. Measurements at the recipe default show 54-77% of leads carry a gap; a regression that collapsed the trap to a single lead would no longer pass silently. - Pin snapshot_day, primary_task, label_window_days, redacted_columns in the required-keys set in tests/render/test_render.py. These were already in the manifest but not formally pinned anywhere outside the v4 contract test. - Surface snapshot_day in the bundle-internal dataset_card.md: add ``Label window`` and ``Feature snapshot window`` rows to the header table, and replace the generic "Features are anchored at the snapshot date" caveat with concrete prose naming the day-30 feature window, the day-90 label window, and the total_touches_all trap. Consumers reading the card now see the v4 contract without needing the changelog. - Reconcile snapshot_day validation messages: both layers now say "must be a positive int or None/null". - Trim the recipe.yaml comment to a one-liner pointing at CHANGELOG / release docs for the rationale. 937 tests pass (was 915; +22 from the new threading file). ruff + mypy clean; verify_hash_determinism.py PASS 73/73. Co-Authored-By: Claude Opus 4.7 --- leadforge/api/recipes.py | 6 +- leadforge/core/models.py | 15 +- leadforge/narrative/dataset_card.py | 24 ++- .../b2b_saas_procurement_v1/recipe.yaml | 7 +- tests/render/test_render.py | 4 + tests/render/test_windowed_bundle_trap.py | 23 ++- tests/test_snapshot_day_threading.py | 184 ++++++++++++++++++ 7 files changed, 246 insertions(+), 17 deletions(-) create mode 100644 tests/test_snapshot_day_threading.py diff --git a/leadforge/api/recipes.py b/leadforge/api/recipes.py index 846918b..17d11d7 100644 --- a/leadforge/api/recipes.py +++ b/leadforge/api/recipes.py @@ -111,10 +111,12 @@ def from_dict(cls, data: dict[str, Any]) -> Recipe: if raw_sd is not None: if isinstance(raw_sd, bool) or not isinstance(raw_sd, int): raise InvalidRecipeError( - f"'snapshot_day' must be a positive int, got {type(raw_sd).__name__!r}" + f"'snapshot_day' must be a positive int or null, got {type(raw_sd).__name__!r}" ) if raw_sd <= 0: - raise InvalidRecipeError(f"'snapshot_day' must be positive, got {raw_sd}") + raise InvalidRecipeError( + f"'snapshot_day' must be a positive int or null, got {raw_sd}" + ) snapshot_day = raw_sd return cls( diff --git a/leadforge/core/models.py b/leadforge/core/models.py index db8b609..b78b9ac 100644 --- a/leadforge/core/models.py +++ b/leadforge/core/models.py @@ -95,12 +95,25 @@ def __post_init__(self) -> None: f"got {type(self.snapshot_day).__name__!r}" ) if self.snapshot_day <= 0: - raise InvalidConfigError(f"snapshot_day must be positive, got {self.snapshot_day}") + raise InvalidConfigError( + f"snapshot_day must be a positive int or None, got {self.snapshot_day}" + ) if self.snapshot_day > self.horizon_days: raise InvalidConfigError( f"snapshot_day ({self.snapshot_day}) must not exceed " f"horizon_days ({self.horizon_days})" ) + # A snapshot anchored after the label closes would let features + # observe events that occur beyond the label-scoring window — + # exactly the structural leakage the windowed snapshot is here + # to prevent. Reject at config time. + if self.snapshot_day > self.label_window_days: + raise InvalidConfigError( + f"snapshot_day ({self.snapshot_day}) must not exceed " + f"label_window_days ({self.label_window_days}); a snapshot " + f"anchored after the label closes would re-introduce " + f"structural leakage." + ) # Coerce string enums supplied as plain strings if not isinstance(self.exposure_mode, ExposureMode): try: diff --git a/leadforge/narrative/dataset_card.py b/leadforge/narrative/dataset_card.py index 01e687d..8f049ff 100644 --- a/leadforge/narrative/dataset_card.py +++ b/leadforge/narrative/dataset_card.py @@ -54,6 +54,11 @@ def render_dataset_card( # ------------------------------------------------------------------ # Header # ------------------------------------------------------------------ + snapshot_label = ( + f"{cfg.snapshot_day} days (windowed)" + if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days + else f"{cfg.horizon_days} days (full horizon)" + ) lines += [ "# leadforge dataset card", "", @@ -65,6 +70,8 @@ def render_dataset_card( f"| Exposure mode | `{cfg.exposure_mode}` |", f"| Difficulty | `{cfg.difficulty}` |", f"| Horizon | {cfg.horizon_days} days |", + f"| Label window | {cfg.label_window_days} days |", + f"| Feature snapshot window | {snapshot_label} |", "", ] @@ -188,14 +195,27 @@ def render_dataset_card( # ------------------------------------------------------------------ # Caveats # ------------------------------------------------------------------ + if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days: + feature_window_caveat = ( + f"- Event-aggregate features (e.g. `touch_count`, `session_count`, " + f"`expected_acv`) are computed over the first {cfg.snapshot_day} days " + f"after lead creation; the label resolves over the next " + f"{cfg.label_window_days - cfg.snapshot_day} days. The deliberate " + f"exception is `total_touches_all`, which counts touches over the " + f"full {cfg.horizon_days}-day horizon as a pedagogical leakage trap." + ) + else: + feature_window_caveat = ( + "- Features are anchored at the snapshot date. No post-anchor data is " + "included (leakage-free by construction)." + ) lines += [ "## Caveats", "", "- This is **synthetic** data. It does not represent any real company, product, or market.", "- The hidden world structure varies by motif family and stochastic rewiring; " "no two seeds produce the same DGP.", - "- Features are anchored at the snapshot date. No post-anchor data is " - "included (leakage-free by construction).", + feature_window_caveat, "- In `student_public` mode, the latent world graph, mechanism summary, " "and full world spec are withheld.", "", diff --git a/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml b/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml index 08843a3..3fc36d6 100644 --- a/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml +++ b/leadforge/recipes/b2b_saas_procurement_v1/recipe.yaml @@ -18,9 +18,6 @@ default_population: n_contacts: 4200 n_leads: 5000 horizon_days: 90 -# Windowed snapshot: features aggregate only events within -# [lead_created_at, lead_created_at + snapshot_day]. Anchored before the -# 90-day label window closes so event-aggregate features (touch_count, -# session_count, expected_acv, etc.) cannot encode post-conversion data. -# total_touches_all retains the pedagogical leakage trap (full-horizon counts). +# Feature aggregation window in days; see CHANGELOG (bundle schema v4) and +# release/README.md for the rationale. Must satisfy snapshot_day <= label_window_days. snapshot_day: 30 diff --git a/tests/render/test_render.py b/tests/render/test_render.py index fa4da40..aad4db1 100644 --- a/tests/render/test_render.py +++ b/tests/render/test_render.py @@ -375,7 +375,11 @@ def test_required_top_level_keys(self, sim_outputs, tmp_path): "n_contacts", "n_leads", "horizon_days", + "primary_task", + "label_window_days", + "snapshot_day", "motif_family", + "redacted_columns", "tables", "tasks", } diff --git a/tests/render/test_windowed_bundle_trap.py b/tests/render/test_windowed_bundle_trap.py index c51b59a..89c805a 100644 --- a/tests/render/test_windowed_bundle_trap.py +++ b/tests/render/test_windowed_bundle_trap.py @@ -53,17 +53,26 @@ def test_total_touches_all_dominates_touch_count(windowed_bundle: Path) -> None: def test_trap_gap_is_meaningful(windowed_bundle: Path) -> None: - """At ``snapshot_day < horizon_days``, the gap must be non-trivial: some - leads must have ``total_touches_all > touch_count``. If not, the trap is - pedagogically dead and the windowed snapshot wiring is broken.""" + """At ``snapshot_day < horizon_days``, the gap must be non-trivial. + + A bare ``> 0`` check would pass even if the trap collapsed to a single + lead. Measurements at the recipe default (``snapshot_day=30``) show + 54-77% of leads carry a gap across all three difficulty tiers; the + intermediate bundle this test uses sits in that window, so a + ``>= 20%`` floor is well below the natural rate yet high enough that + a regression collapsing the trap would trip it. + """ train = pd.read_parquet( windowed_bundle / "tasks" / "converted_within_90_days" / "train.parquet", columns=["touch_count", "total_touches_all"], ) diff = train["total_touches_all"].astype("Float64") - train["touch_count"].astype("Float64") n_with_gap = int((diff > 0).sum()) - assert n_with_gap > 0, ( - "No lead has total_touches_all > touch_count — the trap gap collapsed.\n" - "Either snapshot_day was set to horizon_days (bundle is no longer windowed) " - "or the build_snapshot() windowing logic regressed." + frac_with_gap = n_with_gap / len(train) + + assert frac_with_gap >= 0.20, ( + f"Only {frac_with_gap:.1%} of leads carry a gap between total_touches_all " + f"and touch_count (n={n_with_gap}/{len(train)}). Expected >=20%.\n" + "Either snapshot_day was set close to horizon_days (bundle is barely " + "windowed) or the build_snapshot() windowing logic regressed." ) diff --git a/tests/test_snapshot_day_threading.py b/tests/test_snapshot_day_threading.py new file mode 100644 index 0000000..56c0fc5 --- /dev/null +++ b/tests/test_snapshot_day_threading.py @@ -0,0 +1,184 @@ +"""Tests for ``snapshot_day`` threading through the generation pipeline. + +Verifies all four precedence layers (package default → recipe → override +dict → explicit kwarg) and the validation paths in +:meth:`GenerationConfig.__post_init__`. + +Companion to :mod:`tests.test_primary_task_threading`, which covers the +sibling fields ``primary_task`` and ``label_window_days`` introduced in +PR #36 / PR #43. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from leadforge.api.generator import Generator +from leadforge.api.recipes import Recipe +from leadforge.core.exceptions import InvalidConfigError, InvalidRecipeError +from leadforge.core.models import GenerationConfig +from leadforge.core.serialization import load_json + +_SMALL = {"n_leads": 30, "n_accounts": 15, "n_contacts": 45} + + +# --------------------------------------------------------------------------- +# Recipe.from_dict parsing +# --------------------------------------------------------------------------- + + +def _base_recipe_dict(**extra) -> dict: + """Minimal valid recipe payload for from_dict tests.""" + payload = { + "id": "test_recipe", + "title": "Test recipe", + "vertical": "test_vertical", + "description": "test", + "primary_task": "converted_within_90_days", + "supported_modes": ["student_public", "research_instructor"], + "supported_difficulty": ["intro", "intermediate", "advanced"], + "default_population": {"n_accounts": 1, "n_contacts": 1, "n_leads": 1}, + "horizon_days": 90, + } + payload.update(extra) + return payload + + +class TestRecipeFromDictSnapshotDay: + def test_absent_key_yields_none(self) -> None: + recipe = Recipe.from_dict(_base_recipe_dict()) + assert recipe.snapshot_day is None + + def test_positive_int_round_trips(self) -> None: + recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=30)) + assert recipe.snapshot_day == 30 + + def test_zero_rejected(self) -> None: + with pytest.raises(InvalidRecipeError, match="snapshot_day"): + Recipe.from_dict(_base_recipe_dict(snapshot_day=0)) + + def test_negative_rejected(self) -> None: + with pytest.raises(InvalidRecipeError, match="snapshot_day"): + Recipe.from_dict(_base_recipe_dict(snapshot_day=-5)) + + def test_string_rejected(self) -> None: + with pytest.raises(InvalidRecipeError, match="snapshot_day"): + Recipe.from_dict(_base_recipe_dict(snapshot_day="30")) + + def test_bool_rejected(self) -> None: + # bool is an int subclass; the validator must reject it explicitly. + with pytest.raises(InvalidRecipeError, match="snapshot_day"): + Recipe.from_dict(_base_recipe_dict(snapshot_day=True)) + + def test_explicit_null_treated_as_none(self) -> None: + recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=None)) + assert recipe.snapshot_day is None + + +# --------------------------------------------------------------------------- +# resolve_config precedence +# --------------------------------------------------------------------------- + + +class TestResolveConfigSnapshotDayPrecedence: + def test_package_default_when_recipe_silent(self) -> None: + recipe = Recipe.from_dict(_base_recipe_dict()) # no snapshot_day + cfg = recipe.resolve_config() + assert cfg.snapshot_day is None # package default + + def test_recipe_default_overrides_package(self) -> None: + recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=20)) + cfg = recipe.resolve_config() + assert cfg.snapshot_day == 20 + + def test_override_dict_beats_recipe(self) -> None: + recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=20)) + cfg = recipe.resolve_config(override={"snapshot_day": 45}) + assert cfg.snapshot_day == 45 + + def test_kwarg_beats_override_and_recipe(self) -> None: + recipe = Recipe.from_dict(_base_recipe_dict(snapshot_day=20)) + cfg = recipe.resolve_config(override={"snapshot_day": 45}, snapshot_day=60) + assert cfg.snapshot_day == 60 + + def test_b2b_recipe_pins_thirty(self) -> None: + """Sanity check that the shipped recipe applies snapshot_day=30.""" + from leadforge.recipes.registry import load_recipe + + recipe = Recipe.from_dict(load_recipe("b2b_saas_procurement_v1")) + cfg = recipe.resolve_config() + assert cfg.snapshot_day == 30 + + +# --------------------------------------------------------------------------- +# GenerationConfig validation +# --------------------------------------------------------------------------- + + +class TestGenerationConfigValidation: + def test_none_is_valid(self) -> None: + cfg = GenerationConfig(snapshot_day=None) + assert cfg.snapshot_day is None + + def test_positive_int_is_valid(self) -> None: + cfg = GenerationConfig(snapshot_day=30) + assert cfg.snapshot_day == 30 + + def test_zero_rejected(self) -> None: + with pytest.raises(InvalidConfigError, match="snapshot_day"): + GenerationConfig(snapshot_day=0) + + def test_negative_rejected(self) -> None: + with pytest.raises(InvalidConfigError, match="snapshot_day"): + GenerationConfig(snapshot_day=-1) + + def test_string_rejected(self) -> None: + with pytest.raises(InvalidConfigError, match="snapshot_day"): + GenerationConfig(snapshot_day="30") # type: ignore[arg-type] + + def test_bool_rejected(self) -> None: + with pytest.raises(InvalidConfigError, match="snapshot_day"): + GenerationConfig(snapshot_day=True) # type: ignore[arg-type] + + def test_exceeds_horizon_rejected(self) -> None: + with pytest.raises(InvalidConfigError, match="horizon_days"): + GenerationConfig(horizon_days=20, label_window_days=20, snapshot_day=21) + + def test_exceeds_label_window_rejected(self) -> None: + """A snapshot anchored after the label closes is nonsensical: the + feature window would extend past events the label was scored on. + Catch this at config time, not at the consumer end.""" + with pytest.raises(InvalidConfigError, match="label_window_days"): + GenerationConfig(horizon_days=120, label_window_days=90, snapshot_day=100) + + def test_equal_to_horizon_is_valid(self) -> None: + # Equal is the boundary case — this is "full-horizon" expressed + # explicitly rather than via None. Allowed. + cfg = GenerationConfig(horizon_days=90, label_window_days=90, snapshot_day=90) + assert cfg.snapshot_day == 90 + + +# --------------------------------------------------------------------------- +# End-to-end: kwarg → manifest field +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def custom_snapshot_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path: + """Bundle generated with snapshot_day overridden via the explicit kwarg.""" + out = tmp_path_factory.mktemp("custom_snapshot") + Generator.from_recipe( + "b2b_saas_procurement_v1", + seed=42, + exposure_mode="student_public", + snapshot_day=20, + ).generate(**_SMALL).save(str(out)) + return out + + +class TestEndToEnd: + def test_kwarg_overrides_recipe_in_manifest(self, custom_snapshot_bundle: Path) -> None: + manifest = load_json(custom_snapshot_bundle / "manifest.json") + assert manifest["snapshot_day"] == 20 # not 30 (the recipe default) From 650c3a7535e20ddda250bee96e5414e9ae7339b1 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 5 May 2026 08:14:45 +0300 Subject: [PATCH 4/4] fix: correct label-window phrasing in release docs (Copilot review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous wording — "the label is then resolved over the next 60 days" — implied the label was scored over [day 30, day 90]. In fact ``converted_within_90_days`` is evaluated over the full 90 days from lead creation; only the *features* are restricted to the first 30 days. Same misleading phrasing existed in the auto-generated dataset_card.md caveat and is corrected too, so consumers reading the in-bundle card get the same accurate description. Resolves Copilot review comment on release/README.md:74. The matching review comment on leadforge/core/models.py:104 was already addressed in f19d58e (snapshot_day <= label_window_days validation block); resolved as already-treated. Co-Authored-By: Claude Opus 4.7 --- leadforge/narrative/dataset_card.py | 8 ++++---- release/README.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/leadforge/narrative/dataset_card.py b/leadforge/narrative/dataset_card.py index 8f049ff..ea265cd 100644 --- a/leadforge/narrative/dataset_card.py +++ b/leadforge/narrative/dataset_card.py @@ -197,10 +197,10 @@ def render_dataset_card( # ------------------------------------------------------------------ if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days: feature_window_caveat = ( - f"- Event-aggregate features (e.g. `touch_count`, `session_count`, " - f"`expected_acv`) are computed over the first {cfg.snapshot_day} days " - f"after lead creation; the label resolves over the next " - f"{cfg.label_window_days - cfg.snapshot_day} days. The deliberate " + f"- The label is evaluated over the full {cfg.label_window_days}-day " + f"window from lead creation; event-aggregate features (e.g. " + f"`touch_count`, `session_count`, `expected_acv`) observe only the " + f"first {cfg.snapshot_day} days of that window. The deliberate " f"exception is `total_touches_all`, which counts touches over the " f"full {cfg.horizon_days}-day horizon as a pedagogical leakage trap." ) diff --git a/release/README.md b/release/README.md index f7a2afe..cb85b61 100644 --- a/release/README.md +++ b/release/README.md @@ -71,7 +71,7 @@ train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parqu test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet") ``` -**Note:** Every event-aggregate feature (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) only counts events that occurred within the first **30 days** after the lead was created — that's the snapshot window. The label is then resolved over the next 60 days. So features cannot accidentally encode post-conversion events. The exception is `total_touches_all`, a deliberately included leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`) that counts touches over the full 90-day horizon. Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`. +**Note:** The label `converted_within_90_days` is evaluated over the full **90 days** from lead creation. Event-aggregate features (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) observe **only the first 30 days** of that window — so even when a lead converts on day 50, the features are frozen at day 30 and cannot encode the conversion event. The deliberate exception is `total_touches_all`, a leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`) that counts touches over the full 90-day horizon. Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`. ### Option 3: Relational tables (feature engineering)