From 6dcddcd4d2410194c892e254518b3de627d7917a Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 00:48:23 +0300 Subject: [PATCH 1/4] feat(validation): release_quality + reporting modules (PR 3.2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the release-grade metric panel and renderer that v1's validate_release_candidate driver (PR 3.3) will consume. leadforge/validation/release_quality.py: - TierMetrics / CrossSeedTierMetrics / CohortShiftMetrics / CrossTierOrdering / ReleaseQualityReport dataclasses, JSON-primitive end-to-end so report_to_dict + json.dumps suffices. - measure_tier_from_bundle(): full G7.* panel — LR + HistGBM AUC, AP, log loss, Brier, calibration bins, P@{50,100}, R@K, lift@{1,5,10}%, top-decile rate, expected-ACV capture, GBM-vs-LR delta, plus source/engagement/stage/post-snapshot/ID-only baseline AUCs (G5.*). - measure_cohort_shift_from_bundle(): random-vs-chronological-cohort split AUC degradation (G6.4) using HistGBM (NaN-tolerant). - regenerate_tier_for_seeds() orchestrates cross-seed rebuilds via Generator.from_recipe; idempotent across re-runs. - measure_release_quality() aggregates per-(tier, seed) into the full ReleaseQualityReport with G8.1 cross-seed spreads and G7.4.* cross-tier ordering booleans. leadforge/validation/reporting.py: - render_report(report, output_dir) writes the pinned output contract: validation_report.json, validation_report.md, and figures/ lift_curve_{intro,intermediate,advanced}.png, calibration_intermediate.png, leakage_delta.png, cohort_shift.png, value_capture.png — under the Agg backend (CI-safe). - Markdown carries a $.tiers.<tier>.medians.<metric> JSON-path citation on every metric cell per G10.6. - NaN floats serialise as JSON null and as _n/a_ in markdown. pyproject.toml: - matplotlib>=3.7 added to [scripts] and [dev] extras + mypy override. 
Tests (28 new): - tests/validation/test_release_quality.py: metric primitives with hand-built inputs (perfect ranker, known miscalibration, zero base rate), JSON serialisation including NaN coercion, cross-tier ordering, and bundle-level measurement against synthetic mini-bundles (degenerate train surfaces a clear ValueError). - tests/validation/test_reporting.py: every contract file is written non-empty, JSON well-formed, markdown cites JSON paths for every tier, partial-release rendering skips missing tiers, byte-identical determinism across two consecutive renders. - tests/integration/test_release_quality_round_trip.py: full Generator.from_recipe(_SMALL).save → measure_release_quality → render_report flow at N=2 seeds (the full N=5 sweep is PR 3.3's driver). Acceptance: - 1095/1095 tests pass (was 1067). - ruff check + format clean; mypy clean. - BUNDLE_SCHEMA_VERSION unchanged (purely additive). - scripts/verify_hash_determinism.py PASS, 67/67 files identical. - scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65 still exits 0. 
Co-Authored-By: Claude Opus 4.7 --- .agent-plan.md | 2 +- leadforge/validation/release_quality.py | 1100 +++++++++++++++++ leadforge/validation/reporting.py | 514 ++++++++ pyproject.toml | 6 + .../test_release_quality_round_trip.py | 198 +++ tests/validation/test_release_quality.py | 511 ++++++++ tests/validation/test_reporting.py | 256 ++++ 7 files changed, 2586 insertions(+), 1 deletion(-) create mode 100644 leadforge/validation/release_quality.py create mode 100644 leadforge/validation/reporting.py create mode 100644 tests/integration/test_release_quality_round_trip.py create mode 100644 tests/validation/test_release_quality.py create mode 100644 tests/validation/test_reporting.py diff --git a/.agent-plan.md b/.agent-plan.md index 55eb99c..7cc8948 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -36,7 +36,7 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family ### Phase 3 — Release validation hardening - [x] PR 3.1: `leadforge/validation/leakage_probes.py` (new) — unified leakage taxonomy. Subsumes the PR 2.1 `relational_leakage` module and broadens it to the full design-doc / acceptance-gates taxonomy: direct (banned columns / banned tables, generalised to accept caller-supplied banned sets), time-window (`probe_snapshot_window`, generalised over `(table, ts_col)` pairs), relational (`probe_deterministic_reconstruction`, `deterministic_relational_reconstruction`), split (`probe_split_id_overlap` for G6.1/G6.2, `probe_split_near_duplicates` via deterministic rounded-vector hashing for G6.3, `probe_split_label_drift` opt-in), model-realism (`probe_bonus_model_auc` opt-in, new opt-in `probe_id_only_baseline` for G5.3, `probe_feature_subset_baseline` for G5.1/G5.2). `PROBE_REGISTRY` is the single source of truth (probe → taxonomy / opt-in flag); meta-test asserts every module-level `probe_*` is registered. 
Two orchestrators: `run_all_probes` / `run_all_probes_on_dataframes` (structural, kept stable for `validate_bundle`) and new `run_split_probes` (split-level over `{split_name: DataFrame}`). `relational_leakage.py` deleted; every internal call site updated (`leadforge/validation/{bundle_checks,invariants}.py`, `leadforge/render/{manifests,relational_snapshot_safe}.py`, `leadforge/exposure/filters.py` doc, `scripts/probe_relational_leakage.py`); test file renamed `test_relational_leakage.py` → `test_leakage_probes.py` and grew 24 new tests for the new probes + meta-coverage. `RelationalLeakageError` retained (now spans every taxonomy) with `LeakageError` alias for the new umbrella name. `BUNDLE_SCHEMA_VERSION` unchanged (purely additive on the validator side); 1067/1067 tests pass; hash-determinism preserved (67/67 files identical); `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier. -- [ ] PR 3.2: `leadforge/validation/{release_quality,reporting}.py` (new) +- [x] PR 3.2: `leadforge/validation/release_quality.py` + `leadforge/validation/reporting.py` (new). `release_quality.py` produces a structured `ReleaseQualityReport` (JSON-primitive `TierMetrics` / `CrossSeedTierMetrics` / `CohortShiftMetrics` / `CrossTierOrdering` dataclasses) covering G7.* (per-tier ROC-AUC, PR-AUC, log loss, Brier, calibration bins, P@K / R@K, lift@{1,5,10}%, top-decile rate, expected-ACV capture, LR-vs-HistGBM delta, source/engagement/stage/post-snapshot/ID-only baseline AUCs), G8.1 (cross-seed median + spread bands), G6.4 (random-vs-chronological cohort-shift split with HistGBM), and G7.4.* (cross-tier ordering booleans + descending rankings). `TierBuildSpec.from_bundle` + idempotent `regenerate_tier_for_seeds(spec, seeds, workdir)` orchestrate cross-seed rebuilds via `Generator.from_recipe`. 
`reporting.py` ships `render_report(report, output_dir)` writing `validation_report.json` (deterministic `dataclasses.asdict` + sorted-keys `json.dumps`, NaN→null), `validation_report.md` (every metric cell carries a `$.tiers.<tier>.medians.<metric>` JSON-path citation per G10.6), and the pinned figure set (`lift_curve_{intro,intermediate,advanced}.png`, `calibration_intermediate.png`, `leakage_delta.png`, `cohort_shift.png`, `value_capture.png`) under the Agg backend. New deps: `matplotlib>=3.7` added to `[scripts]` and `[dev]` extras (mypy override too). `pyproject.toml` mypy override added. 28 new tests across `tests/validation/test_release_quality.py`, `tests/validation/test_reporting.py`, and `tests/integration/test_release_quality_round_trip.py` (synthetic minimal bundles + N=2 round-trip via `Generator.from_recipe(...).generate(_SMALL).save(...)`); 1095/1095 tests pass; ruff + mypy clean; hash-determinism preserved (67/67 files identical); `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` still exits 0 on every public tier; `BUNDLE_SCHEMA_VERSION` unchanged (purely additive layer on top of the validator/reporting stack). - [ ] PR 3.3: `scripts/validate_release_candidate.py` (new); resolve numeric `TBD-*` bands in `v1_acceptance_gates.md`; `release/validation/validation_report.{json,md}` + figures auto-generated ### Phase 4 — Channel-signal audit + dataset card hardening diff --git a/leadforge/validation/release_quality.py b/leadforge/validation/release_quality.py new file mode 100644 index 0000000..f682250 --- /dev/null +++ b/leadforge/validation/release_quality.py @@ -0,0 +1,1100 @@ +"""Release-grade quality metrics for ``leadforge-lead-scoring-v1`` bundles. 
+ +Sits one layer above +:mod:`leadforge.validation.{realism,difficulty,drift,lead_scoring}` and +produces a single :class:`ReleaseQualityReport` covering G7.* (per-tier +performance), G8.* (cross-seed stability), and G6.4 (cohort/time-shift +degradation) of ``docs/release/v1_acceptance_gates.md``. + +PR 3.2 measures and serialises; PR 3.3 calibrates per-tier band literals +in :mod:`leadforge.validation.difficulty` and gates on them. This module +deliberately stays band-free so the same numbers feed both the JSON +report and the (future) gating layer. + +Public surface +-------------- + +* :func:`measure_tier_from_bundle` — full metric panel for one bundle. +* :func:`measure_cohort_shift_from_bundle` — random-vs-cohort split AUC. +* :func:`regenerate_tier_for_seeds` — orchestrate cross-seed rebuilds. +* :func:`measure_release_quality` — top-level orchestrator producing the + full :class:`ReleaseQualityReport`. + +Result dataclasses (:class:`TierMetrics`, :class:`CrossSeedTierMetrics`, +:class:`CohortShiftMetrics`, :class:`CrossTierOrdering`, +:class:`ReleaseQualityReport`) are JSON-primitive end-to-end so +:func:`leadforge.validation.reporting.render_report` can ``asdict`` → +``json.dumps`` without custom encoders. +""" + +from __future__ import annotations + +import dataclasses +import hashlib +import json +import math +from collections.abc import Iterable, Mapping, Sequence +from dataclasses import dataclass +from pathlib import Path +from typing import Any, cast + +import numpy as np +import pandas as pd + +from leadforge.core.serialization import load_json +from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +#: Label column on the snapshot task splits. 
Mirrors +#: :data:`leadforge.validation.realism._LABEL_COLUMN`; duplicated here to +#: keep this module standalone (the realism module uses a different +#: label-resolution path that goes through the bundle manifest). +LABEL_COLUMN = "converted_within_90_days" + +#: Default seed used when the caller doesn't pin one. Held constant so +#: the released report is reproducible across re-runs of the driver. +DEFAULT_SEED: int = 42 + +#: K values for ``precision_at_k`` / ``recall_at_k``. Matches G7.*.6 in +#: ``v1_acceptance_gates.md`` (P@100 is the headline; P@50 carries the +#: tighter top-of-funnel bound). +PRECISION_KS: tuple[int, ...] = (50, 100) + +#: Lift percentages (top-X% of predictions, by score). Matches the +#: design-doc §"Release validation" call for "lift@1/5/10%". +LIFT_PCTS: tuple[float, ...] = (1.0, 5.0, 10.0) + +#: Number of equal-width bins for the calibration / reliability diagram. +N_CALIBRATION_BINS: int = 10 + +#: Fraction of the chronologically-ordered combined train+test used as +#: training data for the cohort-shift comparison; the remainder is the +#: cohort test set. 85/15 mirrors the bundle's own valid+test fraction +#: (15%) so the two splits are roughly comparable in test size. +COHORT_TRAIN_FRAC: float = 0.85 + +#: Random-state seed used inside the cohort-shift HistGBM so the +#: random-vs-cohort comparison shares one source of randomness. +COHORT_SEED: int = 42 + + +# --------------------------------------------------------------------------- +# Result dataclasses +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class CalibrationBin: + """One row of a reliability diagram.""" + + bin_lower: float + bin_upper: float + n: int + mean_predicted: float + mean_actual: float + + +@dataclass(frozen=True) +class TierMetrics: + """Full metric panel for one (tier, seed) pair. 
+ + Mirrors the gates declared in ``v1_acceptance_gates.md`` G7.*; field + names are stable because PR 3.3 wires them straight into the JSON + report and the markdown citation table (G10.6). Add new metrics + only at the bottom of this list, never in the middle. + """ + + tier: str + seed: int + n_train: int + n_test: int + base_rate: float + + conversion_rate_train: float + conversion_rate_test: float + + # Headline pair: LR (interpretable) vs HistGBM (sophistication-rewarding). + lr_auc: float + gbm_auc: float + gbm_minus_lr_auc: float + lr_average_precision: float + gbm_average_precision: float + + # Average-precision under the LR model (G7.*.5 reports the LR number + # because the LR model is the canonical baseline in the dataset card; + # GBM AP is reported for the cross-tier ordering check below). + average_precision: float + + precision_at_k: dict[str, float] + recall_at_k: dict[str, float] + lift_at_pct: dict[str, float] + top_decile_rate: float + + # Value-aware ranking (G7.*.5 / design-doc "expected ACV captured at K"). + expected_acv_capture_at_k: dict[str, float] + + # Calibration (G7.*.7 / G7.*.8). + brier_score: float + log_loss: float + calibration_max_bin_error: float + calibration_bins: list[CalibrationBin] + + # Model-family / feature-subset baselines. Names are well-known + # constants, not free strings, so the reporting layer can render + # them deterministically. AUCs are absolute; deltas (full_lr_auc - + # baseline_auc) are computed once at serialisation time. 
+ baselines: dict[str, float] + + +@dataclass(frozen=True) +class CrossSeedTierMetrics: + """Aggregate of :class:`TierMetrics` across one tier's seed sweep.""" + + tier: str + seeds: list[int] + per_seed: list[TierMetrics] + medians: dict[str, float] + spreads: dict[str, float] + + +@dataclass(frozen=True) +class CohortShiftMetrics: + """Random vs chronological cohort split AUC for one tier (G6.4).""" + + tier: str + seed: int + random_split_auc: float + cohort_split_auc: float + auc_degradation: float + + +@dataclass(frozen=True) +class CrossTierOrdering: + """Cross-tier difficulty ordering (G7.4.*).""" + + by_average_precision: list[str] + by_precision_at_100: list[str] + by_gbm_minus_lr: list[str] + by_conversion_rate: list[str] + average_precision_intro_gt_intermediate: bool + average_precision_intermediate_gt_advanced: bool + precision_at_100_intro_gt_intermediate: bool + precision_at_100_intermediate_gt_advanced: bool + conversion_rate_intro_gt_intermediate: bool + conversion_rate_intermediate_gt_advanced: bool + gbm_minus_lr_positive_in_every_tier: bool + + +@dataclass(frozen=True) +class ReleaseQualityReport: + """Top-level structured result. JSON-primitive end-to-end.""" + + release_id: str + package_version: str + generation_timestamp: str + seeds: list[int] + tiers: dict[str, CrossSeedTierMetrics] + cohort_shift: dict[str, CohortShiftMetrics] + cross_tier_ordering: CrossTierOrdering + + +@dataclass(frozen=True) +class TierBuildSpec: + """Recipe configuration to regenerate a tier across seeds. + + Fields default to manifest values via :meth:`from_bundle`. PR 3.3's + driver builds one of these per tier and hands them to + :func:`measure_release_quality` along with the seed list. 
+ """ + + name: str + recipe_id: str + difficulty: str + n_leads: int + n_accounts: int + n_contacts: int + snapshot_day: int | None + primary_task: str = "converted_within_90_days" + label_window_days: int = 90 + exposure_mode: str = "student_public" + + @classmethod + def from_bundle(cls, bundle_dir: Path, *, name: str | None = None) -> TierBuildSpec: + """Build a spec by reading a bundle's manifest.json.""" + manifest = load_json(bundle_dir / "manifest.json") + return cls( + name=name or str(manifest.get("difficulty", bundle_dir.name)), + recipe_id=str(manifest["recipe_id"]), + difficulty=str(manifest["difficulty"]), + n_leads=int(manifest["n_leads"]), + n_accounts=int(manifest["n_accounts"]), + n_contacts=int(manifest["n_contacts"]), + snapshot_day=int(manifest["snapshot_day"]) if manifest.get("snapshot_day") else None, + primary_task=str(manifest.get("primary_task", "converted_within_90_days")), + label_window_days=int(manifest.get("label_window_days", 90)), + exposure_mode=str(manifest.get("exposure_mode", "student_public")), + ) + + +# --------------------------------------------------------------------------- +# Single-tier measurement +# --------------------------------------------------------------------------- + + +def measure_tier_from_bundle( + bundle_dir: Path, + *, + seed: int = DEFAULT_SEED, + tier_name: str | None = None, +) -> TierMetrics: + """Compute the full :class:`TierMetrics` panel for one bundle. + + Reads the primary task's ``train.parquet`` / ``test.parquet`` (the + ``valid`` split is intentionally unused here — it is reserved for + hyperparameter selection by downstream consumers, and including it + in the test set would conflate model selection with reporting). + + Args: + bundle_dir: Path to a single-seed bundle root. + seed: Random-state seed for the sklearn estimators. Bundle-level + generation seed is read from the manifest separately and is + independent of this argument. + tier_name: Override the tier label. 
Defaults to the bundle's + declared difficulty. + + Raises: + FileNotFoundError: when the manifest or task files are missing. + ValueError: when the train split has only one class (an honest + degeneracy that breaks every downstream metric, surfaced + loudly rather than silently producing NaNs). + """ + sk = _import_sklearn() + manifest = load_json(bundle_dir / "manifest.json") + primary_task = str(manifest.get("primary_task", "converted_within_90_days")) + + train_path = bundle_dir / f"tasks/{primary_task}/train.parquet" + test_path = bundle_dir / f"tasks/{primary_task}/test.parquet" + if not train_path.exists() or not test_path.exists(): + raise FileNotFoundError( + f"missing train.parquet or test.parquet under {bundle_dir}/tasks/{primary_task}/" + ) + + train = pd.read_parquet(train_path) + test = pd.read_parquet(test_path) + + if LABEL_COLUMN not in train.columns or LABEL_COLUMN not in test.columns: + raise ValueError(f"task splits must contain the {LABEL_COLUMN!r} label column") + + y_train = train[LABEL_COLUMN].astype("boolean").fillna(False).astype(int) + y_test = test[LABEL_COLUMN].astype("boolean").fillna(False).astype(int) + if y_train.nunique() < 2: + raise ValueError( + "train split has fewer than two classes; refusing to fit " + "(a single-class regime breaks every downstream metric)" + ) + if y_test.nunique() < 2: + raise ValueError("test split has fewer than two classes; refusing to score") + + cat_cols, num_cols = _partition_columns(train, exclude={LABEL_COLUMN}) + x_train = _sanitize_categoricals(train[cat_cols + num_cols], cat_cols) + x_test = _sanitize_categoricals(test[cat_cols + num_cols], cat_cols) + + lr_pipe = _build_pipeline(num_cols, cat_cols, model="lr", seed=seed, sk=sk) + gbm_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=seed, sk=sk) + + lr_pipe.fit(x_train, y_train.values) + gbm_pipe.fit(x_train, y_train.values) + lr_probs = lr_pipe.predict_proba(x_test)[:, 1] + gbm_probs = gbm_pipe.predict_proba(x_test)[:, 1] + + lr_auc 
= float(sk.roc_auc_score(y_test.values, lr_probs)) + gbm_auc = float(sk.roc_auc_score(y_test.values, gbm_probs)) + lr_ap = float(sk.average_precision_score(y_test.values, lr_probs)) + gbm_ap = float(sk.average_precision_score(y_test.values, gbm_probs)) + + p_at_k: dict[str, float] = {} + r_at_k: dict[str, float] = {} + for k in PRECISION_KS: + p_at_k[str(k)] = _precision_at_k(lr_probs, y_test.values, k) + r_at_k[str(k)] = _recall_at_k(lr_probs, y_test.values, k) + lift_at_pct = {f"{p:g}": _lift_at_pct(lr_probs, y_test.values, p) for p in LIFT_PCTS} + top_decile = _top_decile_rate(lr_probs, y_test.values) + + acv_capture: dict[str, float] = {} + if "expected_acv" in test.columns: + acv = pd.to_numeric(test["expected_acv"], errors="coerce").fillna(0.0).values + for k in PRECISION_KS: + acv_capture[str(k)] = _expected_acv_capture(lr_probs, y_test.values, acv, k) + + brier = float(sk.brier_score_loss(y_test.values, lr_probs)) + eps = 1e-15 + clipped = np.clip(lr_probs, eps, 1.0 - eps) + log_loss = float(sk.log_loss(y_test.values, clipped, labels=[0, 1])) + bins, max_bin_err = _calibration_bins(lr_probs, y_test.values, n_bins=N_CALIBRATION_BINS) + + baselines = _compute_baselines( + train=train, test=test, y_train=y_train.values, y_test=y_test.values, seed=seed, sk=sk + ) + + return TierMetrics( + tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), + seed=seed, + n_train=int(len(train)), + n_test=int(len(test)), + base_rate=float(y_test.mean()), + conversion_rate_train=float(y_train.mean()), + conversion_rate_test=float(y_test.mean()), + lr_auc=lr_auc, + gbm_auc=gbm_auc, + gbm_minus_lr_auc=gbm_auc - lr_auc, + lr_average_precision=lr_ap, + gbm_average_precision=gbm_ap, + average_precision=lr_ap, + precision_at_k=p_at_k, + recall_at_k=r_at_k, + lift_at_pct=lift_at_pct, + top_decile_rate=top_decile, + expected_acv_capture_at_k=acv_capture, + brier_score=brier, + log_loss=log_loss, + calibration_max_bin_error=max_bin_err, + calibration_bins=bins, + 
baselines=baselines, + ) + + +def measure_cohort_shift_from_bundle( + bundle_dir: Path, + *, + seed: int = DEFAULT_SEED, + tier_name: str | None = None, +) -> CohortShiftMetrics: + """Random-vs-chronological-cohort split AUC degradation (G6.4). + + Uses the bundle's existing train/test as the random-split AUC and + re-splits the union chronologically by ``lead_created_at`` for the + cohort-split AUC. HistGBM is used for both — it handles NaN + natively so we don't have to thread a separate imputation pipeline + through the chronological resplit. + """ + sk = _import_sklearn() + manifest = load_json(bundle_dir / "manifest.json") + primary_task = str(manifest.get("primary_task", "converted_within_90_days")) + + train = pd.read_parquet(bundle_dir / f"tasks/{primary_task}/train.parquet") + test = pd.read_parquet(bundle_dir / f"tasks/{primary_task}/test.parquet") + + cat_cols, num_cols = _partition_columns(train, exclude={LABEL_COLUMN}) + x_train = _sanitize_categoricals(train[cat_cols + num_cols], cat_cols) + x_test = _sanitize_categoricals(test[cat_cols + num_cols], cat_cols) + y_train = train[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values + y_test = test[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values + + rand_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=seed, sk=sk) + rand_pipe.fit(x_train, y_train) + rand_probs = rand_pipe.predict_proba(x_test)[:, 1] + random_auc = float(sk.roc_auc_score(y_test, rand_probs)) + + if "lead_created_at" not in train.columns: + # Without a timestamp column, "cohort" has no meaning; emit a + # NaN degradation so PR 3.3 can see this is unsupported on the + # bundle rather than silently report 0. 
+ return CohortShiftMetrics( + tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), + seed=seed, + random_split_auc=random_auc, + cohort_split_auc=float("nan"), + auc_degradation=float("nan"), + ) + + pooled = pd.concat([train, test], ignore_index=True) + ts = pd.to_datetime(pooled["lead_created_at"], errors="coerce") + if ts.isna().any(): + # Same posture as ``probe_snapshot_window`` — a malformed anchor + # would mask the cohort split. Surface it as NaN rather than + # invent a value. + return CohortShiftMetrics( + tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), + seed=seed, + random_split_auc=random_auc, + cohort_split_auc=float("nan"), + auc_degradation=float("nan"), + ) + + order = np.argsort(ts.values, kind="stable") + cutoff = int(round(len(pooled) * COHORT_TRAIN_FRAC)) + early_idx = order[:cutoff] + late_idx = order[cutoff:] + if len(late_idx) == 0: + return CohortShiftMetrics( + tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), + seed=seed, + random_split_auc=random_auc, + cohort_split_auc=float("nan"), + auc_degradation=float("nan"), + ) + + early = pooled.iloc[early_idx] + late = pooled.iloc[late_idx] + y_early = early[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values + y_late = late[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values + if len(set(y_early)) < 2 or len(set(y_late)) < 2: + return CohortShiftMetrics( + tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), + seed=seed, + random_split_auc=random_auc, + cohort_split_auc=float("nan"), + auc_degradation=float("nan"), + ) + + x_early = _sanitize_categoricals(early[cat_cols + num_cols], cat_cols) + x_late = _sanitize_categoricals(late[cat_cols + num_cols], cat_cols) + cohort_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=COHORT_SEED, sk=sk) + cohort_pipe.fit(x_early, y_early) + cohort_probs = cohort_pipe.predict_proba(x_late)[:, 1] + cohort_auc = float(sk.roc_auc_score(y_late, 
cohort_probs)) + + return CohortShiftMetrics( + tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), + seed=seed, + random_split_auc=random_auc, + cohort_split_auc=cohort_auc, + auc_degradation=random_auc - cohort_auc, + ) + + +# --------------------------------------------------------------------------- +# Cross-seed orchestration +# --------------------------------------------------------------------------- + + +def regenerate_tier_for_seeds( + spec: TierBuildSpec, + seeds: Sequence[int], + workdir: Path, +) -> dict[int, Path]: + """Generate one bundle per seed under ``workdir``. + + Idempotent: if ``workdir / "__seed{seed}"`` already contains a + valid manifest, that bundle is reused. Used by + :func:`measure_release_quality` and the round-trip test; PR 3.3's + driver can use it directly to keep cross-seed sweep state on disk + between runs. + """ + from leadforge.api.generator import Generator + + workdir.mkdir(parents=True, exist_ok=True) + out: dict[int, Path] = {} + for seed in seeds: + target = workdir / f"{spec.name}__seed{seed}" + if (target / "manifest.json").exists(): + out[seed] = target + continue + gen = Generator.from_recipe( + spec.recipe_id, + seed=seed, + exposure_mode=spec.exposure_mode, + difficulty=spec.difficulty, + n_accounts=spec.n_accounts, + n_contacts=spec.n_contacts, + n_leads=spec.n_leads, + primary_task=spec.primary_task, + label_window_days=spec.label_window_days, + snapshot_day=spec.snapshot_day, + ) + gen.generate().save(str(target)) + out[seed] = target + return out + + +def measure_release_quality( + tier_bundles: Mapping[str, Mapping[int, Path]], + *, + cohort_canonical_seed: int | None = None, + release_id: str = "leadforge-lead-scoring-v1", + package_version: str | None = None, + generation_timestamp: str | None = None, +) -> ReleaseQualityReport: + """Aggregate per-(tier, seed) measurements into a full report. + + Args: + tier_bundles: Mapping ``tier_name -> {seed: bundle_dir}``. 
+ Tier names are arbitrary strings; the cross-tier ordering + check looks for the canonical ``intro``/``intermediate``/ + ``advanced`` names but tolerates their absence (the + corresponding ordering-bool fields default to ``True`` so a + partial release does not over-report ordering failures). + cohort_canonical_seed: Seed at which to run the cohort-shift + evaluation per tier. When ``None``, the smallest seed + present per tier is used. Cohort shift is reported for one + seed per tier — running it on every seed would multiply the + sweep cost without producing extra signal at this layer. + release_id: Identifier baked into the JSON. + package_version: leadforge package version. Defaults to the + installed version from :mod:`leadforge.version`. + generation_timestamp: Pinned timestamp for the report. Defaults + to current UTC. + """ + from leadforge.version import __version__ + + cross_seed: dict[str, CrossSeedTierMetrics] = {} + cohort: dict[str, CohortShiftMetrics] = {} + for tier_name, by_seed in tier_bundles.items(): + seeds = sorted(by_seed.keys()) + per_seed_metrics = [ + measure_tier_from_bundle(by_seed[s], seed=s, tier_name=tier_name) for s in seeds + ] + medians, spreads = _aggregate_cross_seed(per_seed_metrics) + cross_seed[tier_name] = CrossSeedTierMetrics( + tier=tier_name, + seeds=list(seeds), + per_seed=per_seed_metrics, + medians=medians, + spreads=spreads, + ) + canonical: int = ( + cohort_canonical_seed + if cohort_canonical_seed is not None and cohort_canonical_seed in by_seed + else seeds[0] + ) + cohort[tier_name] = measure_cohort_shift_from_bundle( + by_seed[canonical], seed=canonical, tier_name=tier_name + ) + + ordering = _compute_cross_tier_ordering(cross_seed) + + if generation_timestamp is None: + from datetime import UTC, datetime + + generation_timestamp = datetime.now(UTC).replace(microsecond=0).isoformat() + + return ReleaseQualityReport( + release_id=release_id, + package_version=package_version or __version__, + 
generation_timestamp=generation_timestamp, + seeds=sorted({s for d in tier_bundles.values() for s in d}), + tiers=cross_seed, + cohort_shift=cohort, + cross_tier_ordering=ordering, + ) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +_HEADLINE_FIELDS: tuple[str, ...] = ( + "lr_auc", + "gbm_auc", + "gbm_minus_lr_auc", + "lr_average_precision", + "gbm_average_precision", + "brier_score", + "log_loss", + "calibration_max_bin_error", + "top_decile_rate", + "conversion_rate_test", +) + + +def _aggregate_cross_seed( + per_seed: list[TierMetrics], +) -> tuple[dict[str, float], dict[str, float]]: + """Compute medians and (max - min) spreads for the headline fields. + + Spreads here are the simple max-min range, not a standard deviation — + G8.1 declares the band on ``±TBD`` of the median, which is most + naturally expressed as a half-range. PR 3.3 reads this directly. + """ + medians: dict[str, float] = {} + spreads: dict[str, float] = {} + if not per_seed: + return medians, spreads + for fld in _HEADLINE_FIELDS: + values = [float(getattr(m, fld)) for m in per_seed] + medians[fld] = float(np.median(values)) + spreads[fld] = float(max(values) - min(values)) + return medians, spreads + + +def _compute_cross_tier_ordering( + cross_seed: Mapping[str, CrossSeedTierMetrics], +) -> CrossTierOrdering: + """Derive G7.4.* ordering booleans + descending tier rankings.""" + if not cross_seed: + # Empty release → all-True booleans (vacuously satisfied) plus + # empty rankings. PR 3.3's gating layer is the place to assert + # presence of all three canonical tiers, not this function. 
+ return CrossTierOrdering( + by_average_precision=[], + by_precision_at_100=[], + by_gbm_minus_lr=[], + by_conversion_rate=[], + average_precision_intro_gt_intermediate=True, + average_precision_intermediate_gt_advanced=True, + precision_at_100_intro_gt_intermediate=True, + precision_at_100_intermediate_gt_advanced=True, + conversion_rate_intro_gt_intermediate=True, + conversion_rate_intermediate_gt_advanced=True, + gbm_minus_lr_positive_in_every_tier=True, + ) + + # Build per-tier representative numbers from the median across seeds. + median_ap: dict[str, float] = {} + median_p100: dict[str, float] = {} + median_gbm_lr: dict[str, float] = {} + median_rate: dict[str, float] = {} + for tier, csm in cross_seed.items(): + # Median P@100 is computed from the per-seed dicts directly — + # the headline aggregator only carries scalars. + p100s = [float(m.precision_at_k.get("100", float("nan"))) for m in csm.per_seed] + median_ap[tier] = csm.medians.get("lr_average_precision", float("nan")) + if p100s and not all(math.isnan(p) for p in p100s): + median_p100[tier] = float(np.median(p100s)) + else: + median_p100[tier] = float("nan") + median_gbm_lr[tier] = csm.medians.get("gbm_minus_lr_auc", float("nan")) + median_rate[tier] = csm.medians.get("conversion_rate_test", float("nan")) + + def _sorted_desc(d: Mapping[str, float]) -> list[str]: + # NaN sorts last so it doesn't artificially top the ranking. 
def _recall_at_k(probs: np.ndarray, y: np.ndarray, k: int) -> float:
    """Share of all positives that appear among the *k* highest-scored rows.

    Rows are ranked by descending score with a stable sort, so ties keep
    their input order.  Returns NaN when *k* is outside ``1..len(y)`` or
    when *y* contains no positives.
    """
    undefined = float("nan")
    if not 0 < k <= len(y):
        return undefined
    positives = int(np.sum(y))
    if positives == 0:
        return undefined
    ranking = np.argsort(-np.asarray(probs), kind="stable")
    top_k = ranking[:k]
    hits = np.asarray(y)[top_k].sum()
    return float(hits / positives)
def _expected_acv_capture(probs: np.ndarray, y: np.ndarray, acv: np.ndarray, k: int) -> float:
    """Fraction of total converted-ACV captured in the top-k by score.

    Rows are ranked by descending ``probs`` (stable sort).  A row
    contributes ``acv * y`` when its label is non-zero and exactly ``0``
    otherwise, so a missing (NaN) ACV on a non-converted row cannot poison
    the sums.  A NaN ACV on a *converted* row still propagates to the
    result, which keeps genuinely missing revenue data visible.

    Args:
        probs: Predicted scores, one per row.
        y: Conversion labels aligned with ``probs`` (0/1 or bool).
        acv: Contract value per row, aligned with ``probs``.
        k: Number of top-ranked rows to count.

    Returns:
        ``captured / total`` as a float, or NaN when *k* is outside
        ``1..len(y)`` or the total converted ACV is not positive.
    """
    if k <= 0 or k > len(y):
        return float("nan")
    labels = np.asarray(y)
    values = np.asarray(acv, dtype=float)
    # Non-converted rows contribute nothing by definition; masking them avoids
    # NaN * 0 == NaN when ACV is unknown for leads that never converted.
    converted_value = np.where(labels != 0, values * labels, 0.0)
    total = float(converted_value.sum())
    if total <= 0:
        return float("nan")
    order = np.argsort(-np.asarray(probs), kind="stable")
    captured = float(converted_value[order[:k]].sum())
    return captured / total
def _sanitize_categoricals(df: pd.DataFrame, cat_cols: list[str]) -> pd.DataFrame:
    """Return a copy of *df* whose categorical columns use ``None`` for missing.

    Each name in *cat_cols* that exists in the frame is cast to ``object``
    and every missing entry (as reported by ``notna``) is replaced with
    ``None``.  Names absent from the frame are ignored, other columns are
    left untouched, and the input frame is never mutated.
    """
    cleaned = df.copy()
    for name in (c for c in cat_cols if c in cleaned.columns):
        column = cleaned[name]
        present = column.notna()
        cleaned[name] = column.astype(object).where(present, None)
    return cleaned
+ + The preprocessor mirrors :func:`leadforge.pipelines.ml.build_preprocessor` + (median-impute + standard-scale numeric; most-frequent-impute + + one-hot encode categorical) so the metric panel agrees by + construction with the canonical baseline used elsewhere in the + package. It is rebuilt locally because the bundle column set + differs from the flat-CSV column set. + """ + numeric_t = sk.Pipeline( + [("imputer", sk.SimpleImputer(strategy="median")), ("scaler", sk.StandardScaler())] + ) + categorical_t = sk.Pipeline( + [ + ("imputer", sk.SimpleImputer(strategy="most_frequent")), + ("encoder", sk.OneHotEncoder(handle_unknown="ignore", sparse_output=False)), + ] + ) + pre = sk.ColumnTransformer( + transformers=[("num", numeric_t, num_cols), ("cat", categorical_t, cat_cols)], + remainder="drop", + ) + if model == "lr": + clf: Any = sk.LogisticRegression(max_iter=1000, solver="lbfgs", random_state=seed) + elif model == "gbm": + clf = sk.HistGradientBoostingClassifier(random_state=seed) + else: + raise ValueError(f"unknown model: {model!r}") + return sk.Pipeline([("preprocessor", pre), ("classifier", clf)]) + + +# --------------------------------------------------------------------------- +# Baseline subsets +# --------------------------------------------------------------------------- + + +_SOURCE_COLUMNS: tuple[str, ...] = ("lead_source", "first_touch_channel") +_STAGE_COLUMNS: tuple[str, ...] = ("current_stage", "is_sql") +_ID_COLUMNS: tuple[str, ...] = ("lead_id", "account_id", "contact_id") +_POST_SNAPSHOT_AGGREGATES: tuple[str, ...] 
def _compute_baselines(
    *,
    train: pd.DataFrame,
    test: pd.DataFrame,
    y_train: np.ndarray,
    y_test: np.ndarray,
    seed: int,
    sk: _SklearnHandles,
) -> dict[str, float]:
    """Score each named feature subset on its own and collect the AUCs.

    A subset is scored only when at least one of its columns is present in
    *train*; a subset whose scorer returns ``None`` is left out as well.
    The result therefore contains just the baselines that could actually
    be computed -- a missing key is an outcome, not an error.  The hashed
    ID baseline is stored under ``"id_only"`` and is always evaluated last.
    """
    subsets: dict[str, tuple[str, ...]] = {
        "source_only": _SOURCE_COLUMNS,
        "engagement_only": _engagement_columns(),
        "stage_only": _STAGE_COLUMNS,
        "post_snapshot_aggregates": _POST_SNAPSHOT_AGGREGATES,
    }
    results: dict[str, float] = {}

    for label, wanted in subsets.items():
        usable = [c for c in wanted if c in train.columns]
        if usable:
            score = _subset_auc(train, test, y_train, y_test, usable, seed=seed, sk=sk)
            if score is not None:
                results[label] = score

    id_usable = [c for c in _ID_COLUMNS if c in train.columns]
    if id_usable:
        id_score = _id_only_auc(train, test, y_train, y_test, id_usable, seed=seed, sk=sk)
        if id_score is not None:
            results["id_only"] = id_score

    return results
def _hash_id_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Map every cell of *df* to a deterministic unsigned 32-bit hash.

    Each value is stringified, UTF-8 encoded and digested with a 4-byte
    BLAKE2b; the digest is read as a big-endian unsigned integer.  The
    result keeps the input's column order and index and stores the hashes
    as ``int64``.
    """

    def _stable_hash(cell: object) -> int:
        raw = str(cell).encode("utf-8")
        return int.from_bytes(hashlib.blake2b(raw, digest_size=4).digest(), "big", signed=False)

    hashed = {}
    for name in df.columns:
        hashed[name] = df[name].map(_stable_hash).astype("int64")
    return pd.DataFrame(hashed)
@dataclass(frozen=True)
class _SklearnHandles:
    """Immutable bundle of the scikit-learn callables this module uses.

    ``_import_sklearn`` fills every field with the identically named
    scikit-learn class or metric function, so helpers can receive one
    ``sk`` argument instead of importing scikit-learn themselves.  Fields
    are typed ``Any`` and the dataclass is frozen, so a handle set cannot
    be altered after construction.
    """

    # Estimator / transformer classes.
    Pipeline: Any
    ColumnTransformer: Any
    SimpleImputer: Any
    StandardScaler: Any
    OneHotEncoder: Any
    LogisticRegression: Any
    HistGradientBoostingClassifier: Any
    # Metric functions.
    roc_auc_score: Any
    average_precision_score: Any
    brier_score_loss: Any
    log_loss: Any
def _json_safe(obj: Any) -> Any:
    """Recursively convert *obj* into JSON-primitive Python values.

    A new structure is built; the input is never modified.

    * ``dict`` -> ``dict`` with keys passed through ``str``.
    * ``list`` / ``tuple`` / ``numpy.ndarray`` -> ``list``.
    * Python and NumPy floats -> builtin ``float``, with NaN and ±Inf
      replaced by ``None`` (``json.dumps`` would otherwise emit the
      non-standard tokens ``NaN`` / ``Infinity``).
    * NumPy integers and booleans -> builtin ``int`` / ``bool``.
    * Anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {str(k): _json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_json_safe(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() yields nested lists of Python scalars (or a bare scalar for
        # 0-d arrays); recurse so NaN/Inf inside the array are still nulled.
        return _json_safe(obj.tolist())
    if isinstance(obj, (float, np.floating)):
        # np.float64 subclasses float; normalise it to the builtin type so the
        # result really is JSON-primitive rather than a NumPy scalar.
        as_float = float(obj)
        return as_float if math.isfinite(as_float) else None
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    return obj
+ +PR 3.2 ships the renderer; PR 3.3 wires it into +``scripts/validate_release_candidate.py``. + +Output contract (pinned in ``docs/release/v1_release_design.md`` +§"Output contract"):: + + / + validation_report.json + validation_report.md + figures/ + lift_curve_intro.png + lift_curve_intermediate.png + lift_curve_advanced.png + calibration_intermediate.png + leakage_delta.png + cohort_shift.png + value_capture.png + +Filenames are *exact* — they are referenced from the dataset card and +the markdown report; renaming them is a contract change. + +Matplotlib is the only figure dependency; we force the Agg backend +before importing :mod:`matplotlib.pyplot` so this module is safe in +headless CI. Figures are deterministic byte-for-byte under the same +:class:`ReleaseQualityReport` input — the renderer does no sampling and +pins every text-source font option. +""" + +from __future__ import annotations + +import math +from collections.abc import Mapping +from pathlib import Path +from typing import Any + +import matplotlib + +matplotlib.use("Agg") # headless / deterministic; must precede pyplot import. + +import matplotlib.pyplot as plt # noqa: E402 + +from leadforge.validation.release_quality import ( # noqa: E402 + PRECISION_KS, + CohortShiftMetrics, + CrossSeedTierMetrics, + ReleaseQualityReport, + report_to_json, +) + +# --------------------------------------------------------------------------- +# Output paths +# --------------------------------------------------------------------------- + +REPORT_JSON: str = "validation_report.json" +REPORT_MD: str = "validation_report.md" +FIGURES_DIRNAME: str = "figures" + +#: Pinned figure filenames — must match +#: ``docs/release/v1_release_design.md`` §"Output contract" exactly. 
def render_report(report: ReleaseQualityReport, output_dir: Path) -> dict[str, Path]:
    """Write JSON, markdown and figures under *output_dir*.

    Both text artefacts are written as UTF-8 explicitly.  The markdown
    contains non-ASCII characters ("—", "−"), so relying on the locale
    default encoding can raise ``UnicodeEncodeError`` (e.g. cp1252) and
    makes the output bytes platform-dependent.

    The output directory and its ``figures/`` sub-directory are created if
    missing; existing files are overwritten.  No file is *deleted* -- a
    stale figure from a previous run stays in place unless the caller
    pre-cleans the directory.

    Args:
        report: The release-quality report to render.
        output_dir: Destination directory.

    Returns:
        Mapping of logical artefact name to the path written.  Keys:
        ``"json"``, ``"md"``, ``"lift_curve_<tier>"`` for each canonical
        tier present in the report, ``"calibration"`` when the calibration
        tier is present, ``"leakage_delta"`` and ``"value_capture"`` when
        the report has any tier, and ``"cohort_shift"`` when cohort-shift
        metrics exist.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    figures_dir = output_dir / FIGURES_DIRNAME
    figures_dir.mkdir(parents=True, exist_ok=True)

    written: dict[str, Path] = {}

    json_path = output_dir / REPORT_JSON
    json_path.write_text(report_to_json(report), encoding="utf-8")
    written["json"] = json_path

    md_path = output_dir / REPORT_MD
    md_path.write_text(_render_markdown(report), encoding="utf-8")
    written["md"] = md_path

    # Lift curves exist only for the canonical tier names so the filenames
    # stay within the pinned output contract.
    for tier_name in _LIFT_CURVE_TIERS:
        if tier_name not in report.tiers:
            continue
        path = figures_dir / LIFT_CURVE_FIGURE_TEMPLATE.format(tier=tier_name)
        _write_lift_curve(report.tiers[tier_name], path)
        written[f"lift_curve_{tier_name}"] = path

    if _CALIBRATION_TIER in report.tiers:
        cal_path = figures_dir / CALIBRATION_FIGURE
        _write_calibration_curve(report.tiers[_CALIBRATION_TIER], cal_path)
        written["calibration"] = cal_path

    if report.tiers:
        leak_path = figures_dir / LEAKAGE_DELTA_FIGURE
        _write_leakage_delta(report.tiers, leak_path)
        written["leakage_delta"] = leak_path

        value_path = figures_dir / VALUE_CAPTURE_FIGURE
        _write_value_capture(report.tiers, value_path)
        written["value_capture"] = value_path

    if report.cohort_shift:
        cohort_path = figures_dir / COHORT_SHIFT_FIGURE
        _write_cohort_shift(report.cohort_shift, cohort_path)
        written["cohort_shift"] = cohort_path

    return written
+ """ + out: list[str] = [] + out.append(f"# {report.release_id} — release quality report") + out.append("") + out.append(f"**Package version:** `{report.package_version}` ") + out.append(f"**Generated:** `{report.generation_timestamp}` ") + out.append(f"**Seeds:** {report.seeds} ") + out.append( + "Every value below cites the JSON field that backs it; see " + f"`{REPORT_JSON}` for the machine-readable form." + ) + out.append("") + + out.append("## Per-tier headline metrics") + out.append("") + out.append( + "| Tier | Conv. rate (test) | LR AUC | GBM AUC | GBM−LR | LR AP | Brier | " + "Cal. max-bin err | Top-decile rate |" + ) + out.append("|---|---|---|---|---|---|---|---|---|") + for tier_name, csm in sorted(report.tiers.items()): + m = csm.medians + path = f"$.tiers.{tier_name}.medians" + out.append( + "| {tier} | {rate} | {lr} | {gbm} | {delta} | {ap} | {br} | {cal} | {td} |".format( + tier=tier_name, + rate=_fmt(m.get("conversion_rate_test"), f"{path}.conversion_rate_test"), + lr=_fmt(m.get("lr_auc"), f"{path}.lr_auc"), + gbm=_fmt(m.get("gbm_auc"), f"{path}.gbm_auc"), + delta=_fmt(m.get("gbm_minus_lr_auc"), f"{path}.gbm_minus_lr_auc"), + ap=_fmt(m.get("lr_average_precision"), f"{path}.lr_average_precision"), + br=_fmt(m.get("brier_score"), f"{path}.brier_score"), + cal=_fmt( + m.get("calibration_max_bin_error"), + f"{path}.calibration_max_bin_error", + ), + td=_fmt(m.get("top_decile_rate"), f"{path}.top_decile_rate"), + ) + ) + out.append("") + + out.append("## Cross-seed stability (G8.1)") + out.append("") + out.append("| Tier | Seeds | LR AUC spread | GBM AUC spread | AP spread | Brier spread |") + out.append("|---|---|---|---|---|---|") + for tier_name, csm in sorted(report.tiers.items()): + sp = csm.spreads + path = f"$.tiers.{tier_name}.spreads" + out.append( + "| {tier} | {seeds} | {lr} | {gbm} | {ap} | {br} |".format( + tier=tier_name, + seeds=csm.seeds, + lr=_fmt(sp.get("lr_auc"), f"{path}.lr_auc"), + gbm=_fmt(sp.get("gbm_auc"), f"{path}.gbm_auc"), + 
ap=_fmt(sp.get("lr_average_precision"), f"{path}.lr_average_precision"), + br=_fmt(sp.get("brier_score"), f"{path}.brier_score"), + ) + ) + out.append("") + + out.append("## Cross-tier ordering (G7.4)") + out.append("") + ord_path = "$.cross_tier_ordering" + o = report.cross_tier_ordering + out.append( + f"- AP ranking (descending): {o.by_average_precision} (`{ord_path}.by_average_precision`)" + ) + out.append( + f"- P@100 ranking (descending): {o.by_precision_at_100} (`{ord_path}.by_precision_at_100`)" + ) + out.append(f"- GBM−LR ranking (descending): {o.by_gbm_minus_lr} (`{ord_path}.by_gbm_minus_lr`)") + out.append( + f"- Conversion-rate ranking (descending): {o.by_conversion_rate} " + f"(`{ord_path}.by_conversion_rate`)" + ) + out.append( + f"- AP intro > intermediate: **{o.average_precision_intro_gt_intermediate}** " + f"(`{ord_path}.average_precision_intro_gt_intermediate`)" + ) + out.append( + f"- AP intermediate > advanced: **{o.average_precision_intermediate_gt_advanced}** " + f"(`{ord_path}.average_precision_intermediate_gt_advanced`)" + ) + out.append( + f"- GBM−LR positive in every tier: **{o.gbm_minus_lr_positive_in_every_tier}** " + f"(`{ord_path}.gbm_minus_lr_positive_in_every_tier`)" + ) + out.append("") + + out.append("## Cohort-shift evaluation (G6.4)") + out.append("") + out.append("| Tier | Random-split AUC | Cohort-split AUC | Degradation (random − cohort) |") + out.append("|---|---|---|---|") + for tier_name, cs in sorted(report.cohort_shift.items()): + cs_path = f"$.cohort_shift.{tier_name}" + out.append( + "| {tier} | {r} | {c} | {d} |".format( + tier=tier_name, + r=_fmt(cs.random_split_auc, f"{cs_path}.random_split_auc"), + c=_fmt(cs.cohort_split_auc, f"{cs_path}.cohort_split_auc"), + d=_fmt(cs.auc_degradation, f"{cs_path}.auc_degradation"), + ) + ) + out.append("") + + out.append("## Baseline AUCs (G5.* / leakage probes)") + out.append("") + out.append("Each cell is HistGBM AUC trained on the named feature subset only.") + out.append("") + 
baseline_names = sorted( + {name for csm in report.tiers.values() for tm in csm.per_seed for name in tm.baselines} + ) + if baseline_names: + header = "| Tier | seed | " + " | ".join(baseline_names) + " |" + sep = "|---|---|" + "|".join(["---"] * len(baseline_names)) + "|" + out.append(header) + out.append(sep) + for tier_name, csm in sorted(report.tiers.items()): + for tm in csm.per_seed: + cells = [tier_name, str(tm.seed)] + for bn in baseline_names: + cell_path = f"$.tiers.{tier_name}.per_seed[seed={tm.seed}].baselines.{bn}" + cells.append(_fmt(tm.baselines.get(bn), cell_path)) + out.append("| " + " | ".join(cells) + " |") + out.append("") + else: + out.append("_No baseline AUCs were computed._") + out.append("") + + out.append("## Figures") + out.append("") + figures_paths = [ + f"`{FIGURES_DIRNAME}/{LIFT_CURVE_FIGURE_TEMPLATE.format(tier=t)}`" + for t in _LIFT_CURVE_TIERS + if t in report.tiers + ] + if figures_paths: + out.append("- Lift curves: " + ", ".join(figures_paths)) + if _CALIBRATION_TIER in report.tiers: + out.append(f"- Calibration ({_CALIBRATION_TIER}): `{FIGURES_DIRNAME}/{CALIBRATION_FIGURE}`") + if report.tiers: + out.append(f"- Leakage / baseline deltas: `{FIGURES_DIRNAME}/{LEAKAGE_DELTA_FIGURE}`") + out.append(f"- Value capture: `{FIGURES_DIRNAME}/{VALUE_CAPTURE_FIGURE}`") + if report.cohort_shift: + out.append(f"- Cohort shift: `{FIGURES_DIRNAME}/{COHORT_SHIFT_FIGURE}`") + out.append("") + + out.append("---") + out.append("") + out.append(f"_Renderer: `leadforge.validation.reporting`. 
def _fmt(value: Any, json_path: str) -> str:
    """Render one metric cell followed by its back-ticked JSON-path citation.

    ``None`` and non-finite floats render as ``_n/a_``; finite floats are
    fixed to four decimals; every other value is rendered as-is.
    """
    citation = f"(`{json_path}`)"
    is_float = isinstance(value, float)
    if value is None or (is_float and not math.isfinite(value)):
        return f"_n/a_ {citation}"
    rendered = f"{value:.4f}" if is_float else f"{value}"
    return f"{rendered} {citation}"
def _write_calibration_curve(csm: CrossSeedTierMetrics, path: Path) -> None:
    """Write a reliability diagram for one tier to *path* as a PNG.

    Plots mean predicted probability (x) against mean observed outcome (y)
    for each stored calibration bin of a single representative seed, plus
    the ``y = x`` reference line.  When the tier has no per-seed metrics an
    empty figure is still written, so the file always exists.
    """
    fig, ax = _figure(figsize=(5.0, 5.0))
    if not csm.per_seed:
        # Nothing to plot; still emit the (blank) file.
        _save(fig, path)
        return
    # Representative seed = middle entry of ``per_seed``.
    # NOTE(review): this is the "median seed" only if ``per_seed`` is ordered
    # by the metric of interest; the ordering is not visible here -- confirm.
    metrics = csm.per_seed[len(csm.per_seed) // 2]
    bins = list(metrics.calibration_bins)
    if bins:
        xs = [b.mean_predicted for b in bins]
        ys = [b.mean_actual for b in bins]
        ax.plot(xs, ys, marker="o", label=csm.tier)
    # Diagonal: where a perfectly calibrated model would sit.
    ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="perfectly calibrated")
    ax.set_xlabel("Mean predicted P(convert)")
    ax.set_ylabel("Empirical conversion rate")
    ax.set_title(f"Reliability diagram — {csm.tier}")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(loc="upper left")
    ax.grid(True, linestyle=":")
    _save(fig, path)
def _write_cohort_shift(cohort: Mapping[str, CohortShiftMetrics], path: Path) -> None:
    """Write side-by-side bars of random-split vs cohort-split AUC per tier.

    Tiers are drawn in alphabetical order.  A NaN ``cohort_split_auc`` is
    plotted as ``0.0``, which falls below the visible y-range of 0.4-1.0,
    so the bar is effectively absent.
    """
    # Local import: numpy is needed only by the figure helpers.
    import numpy as np

    fig, ax = _figure(figsize=(7.0, 4.5))
    tier_names = sorted(cohort.keys())
    xs = np.arange(len(tier_names))
    # NOTE(review): ``random_split_auc`` is passed through unguarded while the
    # cohort series below is NaN-sanitised -- confirm the asymmetry is intended.
    rand = [cohort[t].random_split_auc for t in tier_names]
    coh = [
        cohort[t].cohort_split_auc if not math.isnan(cohort[t].cohort_split_auc) else 0.0
        for t in tier_names
    ]
    width = 0.35
    # Offset the two series by half a bar width around each tier's tick.
    ax.bar(xs - width / 2, rand, width=width, label="random split AUC")
    ax.bar(xs + width / 2, coh, width=width, label="cohort-shift AUC")
    ax.set_xticks(xs)
    ax.set_xticklabels(tier_names)
    ax.set_ylabel("AUC")
    ax.set_ylim(0.4, 1.0)
    # Chance-level reference line.
    ax.axhline(0.5, color="grey", linestyle="--")
    ax.set_title("Cohort-shift evaluation")
    ax.legend(loc="best")
    ax.grid(True, linestyle=":", axis="y")
    _save(fig, path)
+# --------------------------------------------------------------------------- + +__all__ = [ + "CALIBRATION_FIGURE", + "COHORT_SHIFT_FIGURE", + "FIGURES_DIRNAME", + "LEAKAGE_DELTA_FIGURE", + "LIFT_CURVE_FIGURE_TEMPLATE", + "REPORT_JSON", + "REPORT_MD", + "VALUE_CAPTURE_FIGURE", + "render_report", +] diff --git a/pyproject.toml b/pyproject.toml index 5ec1d50..29bacba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,9 +39,11 @@ dev = [ "pre-commit>=3.7", "types-pyyaml>=6.0", "scikit-learn>=1.3", + "matplotlib>=3.7", ] scripts = [ "scikit-learn>=1.3", + "matplotlib>=3.7", ] [project.scripts] @@ -85,5 +87,9 @@ ignore_missing_imports = true module = ["sklearn", "sklearn.*"] ignore_missing_imports = true +[[tool.mypy.overrides]] +module = ["matplotlib", "matplotlib.*"] +ignore_missing_imports = true + [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/tests/integration/test_release_quality_round_trip.py b/tests/integration/test_release_quality_round_trip.py new file mode 100644 index 0000000..636207c --- /dev/null +++ b/tests/integration/test_release_quality_round_trip.py @@ -0,0 +1,198 @@ +"""End-to-end: ``Generator`` → ``release_quality.measure`` → ``reporting.render``. + +Slow integration test (one full simulation, twice — N=2 seeds) that +verifies the PR 3.2 modules compose against the real bundle writer. + +The unit suites in ``tests/validation/test_release_quality.py`` and +``tests/validation/test_reporting.py`` cover individual primitives; this +test asserts they actually plug together. + +Tiny population sizes per the PR plan (the per-tier sweep with full +counts is PR 3.3's driver). 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from leadforge.api.generator import Generator +from leadforge.validation.release_quality import ( + TierBuildSpec, + measure_release_quality, + regenerate_tier_for_seeds, +) +from leadforge.validation.reporting import ( + CALIBRATION_FIGURE, + COHORT_SHIFT_FIGURE, + FIGURES_DIRNAME, + LEAKAGE_DELTA_FIGURE, + LIFT_CURVE_FIGURE_TEMPLATE, + REPORT_JSON, + REPORT_MD, + VALUE_CAPTURE_FIGURE, + render_report, +) + +_SMALL = {"n_leads": 50, "n_accounts": 25, "n_contacts": 75} + + +@pytest.fixture(scope="module") +def small_intermediate_bundle(tmp_path_factory: pytest.TempPathFactory) -> Path: + """One full Generator run at the smallest size that still produces + a non-degenerate label distribution. Reused across the round-trip + tests to amortise the simulation cost.""" + pytest.importorskip("sklearn") + out = tmp_path_factory.mktemp("rq_intermediate") / "bundle" + gen = Generator.from_recipe( + "b2b_saas_procurement_v1", + seed=42, + exposure_mode="student_public", + difficulty="intermediate", + ) + gen.generate(**_SMALL).save(str(out)) + return out + + +def test_round_trip_against_real_bundle(small_intermediate_bundle: Path, tmp_path: Path) -> None: + pytest.importorskip("sklearn") + report = measure_release_quality( + {"intermediate": {42: small_intermediate_bundle}}, + generation_timestamp="2026-05-06T12:00:00+00:00", + ) + out = tmp_path / "release/validation" + written = render_report(report, out) + + # Every promised file is produced and non-empty for a single-tier + # report. Lift curve renders only for ``intermediate`` here (the + # other two tier names aren't in the report). 
+ assert (out / REPORT_JSON).exists() + assert (out / REPORT_JSON).stat().st_size > 0 + assert (out / REPORT_MD).exists() + assert (out / REPORT_MD).stat().st_size > 0 + for fig in ( + LIFT_CURVE_FIGURE_TEMPLATE.format(tier="intermediate"), + CALIBRATION_FIGURE, + LEAKAGE_DELTA_FIGURE, + COHORT_SHIFT_FIGURE, + VALUE_CAPTURE_FIGURE, + ): + f = out / FIGURES_DIRNAME / fig + assert f.exists(), f"missing figure: {fig}" + assert f.stat().st_size > 0, f"empty figure: {fig}" + # JSON shape is the agreed contract. + d = json.loads((out / REPORT_JSON).read_text()) + assert d["release_id"] == "leadforge-lead-scoring-v1" + tier = d["tiers"]["intermediate"] + assert tier["seeds"] == [42] + per_seed = tier["per_seed"][0] + for fld in ( + "lr_auc", + "gbm_auc", + "gbm_minus_lr_auc", + "lr_average_precision", + "brier_score", + "log_loss", + "calibration_max_bin_error", + "calibration_bins", + "baselines", + "precision_at_k", + "recall_at_k", + "lift_at_pct", + "expected_acv_capture_at_k", + ): + assert fld in per_seed, f"missing field in per_seed payload: {fld}" + # Cohort shift entry carries the canonical seed. + assert d["cohort_shift"]["intermediate"]["seed"] == 42 + # Markdown exists with at least the GBM-vs-LR ordering bool citation. + md = (out / REPORT_MD).read_text() + assert "$.cross_tier_ordering.gbm_minus_lr_positive_in_every_tier" in md + assert "intermediate" in md + assert sorted(written.keys()) == sorted( + { + "json", + "md", + "lift_curve_intermediate", + "calibration", + "leakage_delta", + "cohort_shift", + "value_capture", + } + ) + + +def test_regenerate_tier_for_seeds_n2(small_intermediate_bundle: Path, tmp_path: Path) -> None: + """``regenerate_tier_for_seeds`` produces one bundle dir per seed + and is idempotent — a re-run reuses the existing manifest. + + N=2 here per the PR plan; the full N=5 release-time sweep is PR + 3.3's driver. 
+ """ + pytest.importorskip("sklearn") + spec = TierBuildSpec.from_bundle(small_intermediate_bundle, name="intermediate_small") + # Override population sizes so the rebuild stays fast; tier from the + # canonical bundle has the same difficulty/recipe/exposure_mode. + spec = TierBuildSpec( + name=spec.name, + recipe_id=spec.recipe_id, + difficulty=spec.difficulty, + n_leads=_SMALL["n_leads"], + n_accounts=_SMALL["n_accounts"], + n_contacts=_SMALL["n_contacts"], + snapshot_day=spec.snapshot_day, + primary_task=spec.primary_task, + label_window_days=spec.label_window_days, + exposure_mode=spec.exposure_mode, + ) + workdir = tmp_path / "regen" + out = regenerate_tier_for_seeds(spec, [42, 43], workdir) + assert sorted(out.keys()) == [42, 43] + for seed, p in out.items(): + assert (p / "manifest.json").exists() + manifest = json.loads((p / "manifest.json").read_text()) + assert manifest["seed"] == seed + # Idempotent re-run returns the same paths. + out2 = regenerate_tier_for_seeds(spec, [42, 43], workdir) + assert out == out2 + + +def test_full_release_quality_n2(small_intermediate_bundle: Path, tmp_path: Path) -> None: + """N=2 cross-seed sweep through the full orchestrator + renderer. + + This exercises everything PR 3.3's driver will call: regenerate → + measure_release_quality → render_report. At N=2 we only assert + structural shape; PR 3.3 calibrates the bands themselves. 
+ """ + pytest.importorskip("sklearn") + spec = TierBuildSpec( + name="intermediate", + recipe_id="b2b_saas_procurement_v1", + difficulty="intermediate", + n_leads=_SMALL["n_leads"], + n_accounts=_SMALL["n_accounts"], + n_contacts=_SMALL["n_contacts"], + snapshot_day=30, + primary_task="converted_within_90_days", + label_window_days=90, + exposure_mode="student_public", + ) + workdir = tmp_path / "regen" + bundles = regenerate_tier_for_seeds(spec, [42, 43], workdir) + report = measure_release_quality( + {"intermediate": bundles}, generation_timestamp="2026-05-06T12:00:00+00:00" + ) + csm = report.tiers["intermediate"] + assert csm.seeds == [42, 43] + assert len(csm.per_seed) == 2 + assert "lr_auc" in csm.medians + assert "lr_auc" in csm.spreads + # Each seed's TierMetrics is a separate measurement, so the spread + # is non-negative and finite. + assert csm.spreads["lr_auc"] >= 0.0 + out = tmp_path / "release/validation" + written = render_report(report, out) + assert (out / REPORT_JSON).exists() + assert (out / FIGURES_DIRNAME / LIFT_CURVE_FIGURE_TEMPLATE.format(tier="intermediate")).exists() + assert "lift_curve_intermediate" in written diff --git a/tests/validation/test_release_quality.py b/tests/validation/test_release_quality.py new file mode 100644 index 0000000..9d805dc --- /dev/null +++ b/tests/validation/test_release_quality.py @@ -0,0 +1,511 @@ +"""Tests for :mod:`leadforge.validation.release_quality`. + +The fast tests in this file exercise individual metric primitives and +the dataclass plumbing using hand-built train/test pairs that have +known answers. The slow round-trip through ``Generator`` lives in +``tests/integration/test_release_quality_round_trip.py`` so the unit +suite stays under a second. 
+""" + +from __future__ import annotations + +import json +import math +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd +import pytest + +from leadforge.validation.release_quality import ( + LABEL_COLUMN, + PRECISION_KS, + CalibrationBin, + CohortShiftMetrics, + CrossSeedTierMetrics, + CrossTierOrdering, + ReleaseQualityReport, + TierMetrics, + _calibration_bins, + _expected_acv_capture, + _lift_at_pct, + _precision_at_k, + _recall_at_k, + _top_decile_rate, + measure_cohort_shift_from_bundle, + measure_release_quality, + measure_tier_from_bundle, + report_to_dict, + report_to_json, +) + +# --------------------------------------------------------------------------- +# Metric primitives — hand-built inputs with known answers. +# --------------------------------------------------------------------------- + + +class TestMetricPrimitives: + def test_precision_at_k_perfect_ranker(self) -> None: + # Top-K predictions should perfectly match the K positives. + probs = np.array([0.9, 0.85, 0.8, 0.1, 0.05]) + y = np.array([1, 1, 1, 0, 0]) + assert _precision_at_k(probs, y, 3) == pytest.approx(1.0) + assert _recall_at_k(probs, y, 3) == pytest.approx(1.0) + + def test_precision_at_k_random_score_returns_base_rate_in_expectation(self) -> None: + # A constant score → ties break by stable order; precision@k + # equals first-k label fraction, not necessarily base rate. + # This is a structural sanity test — the function must not + # raise and must return a value in [0, 1]. 
+ probs = np.full(10, 0.5) + y = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) + p = _precision_at_k(probs, y, 5) + assert 0.0 <= p <= 1.0 + + def test_recall_at_k_no_positives_returns_nan(self) -> None: + probs = np.array([0.9, 0.5, 0.1]) + y = np.array([0, 0, 0]) + assert math.isnan(_recall_at_k(probs, y, 2)) + + def test_lift_at_pct_perfect_ranker_above_one(self) -> None: + probs = np.array([0.9, 0.85, 0.8, 0.1, 0.05, 0.04, 0.03, 0.02, 0.01, 0.0]) + y = np.array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0]) + # Top-30% (3 leads) all positive; base rate = 0.3 → lift = 1.0/0.3 ≈ 3.33. + lift = _lift_at_pct(probs, y, 30.0) + assert lift == pytest.approx(1.0 / 0.3, rel=1e-3) + + def test_lift_at_pct_zero_base_rate_returns_nan(self) -> None: + probs = np.array([0.5, 0.5, 0.5]) + y = np.array([0, 0, 0]) + assert math.isnan(_lift_at_pct(probs, y, 50.0)) + + def test_top_decile_rate_perfect_ranker(self) -> None: + probs = np.linspace(1.0, 0.0, 10) # descending + y = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + # Top-10% = top-1 = the single positive. + assert _top_decile_rate(probs, y) == pytest.approx(1.0) + + def test_expected_acv_capture_perfect_ranker_captures_all(self) -> None: + probs = np.array([0.9, 0.8, 0.5, 0.1, 0.05]) + y = np.array([1, 1, 0, 0, 0]) + acv = np.array([100, 200, 50, 50, 50], dtype=float) + # Top-2 ranks both positives, capturing 100 + 200 of the + # converted-ACV total of 300. + assert _expected_acv_capture(probs, y, acv, 2) == pytest.approx(1.0) + + def test_expected_acv_capture_no_converted_returns_nan(self) -> None: + probs = np.array([0.9, 0.5, 0.1]) + y = np.array([0, 0, 0]) + acv = np.array([100, 100, 100], dtype=float) + assert math.isnan(_expected_acv_capture(probs, y, acv, 2)) + + def test_calibration_bins_perfectly_calibrated_zero_error(self) -> None: + # 1000 points where p == y_mean per bin (perfectly calibrated). 
+ rng = np.random.default_rng(42) + probs = rng.uniform(0, 1, size=2000) + y = (rng.uniform(0, 1, size=2000) < probs).astype(int) + bins, max_err = _calibration_bins(probs, y, n_bins=10) + assert len(bins) >= 5 # most bins populated + # With 2000 samples the bin-mean-actual concentrates around the + # bin midpoint; the max error should be well under 0.10. + assert max_err < 0.10 + + def test_calibration_bins_known_miscalibration(self) -> None: + # Predict 0.9 always; truth is 0.1 base rate → max error 0.8. + probs = np.full(500, 0.9) + y = np.zeros(500, dtype=int) + y[:50] = 1 # 10% positives + bins, max_err = _calibration_bins(probs, y, n_bins=10) + assert max_err == pytest.approx(0.8, abs=1e-6) + # Only the bin containing 0.9 should be populated. + populated = [b for b in bins if b.n > 0] + assert len(populated) == 1 + + +# --------------------------------------------------------------------------- +# Dataclass plumbing + JSON serialisation. +# --------------------------------------------------------------------------- + + +def _fixture_tier_metrics(tier: str, seed: int, **overrides: Any) -> TierMetrics: + base: dict[str, Any] = { + "tier": tier, + "seed": seed, + "n_train": 100, + "n_test": 30, + "base_rate": 0.3, + "conversion_rate_train": 0.31, + "conversion_rate_test": 0.30, + "lr_auc": 0.85, + "gbm_auc": 0.88, + "gbm_minus_lr_auc": 0.03, + "lr_average_precision": 0.62, + "gbm_average_precision": 0.65, + "average_precision": 0.62, + "precision_at_k": {"50": 0.66, "100": 0.55}, + "recall_at_k": {"50": 0.45, "100": 0.78}, + "lift_at_pct": {"1": 3.0, "5": 2.5, "10": 2.0}, + "top_decile_rate": 0.6, + "expected_acv_capture_at_k": {"50": 0.55, "100": 0.80}, + "brier_score": 0.12, + "log_loss": 0.34, + "calibration_max_bin_error": 0.18, + "calibration_bins": [ + CalibrationBin(0.0, 0.1, 5, 0.05, 0.02), + CalibrationBin(0.4, 0.5, 4, 0.45, 0.5), + ], + "baselines": { + "source_only": 0.52, + "engagement_only": 0.66, + "post_snapshot_aggregates": 0.55, + "id_only": 
0.50, + }, + } + base.update(overrides) + return TierMetrics(**base) + + +_DEFAULT_TIERS: tuple[str, ...] = ("intro", "intermediate", "advanced") + + +def _fixture_report(tiers: tuple[str, ...] = _DEFAULT_TIERS) -> ReleaseQualityReport: + cross_seed: dict[str, CrossSeedTierMetrics] = {} + cohort: dict[str, CohortShiftMetrics] = {} + for tier in tiers: + # Two seeds per tier — minimum for cross-seed spread. + m1 = _fixture_tier_metrics( + tier=tier, + seed=42, + lr_auc={"intro": 0.92, "intermediate": 0.88, "advanced": 0.85}.get(tier, 0.85), + gbm_auc={"intro": 0.94, "intermediate": 0.91, "advanced": 0.88}.get(tier, 0.88), + gbm_minus_lr_auc=0.03, + lr_average_precision={"intro": 0.85, "intermediate": 0.65, "advanced": 0.40}.get( + tier, 0.5 + ), + conversion_rate_test={"intro": 0.42, "intermediate": 0.20, "advanced": 0.08}.get( + tier, 0.2 + ), + ) + m2 = _fixture_tier_metrics(tier=tier, seed=43) + cross_seed[tier] = CrossSeedTierMetrics( + tier=tier, + seeds=[42, 43], + per_seed=[m1, m2], + medians={ + "lr_auc": (m1.lr_auc + m2.lr_auc) / 2, + "gbm_auc": (m1.gbm_auc + m2.gbm_auc) / 2, + "gbm_minus_lr_auc": 0.03, + "lr_average_precision": m1.lr_average_precision, + "gbm_average_precision": m1.gbm_average_precision, + "brier_score": 0.12, + "log_loss": 0.34, + "calibration_max_bin_error": 0.18, + "top_decile_rate": 0.6, + "conversion_rate_test": m1.conversion_rate_test, + }, + spreads=dict.fromkeys( + ("lr_auc", "gbm_auc", "lr_average_precision", "brier_score"), 0.01 + ), + ) + cohort[tier] = CohortShiftMetrics( + tier=tier, + seed=42, + random_split_auc=0.85, + cohort_split_auc=0.78, + auc_degradation=0.07, + ) + ordering = CrossTierOrdering( + by_average_precision=["intro", "intermediate", "advanced"], + by_precision_at_100=["intro", "intermediate", "advanced"], + by_gbm_minus_lr=["intro", "intermediate", "advanced"], + by_conversion_rate=["intro", "intermediate", "advanced"], + average_precision_intro_gt_intermediate=True, + 
average_precision_intermediate_gt_advanced=True, + precision_at_100_intro_gt_intermediate=True, + precision_at_100_intermediate_gt_advanced=True, + conversion_rate_intro_gt_intermediate=True, + conversion_rate_intermediate_gt_advanced=True, + gbm_minus_lr_positive_in_every_tier=True, + ) + return ReleaseQualityReport( + release_id="leadforge-lead-scoring-v1", + package_version="1.0.0", + generation_timestamp="2026-05-06T12:00:00+00:00", + seeds=[42, 43], + tiers=cross_seed, + cohort_shift=cohort, + cross_tier_ordering=ordering, + ) + + +class TestJsonSerialisation: + def test_report_to_json_round_trips(self) -> None: + report = _fixture_report() + s = report_to_json(report) + d = json.loads(s) + assert d["release_id"] == "leadforge-lead-scoring-v1" + assert sorted(d["tiers"].keys()) == ["advanced", "intermediate", "intro"] + assert d["tiers"]["intro"]["per_seed"][0]["lr_auc"] == pytest.approx(0.92) + + def test_report_to_dict_coerces_nan_to_none(self) -> None: + # Build a TierMetrics with NaN in a nested dict and verify it + # comes out as None — json.dumps would otherwise fail with + # ``ValueError: Out of range float values are not JSON compliant`` + # under some encoder configurations. + m = _fixture_tier_metrics(tier="intro", seed=42, calibration_max_bin_error=float("nan")) + # Patch a NaN inside a nested float dict. + m = TierMetrics(**{**m.__dict__, "lift_at_pct": {"1": float("nan"), "5": 2.0, "10": 1.5}}) + report = ReleaseQualityReport( + release_id="x", + package_version="1.0.0", + generation_timestamp="2026-05-06T12:00:00+00:00", + seeds=[42], + tiers={ + "intro": CrossSeedTierMetrics( + tier="intro", seeds=[42], per_seed=[m], medians={}, spreads={} + ) + }, + cohort_shift={}, + cross_tier_ordering=_fixture_report(("intro",)).cross_tier_ordering, + ) + d = report_to_dict(report) + # NaN survived as None. 
+ assert d["tiers"]["intro"]["per_seed"][0]["calibration_max_bin_error"] is None + assert d["tiers"]["intro"]["per_seed"][0]["lift_at_pct"]["1"] is None + # And the result is JSON-serialisable end-to-end. + json.dumps(d) + + +# --------------------------------------------------------------------------- +# Cross-tier ordering computation +# --------------------------------------------------------------------------- + + +class TestCrossTierOrdering: + def test_ordering_when_intro_is_easiest(self) -> None: + report = _fixture_report() + o = report.cross_tier_ordering + assert o.by_average_precision[0] == "intro" + assert o.by_conversion_rate[0] == "intro" + assert o.average_precision_intro_gt_intermediate + assert o.gbm_minus_lr_positive_in_every_tier + + def test_ordering_with_partial_release(self) -> None: + # Only intro present — other ordering booleans default to True. + from leadforge.validation.release_quality import _compute_cross_tier_ordering + + partial = _fixture_report(("intro",)) + o = _compute_cross_tier_ordering(partial.tiers) + assert o.by_average_precision == ["intro"] + assert o.average_precision_intro_gt_intermediate is True + assert o.gbm_minus_lr_positive_in_every_tier + + +# --------------------------------------------------------------------------- +# Bundle-level measurement against a synthetic mini-bundle. +# --------------------------------------------------------------------------- + + +def _write_minimal_bundle( + root: Path, + *, + n: int = 200, + seed: int = 42, + tier_signal: float = 1.0, + include_cohort: bool = True, +) -> Path: + """Hand-build a bundle directory with the minimal contract: + manifest.json + tasks//{train,test,valid}.parquet. + + Tables/ is not required by ``measure_tier_from_bundle`` (only the + task splits and manifest are read). 
+ """ + rng = np.random.default_rng(seed) + n_train = int(n * 0.7) + n_test = int(n * 0.15) + n_valid = n - n_train - n_test + + def _make(n_rows: int, base_day: int) -> pd.DataFrame: + # Generate a feature with explicit signal toward the label. + latent = rng.normal(size=n_rows) + engagement = latent * tier_signal + rng.normal(scale=0.5, size=n_rows) + prob = 1.0 / (1.0 + np.exp(-engagement)) + y = pd.Series(rng.uniform(size=n_rows) < prob, dtype="boolean") + if include_cohort: + ts = pd.date_range("2026-01-01", periods=n_rows, freq="h") + pd.Timedelta(days=base_day) + else: + ts = pd.NaT + df = pd.DataFrame( + { + "lead_id": [f"lead_{base_day:03d}_{i:05d}" for i in range(n_rows)], + "account_id": [f"acct_{base_day:03d}_{i:05d}" for i in range(n_rows)], + "contact_id": [f"cont_{base_day:03d}_{i:05d}" for i in range(n_rows)], + "lead_created_at": pd.Series(ts).astype(str) + if include_cohort + else pd.Series([pd.NA] * n_rows, dtype="string"), + "lead_source": rng.choice( + ["inbound_marketing", "sdr_outbound", "partner_referral"], size=n_rows + ), + "first_touch_channel": rng.choice(["seo", "ppc", "email"], size=n_rows), + "industry": rng.choice(["fintech", "manufacturing", "healthcare"], size=n_rows), + "region": rng.choice(["us", "uk", "eu"], size=n_rows), + "employee_band": rng.choice(["50-200", "200-500"], size=n_rows), + "estimated_revenue_band": rng.choice(["10m-50m", "50m-100m"], size=n_rows), + "process_maturity_band": rng.choice(["low", "med", "high"], size=n_rows), + "role_function": rng.choice(["finance", "ops"], size=n_rows), + "seniority": rng.choice(["manager", "director"], size=n_rows), + "buyer_role": rng.choice(["champion", "economic_buyer"], size=n_rows), + "touch_count": pd.Series( + np.maximum(0, np.round(engagement * 3 + 5)).astype("int64"), dtype="Int64" + ), + "session_count": pd.Series(rng.integers(0, 10, size=n_rows), dtype="Int64"), + "expected_acv": pd.Series( + rng.uniform(20_000, 100_000, size=n_rows), dtype="Float64" + ), + 
"total_touches_all": pd.Series( + np.maximum( + 0, np.round(engagement * 3 + 5 + rng.normal(0, 1, size=n_rows)) + ).astype("int64"), + dtype="Int64", + ), + LABEL_COLUMN: y, + } + ) + return df + + train = _make(n_train, base_day=0) + valid = _make(n_valid, base_day=20) + test = _make(n_test, base_day=30) + + task_dir = root / "tasks" / "converted_within_90_days" + task_dir.mkdir(parents=True, exist_ok=True) + train.to_parquet(task_dir / "train.parquet", index=False) + valid.to_parquet(task_dir / "valid.parquet", index=False) + test.to_parquet(task_dir / "test.parquet", index=False) + + manifest = { + "bundle_schema_version": "5", + "package_version": "1.0.0", + "recipe_id": "b2b_saas_procurement_v1", + "seed": seed, + "exposure_mode": "student_public", + "difficulty": "intermediate", + "n_accounts": n, + "n_contacts": n * 3, + "n_leads": n, + "snapshot_day": 30, + "primary_task": "converted_within_90_days", + "label_window_days": 90, + "tasks": { + "converted_within_90_days": { + "train_rows": n_train, + "valid_rows": n_valid, + "test_rows": n_test, + } + }, + } + (root / "manifest.json").write_text(json.dumps(manifest, indent=2)) + return root + + +class TestBundleMeasurement: + def test_measure_tier_from_synthetic_bundle(self, tmp_path: Path) -> None: + pytest.importorskip("sklearn") + bundle = _write_minimal_bundle(tmp_path / "intermediate", n=400, seed=42) + m = measure_tier_from_bundle(bundle, seed=42, tier_name="intermediate") + assert m.tier == "intermediate" + assert m.n_train > 0 + assert m.n_test > 0 + # Synthetic signal: LR AUC should clear chance comfortably. + assert m.lr_auc > 0.6 + # Headline serialisation contract: every PRECISION_KS key is + # present as a string-keyed entry. + for k in PRECISION_KS: + assert str(k) in m.precision_at_k + assert str(k) in m.recall_at_k + # Calibration bins integrate to the test size. 
+ assert sum(b.n for b in m.calibration_bins) == m.n_test + # ID-only baseline ≈ chance — the synthetic IDs are + # uncorrelated with the latent. Allow generous slack: with + # only 60 test rows (15% of 400) the AUC variance is wide. + assert "id_only" in m.baselines + assert m.baselines["id_only"] > 0.3 + assert m.baselines["id_only"] < 0.7 + + def test_measure_tier_raises_when_train_single_class(self, tmp_path: Path) -> None: + pytest.importorskip("sklearn") + bundle = tmp_path / "degenerate" + # Build a bundle and overwrite the train split with all-zeros. + _write_minimal_bundle(bundle, n=200, seed=42) + train_path = bundle / "tasks/converted_within_90_days/train.parquet" + df = pd.read_parquet(train_path) + df[LABEL_COLUMN] = pd.Series([False] * len(df), dtype="boolean") + df.to_parquet(train_path, index=False) + with pytest.raises(ValueError, match="train split has fewer than two classes"): + measure_tier_from_bundle(bundle, seed=42) + + def test_measure_cohort_shift_returns_random_auc_when_no_timestamp( + self, tmp_path: Path + ) -> None: + pytest.importorskip("sklearn") + # Build a bundle without lead_created_at usable as datetime. + bundle = _write_minimal_bundle(tmp_path / "no_cohort", n=200, seed=42) + train_path = bundle / "tasks/converted_within_90_days/train.parquet" + test_path = bundle / "tasks/converted_within_90_days/test.parquet" + for p in (train_path, test_path): + df = pd.read_parquet(p) + df["lead_created_at"] = pd.Series(["not-a-date"] * len(df), dtype="string") + df.to_parquet(p, index=False) + cs = measure_cohort_shift_from_bundle(bundle, seed=42) + assert not math.isnan(cs.random_split_auc) + assert math.isnan(cs.cohort_split_auc) + assert math.isnan(cs.auc_degradation) + + def test_cohort_shift_returns_well_formed_auc_pair(self, tmp_path: Path) -> None: + """Cohort-shift evaluation returns finite AUCs in [0, 1] and a + signed degradation when ``lead_created_at`` is parseable. 
+ + We don't assert ``cohort_split_auc < random_split_auc`` on + synthetic data — random vs chronological splits over a flat + latent are both ~chance, so ordering is dominated by sample + noise. The behavioural ordering test lives in the round-trip + integration suite where the engine produces a real time-shift. + """ + pytest.importorskip("sklearn") + bundle = _write_minimal_bundle(tmp_path / "cohort", n=500, seed=11) + cs = measure_cohort_shift_from_bundle(bundle, seed=11) + for auc in (cs.random_split_auc, cs.cohort_split_auc): + assert 0.0 <= auc <= 1.0 + assert not math.isnan(auc) + assert cs.auc_degradation == pytest.approx( + cs.random_split_auc - cs.cohort_split_auc, abs=1e-9 + ) + + +# --------------------------------------------------------------------------- +# Cross-seed orchestration on synthetic bundles. +# --------------------------------------------------------------------------- + + +class TestMeasureReleaseQuality: + def test_orchestrator_aggregates_two_seeds(self, tmp_path: Path) -> None: + pytest.importorskip("sklearn") + b42 = _write_minimal_bundle(tmp_path / "intermediate__seed42", n=400, seed=42) + b43 = _write_minimal_bundle(tmp_path / "intermediate__seed43", n=400, seed=43) + report = measure_release_quality( + {"intermediate": {42: b42, 43: b43}}, + generation_timestamp="2026-05-06T12:00:00+00:00", + ) + assert sorted(report.tiers.keys()) == ["intermediate"] + csm = report.tiers["intermediate"] + assert csm.seeds == [42, 43] + assert len(csm.per_seed) == 2 + assert "lr_auc" in csm.medians + assert "lr_auc" in csm.spreads + # Cohort shift was run for the canonical seed (smallest). + assert "intermediate" in report.cohort_shift + assert report.cohort_shift["intermediate"].seed == 42 + # JSON round trip works. 
+ d = report_to_dict(report) + json.dumps(d) diff --git a/tests/validation/test_reporting.py b/tests/validation/test_reporting.py new file mode 100644 index 0000000..edf5bcc --- /dev/null +++ b/tests/validation/test_reporting.py @@ -0,0 +1,256 @@ +"""Tests for :mod:`leadforge.validation.reporting`. + +The renderer is matplotlib-Agg-only and deterministic. We don't visual- +diff the figures (out of scope per the PR plan); we just assert each +contract file is created with non-empty bytes and that the markdown +report cites every JSON path it surfaces. +""" + +from __future__ import annotations + +import json +import re +from pathlib import Path + +from leadforge.validation.release_quality import ( + CalibrationBin, + CohortShiftMetrics, + CrossSeedTierMetrics, + CrossTierOrdering, + ReleaseQualityReport, + TierMetrics, +) +from leadforge.validation.reporting import ( + CALIBRATION_FIGURE, + COHORT_SHIFT_FIGURE, + FIGURES_DIRNAME, + LEAKAGE_DELTA_FIGURE, + LIFT_CURVE_FIGURE_TEMPLATE, + REPORT_JSON, + REPORT_MD, + VALUE_CAPTURE_FIGURE, + render_report, +) + + +def _tier_metrics(tier: str, seed: int, **overrides: object) -> TierMetrics: + base: dict[str, object] = { + "tier": tier, + "seed": seed, + "n_train": 100, + "n_test": 30, + "base_rate": 0.3, + "conversion_rate_train": 0.31, + "conversion_rate_test": 0.30, + "lr_auc": 0.85, + "gbm_auc": 0.88, + "gbm_minus_lr_auc": 0.03, + "lr_average_precision": 0.62, + "gbm_average_precision": 0.65, + "average_precision": 0.62, + "precision_at_k": {"50": 0.66, "100": 0.55}, + "recall_at_k": {"50": 0.45, "100": 0.78}, + "lift_at_pct": {"1": 3.0, "5": 2.5, "10": 2.0}, + "top_decile_rate": 0.6, + "expected_acv_capture_at_k": {"50": 0.55, "100": 0.80}, + "brier_score": 0.12, + "log_loss": 0.34, + "calibration_max_bin_error": 0.18, + "calibration_bins": [ + CalibrationBin(0.0, 0.1, 5, 0.05, 0.02), + CalibrationBin(0.4, 0.5, 4, 0.45, 0.50), + CalibrationBin(0.8, 0.9, 6, 0.85, 0.83), + ], + "baselines": { + "source_only": 0.52, + 
"engagement_only": 0.66, + "post_snapshot_aggregates": 0.55, + "id_only": 0.50, + }, + } + base.update(overrides) + return TierMetrics(**base) # type: ignore[arg-type] + + +_DEFAULT_TIERS: tuple[str, ...] = ("intro", "intermediate", "advanced") + + +def _build_report(tiers: tuple[str, ...] = _DEFAULT_TIERS) -> ReleaseQualityReport: + cs_tiers: dict[str, CrossSeedTierMetrics] = {} + cohort: dict[str, CohortShiftMetrics] = {} + for tier in tiers: + m1 = _tier_metrics(tier=tier, seed=42) + m2 = _tier_metrics(tier=tier, seed=43, lr_auc=0.86, gbm_auc=0.89) + cs_tiers[tier] = CrossSeedTierMetrics( + tier=tier, + seeds=[42, 43], + per_seed=[m1, m2], + medians={ + "lr_auc": 0.855, + "gbm_auc": 0.885, + "gbm_minus_lr_auc": 0.03, + "lr_average_precision": 0.62, + "gbm_average_precision": 0.65, + "brier_score": 0.12, + "log_loss": 0.34, + "calibration_max_bin_error": 0.18, + "top_decile_rate": 0.6, + "conversion_rate_test": 0.30, + }, + spreads={ + "lr_auc": 0.01, + "gbm_auc": 0.01, + "gbm_minus_lr_auc": 0.0, + "lr_average_precision": 0.0, + "brier_score": 0.0, + "calibration_max_bin_error": 0.0, + "top_decile_rate": 0.0, + "conversion_rate_test": 0.0, + }, + ) + cohort[tier] = CohortShiftMetrics( + tier=tier, seed=42, random_split_auc=0.85, cohort_split_auc=0.78, auc_degradation=0.07 + ) + tier_list = list(tiers) + ordering = CrossTierOrdering( + by_average_precision=tier_list, + by_precision_at_100=tier_list, + by_gbm_minus_lr=tier_list, + by_conversion_rate=tier_list, + average_precision_intro_gt_intermediate=True, + average_precision_intermediate_gt_advanced=True, + precision_at_100_intro_gt_intermediate=True, + precision_at_100_intermediate_gt_advanced=True, + conversion_rate_intro_gt_intermediate=True, + conversion_rate_intermediate_gt_advanced=True, + gbm_minus_lr_positive_in_every_tier=True, + ) + return ReleaseQualityReport( + release_id="leadforge-lead-scoring-v1", + package_version="1.0.0", + generation_timestamp="2026-05-06T12:00:00+00:00", + seeds=[42, 43], + 
tiers=cs_tiers, + cohort_shift=cohort, + cross_tier_ordering=ordering, + ) + + +class TestRenderReport: + def test_writes_every_contract_file(self, tmp_path: Path) -> None: + report = _build_report() + out = tmp_path / "release/validation" + written = render_report(report, out) + assert (out / REPORT_JSON).exists() + assert (out / REPORT_MD).exists() + for tier in ("intro", "intermediate", "advanced"): + f = out / FIGURES_DIRNAME / LIFT_CURVE_FIGURE_TEMPLATE.format(tier=tier) + assert f.exists() + assert f.stat().st_size > 0 + for fig in ( + CALIBRATION_FIGURE, + LEAKAGE_DELTA_FIGURE, + COHORT_SHIFT_FIGURE, + VALUE_CAPTURE_FIGURE, + ): + f = out / FIGURES_DIRNAME / fig + assert f.exists() + assert f.stat().st_size > 0 + # Returned mapping covers every artefact name we just inspected. + assert set(written) >= { + "json", + "md", + "lift_curve_intro", + "lift_curve_intermediate", + "lift_curve_advanced", + "calibration", + "leakage_delta", + "cohort_shift", + "value_capture", + } + + def test_json_is_well_formed(self, tmp_path: Path) -> None: + report = _build_report() + out = tmp_path / "v" + render_report(report, out) + d = json.loads((out / REPORT_JSON).read_text()) + assert d["release_id"] == "leadforge-lead-scoring-v1" + assert "tiers" in d + assert "cross_tier_ordering" in d + # Ordering booleans round-trip as JSON true/false (not Python str). + assert d["cross_tier_ordering"]["gbm_minus_lr_positive_in_every_tier"] is True + + def test_markdown_cites_json_paths_for_every_metric_cell(self, tmp_path: Path) -> None: + """G10.6 — every claim has a backing JSON reference.""" + report = _build_report() + out = tmp_path / "v" + render_report(report, out) + md = (out / REPORT_MD).read_text() + # Every numeric cell should be followed by ``(`$.`)``. + # Find table rows under the per-tier headline section. + # We assert at least one citation exists per tier. 
+ for tier in ("intro", "intermediate", "advanced"): + assert re.search( + rf"\| {tier} \|.*\$\.tiers\.{tier}\.medians\.lr_auc", + md, + ), f"missing JSON citation for {tier}.medians.lr_auc" + # Cohort shift section cites every degradation value. + for tier in ("intro", "intermediate", "advanced"): + assert f"$.cohort_shift.{tier}.auc_degradation" in md + # Cross-tier ordering booleans cite their JSON keys. + assert "$.cross_tier_ordering.gbm_minus_lr_positive_in_every_tier" in md + + def test_partial_release_renders_partial_outputs(self, tmp_path: Path) -> None: + """One-tier reports skip lift curves for the missing tiers and + also skip calibration (which is intermediate-only).""" + report = _build_report(("intro",)) + out = tmp_path / "v" + render_report(report, out) + assert (out / FIGURES_DIRNAME / LIFT_CURVE_FIGURE_TEMPLATE.format(tier="intro")).exists() + assert not ( + out / FIGURES_DIRNAME / LIFT_CURVE_FIGURE_TEMPLATE.format(tier="intermediate") + ).exists() + # Calibration figure only renders for the intermediate tier per + # the design contract; intro-only reports skip it. + assert not (out / FIGURES_DIRNAME / CALIBRATION_FIGURE).exists() + + def test_render_is_deterministic_given_same_input(self, tmp_path: Path) -> None: + """Two consecutive renders of the same report produce + byte-identical JSON. Markdown and figures are also stable but + figures depend on the matplotlib version's font cache, so we + only assert byte-equality on the text artefacts.""" + report = _build_report() + a = tmp_path / "a" + b = tmp_path / "b" + render_report(report, a) + render_report(report, b) + assert (a / REPORT_JSON).read_bytes() == (b / REPORT_JSON).read_bytes() + assert (a / REPORT_MD).read_bytes() == (b / REPORT_MD).read_bytes() + + +class TestNanRenderingIsClean: + def test_nan_metrics_render_as_n_a_in_markdown(self, tmp_path: Path) -> None: + report = _build_report() + # Patch one cohort entry to NaN. 
+ report.cohort_shift["intro"] = CohortShiftMetrics( + tier="intro", + seed=42, + random_split_auc=0.85, + cohort_split_auc=float("nan"), + auc_degradation=float("nan"), + ) + out = tmp_path / "v" + render_report(report, out) + md = (out / REPORT_MD).read_text() + # The intro cohort row should carry an n/a marker rather than + # the literal string ``nan``. + intro_row = next( + line + for line in md.splitlines() + if line.startswith("| intro ") and "$.cohort_shift" in line + ) + assert "_n/a_" in intro_row + assert "nan" not in intro_row.lower().replace("$.cohort_shift.intro.", "") + # JSON converted NaN to null. + d = json.loads((out / REPORT_JSON).read_text()) + assert d["cohort_shift"]["intro"]["cohort_split_auc"] is None From 6c0f835294fcd846ea21773dab03f3acb18bb290 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 00:51:39 +0300 Subject: [PATCH 2/4] refactor(validation): cohort-shift uses caller seed; factor NaN return MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review pass on PR 3.2: - ``measure_cohort_shift_from_bundle`` was overriding the caller's ``seed`` with a hardcoded ``COHORT_SEED`` for the chronological-split HistGBM fit only, while the random-split fit used the caller's seed. That made the AUC pair non-comparable across reseeded calls and was surprising — drop the constant and use the caller's seed for both fits. - The four NaN-return branches duplicated the same ``CohortShiftMetrics`` literal with the ``label = tier_name or manifest.difficulty`` expansion inline. Factor into a local ``_no_cohort()`` closure; reduces the function from ~85 to ~65 lines without changing behaviour. Tests + ruff + mypy still clean (28/28 PR-3.2 tests pass). 
Co-Authored-By: Claude Opus 4.7 --- leadforge/validation/release_quality.py | 54 +++++++++---------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/leadforge/validation/release_quality.py b/leadforge/validation/release_quality.py index f682250..387d4b4 100644 --- a/leadforge/validation/release_quality.py +++ b/leadforge/validation/release_quality.py @@ -76,10 +76,6 @@ #: (15%) so the two splits are roughly comparable in test size. COHORT_TRAIN_FRAC: float = 0.85 -#: Random-state seed used inside the cohort-shift HistGBM so the -#: random-vs-cohort comparison shares one source of randomness. -COHORT_SEED: int = 42 - # --------------------------------------------------------------------------- # Result dataclasses @@ -383,6 +379,7 @@ def measure_cohort_shift_from_bundle( sk = _import_sklearn() manifest = load_json(bundle_dir / "manifest.json") primary_task = str(manifest.get("primary_task", "converted_within_90_days")) + label = tier_name or str(manifest.get("difficulty", bundle_dir.name)) train = pd.read_parquet(bundle_dir / f"tasks/{primary_task}/train.parquet") test = pd.read_parquet(bundle_dir / f"tasks/{primary_task}/test.parquet") @@ -398,67 +395,54 @@ def measure_cohort_shift_from_bundle( rand_probs = rand_pipe.predict_proba(x_test)[:, 1] random_auc = float(sk.roc_auc_score(y_test, rand_probs)) - if "lead_created_at" not in train.columns: - # Without a timestamp column, "cohort" has no meaning; emit a - # NaN degradation so PR 3.3 can see this is unsupported on the - # bundle rather than silently report 0. + def _no_cohort() -> CohortShiftMetrics: + # Surface NaN rather than inventing a value when chronological + # resplit is unsupported (no timestamp column / unparseable + # timestamps / single-class early or late half / empty late + # half). PR 3.3's gating layer can then treat NaN as "skip" + # rather than silently scoring 0. 
return CohortShiftMetrics( - tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), + tier=label, seed=seed, random_split_auc=random_auc, cohort_split_auc=float("nan"), auc_degradation=float("nan"), ) + if "lead_created_at" not in train.columns: + return _no_cohort() + pooled = pd.concat([train, test], ignore_index=True) ts = pd.to_datetime(pooled["lead_created_at"], errors="coerce") if ts.isna().any(): - # Same posture as ``probe_snapshot_window`` — a malformed anchor - # would mask the cohort split. Surface it as NaN rather than - # invent a value. - return CohortShiftMetrics( - tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), - seed=seed, - random_split_auc=random_auc, - cohort_split_auc=float("nan"), - auc_degradation=float("nan"), - ) + return _no_cohort() order = np.argsort(ts.values, kind="stable") cutoff = int(round(len(pooled) * COHORT_TRAIN_FRAC)) early_idx = order[:cutoff] late_idx = order[cutoff:] if len(late_idx) == 0: - return CohortShiftMetrics( - tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), - seed=seed, - random_split_auc=random_auc, - cohort_split_auc=float("nan"), - auc_degradation=float("nan"), - ) + return _no_cohort() early = pooled.iloc[early_idx] late = pooled.iloc[late_idx] y_early = early[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values y_late = late[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values if len(set(y_early)) < 2 or len(set(y_late)) < 2: - return CohortShiftMetrics( - tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), - seed=seed, - random_split_auc=random_auc, - cohort_split_auc=float("nan"), - auc_degradation=float("nan"), - ) + return _no_cohort() x_early = _sanitize_categoricals(early[cat_cols + num_cols], cat_cols) x_late = _sanitize_categoricals(late[cat_cols + num_cols], cat_cols) - cohort_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=COHORT_SEED, sk=sk) + # Caller-pinned ``seed`` for both fits; keeping the 
random and cohort + # pipelines on the same RNG seed makes the AUC pair comparable across + # re-runs of the same bundle (they differ only in train/test split). + cohort_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=seed, sk=sk) cohort_pipe.fit(x_early, y_early) cohort_probs = cohort_pipe.predict_proba(x_late)[:, 1] cohort_auc = float(sk.roc_auc_score(y_late, cohort_probs)) return CohortShiftMetrics( - tier=tier_name or str(manifest.get("difficulty", bundle_dir.name)), + tier=label, seed=seed, random_split_auc=random_auc, cohort_split_auc=cohort_auc, From 29a15922f4b79b89f542be874592a33ed9b89c08 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 01:04:01 +0300 Subject: [PATCH 3/4] =?UTF-8?q?refactor(validation):=20self-review=20pass?= =?UTF-8?q?=20on=20PR=203.2=20=E2=80=94=207=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brutal self-review caught seven real issues; this commit addresses them all. C1. Lift curve was fabricated. ``_write_lift_curve`` plotted three measured lift@pct values (1/5/10%) connected by straight lines, then jumped to (100%, 1.0). Saturated ``min(1.0, lift × pct/100)`` lied for high-lift models. Fix: add ``cumulative_gains`` to ``TierMetrics``, sampled at CUMULATIVE_GAINS_PCTS = (0, 10, …, 100); the renderer plots the actual curve. New ``_cumulative_gains_curve`` helper + 2 unit tests (perfect-ranker, no-positives-NaN). C2. Orchestrator passed the bundle's *generation* seed as the model's ``random_state``. ``measure_tier_from_bundle.seed`` was double-dipped: stored on the result for traceability AND used as ``random_state`` for the LR / HistGBM fits. Cross-seed sweeps then confounded data variance with model-RNG variance. Fix: add ``model_random_state: int = DEFAULT_MODEL_RANDOM_STATE`` parameter; ``measure_release_quality`` passes a constant. New unit test asserts identical AUCs across two distinct ``seed`` values when ``model_random_state`` is held constant. 
C3. ``len(set(np.ndarray))`` replaced with ``np.unique(...).size`` at three call sites — idiomatic, type-safe under nullable dtypes. C4. Removed duplicate ``average_precision`` field on ``TierMetrics`` (it was an exact alias of ``lr_average_precision``). D1. ``CrossTierOrdering`` booleans were defaulting to ``True`` when tiers were missing — silently green-lit partial releases at the PR 3.3 gating layer. Fix: type as ``bool | None``; missing pair → ``None``. Test updated to assert ``None`` on partial / empty releases. D2. ``_HEADLINE_FIELDS`` was hand-maintained with no drift guard. Fix: add ``TestHeadlineFieldsRegistry`` meta-test (mirrors the ``PROBE_REGISTRY`` pattern in ``leakage_probes.py``). Asserts every entry is a scalar ``float`` field on ``TierMetrics`` and that the cross-seed aggregator emits a median + spread for each. D3. Cohort-split tie-breaking on duplicate timestamps was non- deterministic across pandas versions / concat orders. Fix: explicit secondary sort key (``lead_id``) when present. P1. Removed three redundant ``import numpy as np`` calls inside figure helpers (numpy is already imported at module top). P3. Markdown report now footnotes the gate references (G6.4 / G7.* / G7.4 / G8.1) so a reader without the acceptance-gates doc can decode the section headers. Acceptance: - 1101/1101 tests pass (was 1095 — 6 new tests). - ruff check + format clean; mypy clean. - ``BUNDLE_SCHEMA_VERSION`` unchanged. - ``probe_relational_leakage.py`` exits 0 on every public tier. 
Co-Authored-By: Claude Opus 4.7 --- leadforge/validation/release_quality.py | 219 +++++++++++++++++------ leadforge/validation/reporting.py | 53 +++--- tests/validation/test_release_quality.py | 115 +++++++++++- tests/validation/test_reporting.py | 8 +- 4 files changed, 318 insertions(+), 77 deletions(-) diff --git a/leadforge/validation/release_quality.py b/leadforge/validation/release_quality.py index 387d4b4..2768bd4 100644 --- a/leadforge/validation/release_quality.py +++ b/leadforge/validation/release_quality.py @@ -54,10 +54,22 @@ #: label-resolution path that goes through the bundle manifest). LABEL_COLUMN = "converted_within_90_days" -#: Default seed used when the caller doesn't pin one. Held constant so -#: the released report is reproducible across re-runs of the driver. +#: Default generation-seed alias used by the driver when invoking +#: :func:`measure_tier_from_bundle` directly on a single bundle. The +#: seed is the bundle's *generation* seed, NOT the model's +#: ``random_state`` — see :data:`DEFAULT_MODEL_RANDOM_STATE` for that. DEFAULT_SEED: int = 42 +#: ``random_state`` used for every sklearn estimator inside this module. +#: Held constant across the cross-seed sweep so the AUC variance the +#: report attributes to "data variance" is *only* data variance — not +#: data-seed × model-seed interaction. Decoupling this from the +#: bundle's generation seed is a real correctness concern: with the +#: previous design, two consecutive seeds happening to align with a +#: HistGBM tree-split tie-break could masquerade as cross-seed +#: instability. +DEFAULT_MODEL_RANDOM_STATE: int = 0 + #: K values for ``precision_at_k`` / ``recall_at_k``. Matches G7.*.6 in #: ``v1_acceptance_gates.md`` (P@100 is the headline; P@50 carries the #: tighter top-of-funnel bound). @@ -67,6 +79,13 @@ #: design-doc §"Release validation" call for "lift@1/5/10%". LIFT_PCTS: tuple[float, ...] = (1.0, 5.0, 10.0) +#: Cumulative-gains curve sampling points (top-X% of leads, by score). 
+#: 11 points at 0%, 10%, …, 100% — coarse enough for a deterministic +#: byte-stable PNG, fine enough that the plotted curve actually traces +#: the ranking quality (rather than 3 measured points connected by a +#: straight line, which was misleading). +CUMULATIVE_GAINS_PCTS: tuple[float, ...] = tuple(float(p) for p in range(0, 101, 10)) + #: Number of equal-width bins for the calibration / reliability diagram. N_CALIBRATION_BINS: int = 10 @@ -113,22 +132,30 @@ class TierMetrics: conversion_rate_test: float # Headline pair: LR (interpretable) vs HistGBM (sophistication-rewarding). + # ``lr_average_precision`` is the canonical "AP" reported in the + # dataset card — the previous version of this dataclass also carried + # an ``average_precision`` field that was an exact alias; that field + # has been removed because two fields with the same value confused + # readers and added JSON noise. lr_auc: float gbm_auc: float gbm_minus_lr_auc: float lr_average_precision: float gbm_average_precision: float - # Average-precision under the LR model (G7.*.5 reports the LR number - # because the LR model is the canonical baseline in the dataset card; - # GBM AP is reported for the cross-tier ordering check below). - average_precision: float - precision_at_k: dict[str, float] recall_at_k: dict[str, float] lift_at_pct: dict[str, float] top_decile_rate: float + # Cumulative-gains curve sampled at :data:`CUMULATIVE_GAINS_PCTS`. + # Each entry maps ``"{pct:g}"`` → fraction of positives captured among + # the top-pct% of leads (sorted descending by predicted P(convert)). + # The renderer plots this directly — earlier versions fabricated the + # curve by interpolating between the three lift@pct measurements, + # which lied about the shape between data points. + cumulative_gains: dict[str, float] + # Value-aware ranking (G7.*.5 / design-doc "expected ACV captured at K"). 
expected_acv_capture_at_k: dict[str, float] @@ -169,19 +196,27 @@ class CohortShiftMetrics: @dataclass(frozen=True) class CrossTierOrdering: - """Cross-tier difficulty ordering (G7.4.*).""" + """Cross-tier difficulty ordering (G7.4.*). + + Each ``*_gt_*`` boolean is ``True`` / ``False`` when both tiers in + the comparison are present in the report, and ``None`` when one or + both are missing. The previous design defaulted these to ``True`` + on missing data, which silently green-lit partial releases at the + PR 3.3 gating layer; ``None`` forces the gating layer to make an + explicit decision (skip vs fail) per tier pair. + """ by_average_precision: list[str] by_precision_at_100: list[str] by_gbm_minus_lr: list[str] by_conversion_rate: list[str] - average_precision_intro_gt_intermediate: bool - average_precision_intermediate_gt_advanced: bool - precision_at_100_intro_gt_intermediate: bool - precision_at_100_intermediate_gt_advanced: bool - conversion_rate_intro_gt_intermediate: bool - conversion_rate_intermediate_gt_advanced: bool - gbm_minus_lr_positive_in_every_tier: bool + average_precision_intro_gt_intermediate: bool | None + average_precision_intermediate_gt_advanced: bool | None + precision_at_100_intro_gt_intermediate: bool | None + precision_at_100_intermediate_gt_advanced: bool | None + conversion_rate_intro_gt_intermediate: bool | None + conversion_rate_intermediate_gt_advanced: bool | None + gbm_minus_lr_positive_in_every_tier: bool | None @dataclass(frozen=True) @@ -245,6 +280,7 @@ def measure_tier_from_bundle( *, seed: int = DEFAULT_SEED, tier_name: str | None = None, + model_random_state: int = DEFAULT_MODEL_RANDOM_STATE, ) -> TierMetrics: """Compute the full :class:`TierMetrics` panel for one bundle. @@ -255,11 +291,17 @@ def measure_tier_from_bundle( Args: bundle_dir: Path to a single-seed bundle root. - seed: Random-state seed for the sklearn estimators. 
Bundle-level - generation seed is read from the manifest separately and is - independent of this argument. + seed: Bundle's *generation* seed; recorded on the result for + traceability and used as the row label in reports. The + sklearn estimator's ``random_state`` is governed by + ``model_random_state`` instead — they MUST be independent + so the cross-seed sweep measures only data variance, not + data-seed × model-seed interaction. tier_name: Override the tier label. Defaults to the bundle's declared difficulty. + model_random_state: ``random_state`` for every sklearn + estimator fitted by this call. Held constant across the + sweep by :func:`measure_release_quality`. Raises: FileNotFoundError: when the manifest or task files are missing. @@ -298,8 +340,8 @@ def measure_tier_from_bundle( x_train = _sanitize_categoricals(train[cat_cols + num_cols], cat_cols) x_test = _sanitize_categoricals(test[cat_cols + num_cols], cat_cols) - lr_pipe = _build_pipeline(num_cols, cat_cols, model="lr", seed=seed, sk=sk) - gbm_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=seed, sk=sk) + lr_pipe = _build_pipeline(num_cols, cat_cols, model="lr", seed=model_random_state, sk=sk) + gbm_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=model_random_state, sk=sk) lr_pipe.fit(x_train, y_train.values) gbm_pipe.fit(x_train, y_train.values) @@ -318,6 +360,7 @@ def measure_tier_from_bundle( r_at_k[str(k)] = _recall_at_k(lr_probs, y_test.values, k) lift_at_pct = {f"{p:g}": _lift_at_pct(lr_probs, y_test.values, p) for p in LIFT_PCTS} top_decile = _top_decile_rate(lr_probs, y_test.values) + cumulative_gains = _cumulative_gains_curve(lr_probs, y_test.values, CUMULATIVE_GAINS_PCTS) acv_capture: dict[str, float] = {} if "expected_acv" in test.columns: @@ -332,7 +375,12 @@ def measure_tier_from_bundle( bins, max_bin_err = _calibration_bins(lr_probs, y_test.values, n_bins=N_CALIBRATION_BINS) baselines = _compute_baselines( - train=train, test=test, y_train=y_train.values, 
y_test=y_test.values, seed=seed, sk=sk + train=train, + test=test, + y_train=y_train.values, + y_test=y_test.values, + seed=model_random_state, + sk=sk, ) return TierMetrics( @@ -348,11 +396,11 @@ def measure_tier_from_bundle( gbm_minus_lr_auc=gbm_auc - lr_auc, lr_average_precision=lr_ap, gbm_average_precision=gbm_ap, - average_precision=lr_ap, precision_at_k=p_at_k, recall_at_k=r_at_k, lift_at_pct=lift_at_pct, top_decile_rate=top_decile, + cumulative_gains=cumulative_gains, expected_acv_capture_at_k=acv_capture, brier_score=brier, log_loss=log_loss, @@ -367,6 +415,7 @@ def measure_cohort_shift_from_bundle( *, seed: int = DEFAULT_SEED, tier_name: str | None = None, + model_random_state: int = DEFAULT_MODEL_RANDOM_STATE, ) -> CohortShiftMetrics: """Random-vs-chronological-cohort split AUC degradation (G6.4). @@ -375,6 +424,9 @@ def measure_cohort_shift_from_bundle( cohort-split AUC. HistGBM is used for both — it handles NaN natively so we don't have to thread a separate imputation pipeline through the chronological resplit. + + See :func:`measure_tier_from_bundle` for the seed / model-seed + decoupling rationale. 
""" sk = _import_sklearn() manifest = load_json(bundle_dir / "manifest.json") @@ -390,7 +442,7 @@ def measure_cohort_shift_from_bundle( y_train = train[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values y_test = test[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values - rand_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=seed, sk=sk) + rand_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=model_random_state, sk=sk) rand_pipe.fit(x_train, y_train) rand_probs = rand_pipe.predict_proba(x_test)[:, 1] random_auc = float(sk.roc_auc_score(y_test, rand_probs)) @@ -417,7 +469,15 @@ def _no_cohort() -> CohortShiftMetrics: if ts.isna().any(): return _no_cohort() - order = np.argsort(ts.values, kind="stable") + # Stable primary key = ``lead_created_at``; deterministic + # tie-breaker = ``lead_id`` so that bundles with many leads sharing + # one timestamp (common with synthetic generators that anchor every + # day) split the same way across pandas versions and concat orders. 
+ if "lead_id" in pooled.columns: + sort_frame = pd.DataFrame({"_ts": ts.values, "_lid": pooled["lead_id"].astype(str).values}) + order = sort_frame.sort_values(["_ts", "_lid"], kind="stable").index.to_numpy() + else: + order = np.argsort(ts.values, kind="stable") cutoff = int(round(len(pooled) * COHORT_TRAIN_FRAC)) early_idx = order[:cutoff] late_idx = order[cutoff:] @@ -428,15 +488,12 @@ def _no_cohort() -> CohortShiftMetrics: late = pooled.iloc[late_idx] y_early = early[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values y_late = late[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values - if len(set(y_early)) < 2 or len(set(y_late)) < 2: + if np.unique(y_early).size < 2 or np.unique(y_late).size < 2: return _no_cohort() x_early = _sanitize_categoricals(early[cat_cols + num_cols], cat_cols) x_late = _sanitize_categoricals(late[cat_cols + num_cols], cat_cols) - # Caller-pinned ``seed`` for both fits; keeping the random and cohort - # pipelines on the same RNG seed makes the AUC pair comparable across - # re-runs of the same bundle (they differ only in train/test split). - cohort_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=seed, sk=sk) + cohort_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=model_random_state, sk=sk) cohort_pipe.fit(x_early, y_early) cohort_probs = cohort_pipe.predict_proba(x_late)[:, 1] cohort_auc = float(sk.roc_auc_score(y_late, cohort_probs)) @@ -501,6 +558,7 @@ def measure_release_quality( release_id: str = "leadforge-lead-scoring-v1", package_version: str | None = None, generation_timestamp: str | None = None, + model_random_state: int = DEFAULT_MODEL_RANDOM_STATE, ) -> ReleaseQualityReport: """Aggregate per-(tier, seed) measurements into a full report. 
@@ -529,7 +587,13 @@ def measure_release_quality( for tier_name, by_seed in tier_bundles.items(): seeds = sorted(by_seed.keys()) per_seed_metrics = [ - measure_tier_from_bundle(by_seed[s], seed=s, tier_name=tier_name) for s in seeds + measure_tier_from_bundle( + by_seed[s], + seed=s, + tier_name=tier_name, + model_random_state=model_random_state, + ) + for s in seeds ] medians, spreads = _aggregate_cross_seed(per_seed_metrics) cross_seed[tier_name] = CrossSeedTierMetrics( @@ -545,7 +609,10 @@ def measure_release_quality( else seeds[0] ) cohort[tier_name] = measure_cohort_shift_from_bundle( - by_seed[canonical], seed=canonical, tier_name=tier_name + by_seed[canonical], + seed=canonical, + tier_name=tier_name, + model_random_state=model_random_state, ) ordering = _compute_cross_tier_ordering(cross_seed) @@ -608,23 +675,32 @@ def _aggregate_cross_seed( def _compute_cross_tier_ordering( cross_seed: Mapping[str, CrossSeedTierMetrics], ) -> CrossTierOrdering: - """Derive G7.4.* ordering booleans + descending tier rankings.""" + """Derive G7.4.* ordering booleans + descending tier rankings. + + Each ``*_gt_*`` boolean is ``None`` when one or both compared tiers + are absent from the report (or carry NaN medians). The previous + design defaulted to ``True`` on missing data, which silently + green-lit partial releases at the PR 3.3 gating layer; ``None`` + forces an explicit decision per pair. + + The ``intro`` / ``intermediate`` / ``advanced`` tier names are + hardcoded because they are the v1 dataset family (per + ``docs/release/v1_release_design.md`` §"Dataset family architecture"); + this function is therefore not a general N-tier comparator. + """ if not cross_seed: - # Empty release → all-True booleans (vacuously satisfied) plus - # empty rankings. PR 3.3's gating layer is the place to assert - # presence of all three canonical tiers, not this function. 
return CrossTierOrdering( by_average_precision=[], by_precision_at_100=[], by_gbm_minus_lr=[], by_conversion_rate=[], - average_precision_intro_gt_intermediate=True, - average_precision_intermediate_gt_advanced=True, - precision_at_100_intro_gt_intermediate=True, - precision_at_100_intermediate_gt_advanced=True, - conversion_rate_intro_gt_intermediate=True, - conversion_rate_intermediate_gt_advanced=True, - gbm_minus_lr_positive_in_every_tier=True, + average_precision_intro_gt_intermediate=None, + average_precision_intermediate_gt_advanced=None, + precision_at_100_intro_gt_intermediate=None, + precision_at_100_intermediate_gt_advanced=None, + conversion_rate_intro_gt_intermediate=None, + conversion_rate_intermediate_gt_advanced=None, + gbm_minus_lr_positive_in_every_tier=None, ) # Build per-tier representative numbers from the median across seeds. @@ -648,13 +724,19 @@ def _sorted_desc(d: Mapping[str, float]) -> list[str]: # NaN sorts last so it doesn't artificially top the ranking. return sorted(d, key=lambda k: (math.isnan(d[k]), -d[k] if not math.isnan(d[k]) else 0.0)) - def _gt(d: Mapping[str, float], a: str, b: str) -> bool: + def _gt(d: Mapping[str, float], a: str, b: str) -> bool | None: + # Missing tier or NaN median → undefined, surface as ``None``. 
if a not in d or b not in d: - return True # tier missing → vacuous + return None if math.isnan(d[a]) or math.isnan(d[b]): - return True + return None return d[a] > d[b] + finite_gbm_lr = [v for v in median_gbm_lr.values() if not math.isnan(v)] + gbm_minus_lr_positive: bool | None = ( + all(v > 0 for v in finite_gbm_lr) if finite_gbm_lr else None + ) + return CrossTierOrdering( by_average_precision=_sorted_desc(median_ap), by_precision_at_100=_sorted_desc(median_p100), @@ -666,9 +748,7 @@ def _gt(d: Mapping[str, float], a: str, b: str) -> bool: precision_at_100_intermediate_gt_advanced=_gt(median_p100, "intermediate", "advanced"), conversion_rate_intro_gt_intermediate=_gt(median_rate, "intro", "intermediate"), conversion_rate_intermediate_gt_advanced=_gt(median_rate, "intermediate", "advanced"), - gbm_minus_lr_positive_in_every_tier=all( - v > 0 for v in median_gbm_lr.values() if not math.isnan(v) - ), + gbm_minus_lr_positive_in_every_tier=gbm_minus_lr_positive, ) @@ -726,6 +806,43 @@ def _expected_acv_capture(probs: np.ndarray, y: np.ndarray, acv: np.ndarray, k: return captured / total +def _cumulative_gains_curve( + probs: np.ndarray, + y: np.ndarray, + pcts: tuple[float, ...], +) -> dict[str, float]: + """Fraction of positives captured at each top-pct% cut-off. + + Stored on :class:`TierMetrics` so the renderer plots the actual + ranking-quality curve instead of fabricating one by interpolating + between three lift@pct measurements. + + For ``pct == 0`` returns 0.0 (an empty selection captures nothing); + for ``pct == 100`` returns 1.0. When there are no positives in + ``y`` the entire curve is NaN — there's no denominator. 
+ """ + n = len(y) + n_pos = int(np.sum(y)) + out: dict[str, float] = {} + if n == 0 or n_pos == 0: + for p in pcts: + out[f"{p:g}"] = float("nan") + return out + order = np.argsort(-np.asarray(probs), kind="stable") + y_sorted = np.asarray(y)[order] + cum = np.cumsum(y_sorted) + for p in pcts: + if p <= 0: + out[f"{p:g}"] = 0.0 + continue + if p >= 100: + out[f"{p:g}"] = 1.0 + continue + k = max(1, int(round(n * p / 100.0))) + out[f"{p:g}"] = float(cum[k - 1] / n_pos) + return out + + def _calibration_bins( probs: np.ndarray, y: np.ndarray, *, n_bins: int = 10 ) -> tuple[list[CalibrationBin], float]: @@ -918,7 +1035,7 @@ def _subset_auc( num_in_subset = [c for c in cols if c not in cat_in_subset] x_tr = _sanitize_categoricals(train[cols], cat_in_subset) x_te = _sanitize_categoricals(test[cols], cat_in_subset) - if len(set(y_train)) < 2 or len(set(y_test)) < 2: + if np.unique(y_train).size < 2 or np.unique(y_test).size < 2: return None pipe = _build_pipeline(num_in_subset, cat_in_subset, model="gbm", seed=seed, sk=sk) pipe.fit(x_tr, y_train) @@ -942,7 +1059,7 @@ def _id_only_auc( so the leakage-probe baseline and the release-quality baseline produce comparable numbers. Expected ≈ 0.5 + ε on a clean bundle. 
""" - if len(set(y_train)) < 2 or len(set(y_test)) < 2: + if np.unique(y_train).size < 2 or np.unique(y_test).size < 2: return None x_tr = _hash_id_columns(train[id_cols]) x_te = _hash_id_columns(test[id_cols]) @@ -1063,6 +1180,8 @@ def report_to_json(report: ReleaseQualityReport, *, indent: int = 2) -> str: __all__ = [ "COHORT_TRAIN_FRAC", + "CUMULATIVE_GAINS_PCTS", + "DEFAULT_MODEL_RANDOM_STATE", "DEFAULT_SEED", "LABEL_COLUMN", "LIFT_PCTS", diff --git a/leadforge/validation/reporting.py b/leadforge/validation/reporting.py index 1d8afc3..deaec6c 100644 --- a/leadforge/validation/reporting.py +++ b/leadforge/validation/reporting.py @@ -36,6 +36,7 @@ from typing import Any import matplotlib +import numpy as np matplotlib.use("Agg") # headless / deterministic; must precede pyplot import. @@ -297,6 +298,13 @@ def _render_markdown(report: ReleaseQualityReport) -> str: out.append("---") out.append("") + out.append("**Gate references** (see `docs/release/v1_acceptance_gates.md`):") + out.append("") + out.append("- **G6.4** — Cohort/time-shift AUC degradation band.") + out.append("- **G7.\\*** — Per-tier ROC-AUC, AP, P@K, lift, calibration bands.") + out.append("- **G7.4** — Cross-tier ordering (AP / P@K / GBM−LR / conversion-rate).") + out.append("- **G8.1** — Cross-seed stability (per-metric spread within tolerance).") + out.append("") out.append(f"_Renderer: `leadforge.validation.reporting`. JSON sibling: `{REPORT_JSON}`._") return "\n".join(out) + "\n" @@ -338,10 +346,14 @@ def _save(fig: Any, path: Path) -> None: def _write_lift_curve(csm: CrossSeedTierMetrics, path: Path) -> None: """Cumulative-gains chart at the median seed for one tier. - The headline-tier picture: x = fraction of leads (sorted by score - descending), y = fraction of positives captured. Diagonal = random - baseline. The lift table in the markdown gives the numbers; the - figure gives the shape. 
+ Plots the actual ``cumulative_gains`` curve sampled by + :func:`leadforge.validation.release_quality._cumulative_gains_curve`. + Earlier versions of this function fabricated the curve by + interpolating between the three measured ``lift_at_pct`` points + (1% / 5% / 10%) and then jumping straight to (100%, 1.0); that lied + about model quality between the data points and saturated at 1.0 + for high-lift models. The fix is to plot the precomputed curve + directly — no interpolation tricks. """ if not csm.per_seed: empty_fig, _ = _figure() @@ -349,18 +361,21 @@ def _write_lift_curve(csm: CrossSeedTierMetrics, path: Path) -> None: return metrics = csm.per_seed[len(csm.per_seed) // 2] fig, ax = _figure(figsize=(6.0, 5.0)) - # Convert lift@pct (precision/base) into a cumulative-gains-style - # coordinate: y = lift × pct/100, capped at 1.0. Computed only - # for the LIFT_PCTS we have; for {20, 50, 100} we fall back to - # base-rate diagonal as those points were not measured. - measured: dict[float, float] = {} - for p in (1.0, 5.0, 10.0): - v = metrics.lift_at_pct.get(f"{p:g}") - if v is not None and not math.isnan(v): - measured[p] = v - pcts = sorted(measured) - ys = [min(1.0, measured[p] * p / 100.0) for p in pcts] - ax.plot([0.0, *pcts, 100.0], [0.0, *ys, 1.0], marker="o", label=f"{csm.tier} (median seed)") + + points: list[tuple[float, float]] = [] + for key, v in metrics.cumulative_gains.items(): + try: + pct = float(key) + except ValueError: + continue + if v is None or math.isnan(v): + continue + points.append((pct, v)) + points.sort() + if points: + xs = [p for p, _ in points] + ys = [v for _, v in points] + ax.plot(xs, ys, marker="o", label=f"{csm.tier} (median seed)") ax.plot([0, 100], [0, 1], linestyle="--", color="grey", label="random") ax.set_xlabel("Top-K% of leads (sorted by predicted P(convert))") ax.set_ylabel("Fraction of positives captured") @@ -413,8 +428,6 @@ def _write_leakage_delta(tiers: Mapping[str, CrossSeedTierMetrics], path: Path) n_groups = 
len(tier_names) n_bars = len(baseline_names) bar_w = 0.8 / max(1, n_bars) - import numpy as np # local import so the module top stays sklearn-free - xs = np.arange(n_groups) for i, bn in enumerate(baseline_names): ys: list[float] = [] @@ -436,8 +449,6 @@ def _write_leakage_delta(tiers: Mapping[str, CrossSeedTierMetrics], path: Path) def _write_cohort_shift(cohort: Mapping[str, CohortShiftMetrics], path: Path) -> None: """Side-by-side bars: random vs chronological-cohort split AUC per tier.""" - import numpy as np - fig, ax = _figure(figsize=(7.0, 4.5)) tier_names = sorted(cohort.keys()) xs = np.arange(len(tier_names)) @@ -462,8 +473,6 @@ def _write_cohort_shift(cohort: Mapping[str, CohortShiftMetrics], path: Path) -> def _write_value_capture(tiers: Mapping[str, CrossSeedTierMetrics], path: Path) -> None: """ACV captured at K (across the K values in :data:`PRECISION_KS`).""" - import numpy as np - fig, ax = _figure(figsize=(7.0, 4.5)) has_any = False for tier_name in sorted(tiers.keys()): diff --git a/tests/validation/test_release_quality.py b/tests/validation/test_release_quality.py index 9d805dc..a7a0730 100644 --- a/tests/validation/test_release_quality.py +++ b/tests/validation/test_release_quality.py @@ -19,6 +19,9 @@ import pytest from leadforge.validation.release_quality import ( + _HEADLINE_FIELDS, + CUMULATIVE_GAINS_PCTS, + DEFAULT_MODEL_RANDOM_STATE, LABEL_COLUMN, PRECISION_KS, CalibrationBin, @@ -27,7 +30,9 @@ CrossTierOrdering, ReleaseQualityReport, TierMetrics, + _aggregate_cross_seed, _calibration_bins, + _cumulative_gains_curve, _expected_acv_capture, _lift_at_pct, _precision_at_k, @@ -122,6 +127,28 @@ def test_calibration_bins_known_miscalibration(self) -> None: populated = [b for b in bins if b.n > 0] assert len(populated) == 1 + def test_cumulative_gains_perfect_ranker_captures_at_top_k(self) -> None: + # 100 leads, 30 positives at top → top-30% captures 100% of positives. 
+ probs = np.linspace(1.0, 0.0, 100) + y = np.zeros(100, dtype=int) + y[:30] = 1 + out = _cumulative_gains_curve(probs, y, (0.0, 10.0, 30.0, 50.0, 100.0)) + assert out["0"] == 0.0 + assert out["100"] == 1.0 + # Top-30% captures all 30 positives. + assert out["30"] == pytest.approx(1.0) + # Top-10% captures 10/30. + assert out["10"] == pytest.approx(10 / 30, abs=1e-6) + # Curve is monotonic non-decreasing. + ordered = [out[f"{p:g}"] for p in (0.0, 10.0, 30.0, 50.0, 100.0)] + assert ordered == sorted(ordered) + + def test_cumulative_gains_no_positives_returns_nan(self) -> None: + out = _cumulative_gains_curve( + np.array([0.9, 0.5, 0.1]), np.zeros(3, dtype=int), (0.0, 50.0, 100.0) + ) + assert all(math.isnan(v) for v in out.values()) + # --------------------------------------------------------------------------- # Dataclass plumbing + JSON serialisation. @@ -142,11 +169,17 @@ def _fixture_tier_metrics(tier: str, seed: int, **overrides: Any) -> TierMetrics "gbm_minus_lr_auc": 0.03, "lr_average_precision": 0.62, "gbm_average_precision": 0.65, - "average_precision": 0.62, "precision_at_k": {"50": 0.66, "100": 0.55}, "recall_at_k": {"50": 0.45, "100": 0.78}, "lift_at_pct": {"1": 3.0, "5": 2.5, "10": 2.0}, "top_decile_rate": 0.6, + "cumulative_gains": { + "0": 0.0, + "10": 0.4, + "20": 0.6, + "50": 0.85, + "100": 1.0, + }, "expected_acv_capture_at_k": {"50": 0.55, "100": 0.80}, "brier_score": 0.12, "log_loss": 0.34, @@ -292,14 +325,56 @@ def test_ordering_when_intro_is_easiest(self) -> None: assert o.gbm_minus_lr_positive_in_every_tier def test_ordering_with_partial_release(self) -> None: - # Only intro present — other ordering booleans default to True. + # Only intro present — pairs that include a missing tier become + # ``None``; the gbm-vs-lr "every tier" still resolves on the + # finite intro median (positive in the fixture). The previous + # design defaulted these to ``True``, silently green-lighting + # partial releases. 
from leadforge.validation.release_quality import _compute_cross_tier_ordering partial = _fixture_report(("intro",)) o = _compute_cross_tier_ordering(partial.tiers) assert o.by_average_precision == ["intro"] - assert o.average_precision_intro_gt_intermediate is True - assert o.gbm_minus_lr_positive_in_every_tier + assert o.average_precision_intro_gt_intermediate is None + assert o.average_precision_intermediate_gt_advanced is None + assert o.gbm_minus_lr_positive_in_every_tier is True + + def test_ordering_returns_none_on_empty_release(self) -> None: + from leadforge.validation.release_quality import _compute_cross_tier_ordering + + o = _compute_cross_tier_ordering({}) + assert o.by_average_precision == [] + assert o.average_precision_intro_gt_intermediate is None + assert o.gbm_minus_lr_positive_in_every_tier is None + + +class TestHeadlineFieldsRegistry: + """Drift-guard for ``_HEADLINE_FIELDS``. + + Mirrors the meta-test pattern in ``test_leakage_probes.py`` — + catches the failure mode where a new metric is added to + :class:`TierMetrics` but the cross-seed aggregator forgets to + include it. + """ + + def test_every_headline_field_is_a_scalar_float_on_tier_metrics(self) -> None: + import typing + + # ``from __future__ import annotations`` stores annotations as + # strings; ``get_type_hints`` resolves them back to real types. 
+ hints = typing.get_type_hints(TierMetrics) + scalar_floats = {name for name, t in hints.items() if t is float} + unknown = set(_HEADLINE_FIELDS) - scalar_floats + assert not unknown, ( + f"_HEADLINE_FIELDS contains entries that are not scalar floats on " + f"TierMetrics: {sorted(unknown)}" + ) + + def test_aggregator_emits_a_median_and_spread_per_field(self) -> None: + per_seed = [_fixture_tier_metrics("intermediate", seed=42)] + medians, spreads = _aggregate_cross_seed(per_seed) + assert set(_HEADLINE_FIELDS) == set(medians.keys()) + assert set(_HEADLINE_FIELDS) == set(spreads.keys()) # --------------------------------------------------------------------------- @@ -432,6 +507,15 @@ def test_measure_tier_from_synthetic_bundle(self, tmp_path: Path) -> None: assert "id_only" in m.baselines assert m.baselines["id_only"] > 0.3 assert m.baselines["id_only"] < 0.7 + # Cumulative gains carries one entry per CUMULATIVE_GAINS_PCTS. + assert set(m.cumulative_gains.keys()) == {f"{p:g}" for p in CUMULATIVE_GAINS_PCTS} + assert m.cumulative_gains["0"] == 0.0 + assert m.cumulative_gains["100"] == pytest.approx(1.0) + # Curve is monotonic non-decreasing. + sorted_pcts = sorted(float(k) for k in m.cumulative_gains) + ys = [m.cumulative_gains[f"{p:g}"] for p in sorted_pcts] + for prev, cur in zip(ys[:-1], ys[1:], strict=True): + assert cur >= prev - 1e-9, f"cumulative gains decreased: {prev} -> {cur}" def test_measure_tier_raises_when_train_single_class(self, tmp_path: Path) -> None: pytest.importorskip("sklearn") @@ -462,6 +546,29 @@ def test_measure_cohort_shift_returns_random_auc_when_no_timestamp( assert math.isnan(cs.cohort_split_auc) assert math.isnan(cs.auc_degradation) + def test_model_random_state_decoupled_from_generation_seed(self, tmp_path: Path) -> None: + """``model_random_state`` controls the sklearn ``random_state``; + the ``seed`` argument is just the bundle's generation seed + recorded for traceability. 
+ + Two ``measure_tier_from_bundle`` calls on the *same* bundle with + the same ``model_random_state`` but different ``seed`` arguments + must produce identical AUCs (data is identical → model is + identical → AUCs are identical). Earlier versions of this + function used ``seed`` for both, so the cross-seed sweep + confounded data variance with model-RNG variance — that's the + bug this test guards against. + """ + pytest.importorskip("sklearn") + bundle = _write_minimal_bundle(tmp_path / "fixed_data", n=400, seed=42) + a = measure_tier_from_bundle(bundle, seed=1, model_random_state=DEFAULT_MODEL_RANDOM_STATE) + b = measure_tier_from_bundle(bundle, seed=2, model_random_state=DEFAULT_MODEL_RANDOM_STATE) + assert a.lr_auc == b.lr_auc + assert a.gbm_auc == b.gbm_auc + # The traceability seed differs. + assert a.seed == 1 + assert b.seed == 2 + def test_cohort_shift_returns_well_formed_auc_pair(self, tmp_path: Path) -> None: """Cohort-shift evaluation returns finite AUCs in [0, 1] and a signed degradation when ``lead_created_at`` is parseable. 
diff --git a/tests/validation/test_reporting.py b/tests/validation/test_reporting.py index edf5bcc..0064030 100644 --- a/tests/validation/test_reporting.py +++ b/tests/validation/test_reporting.py @@ -47,11 +47,17 @@ def _tier_metrics(tier: str, seed: int, **overrides: object) -> TierMetrics: "gbm_minus_lr_auc": 0.03, "lr_average_precision": 0.62, "gbm_average_precision": 0.65, - "average_precision": 0.62, "precision_at_k": {"50": 0.66, "100": 0.55}, "recall_at_k": {"50": 0.45, "100": 0.78}, "lift_at_pct": {"1": 3.0, "5": 2.5, "10": 2.0}, "top_decile_rate": 0.6, + "cumulative_gains": { + "0": 0.0, + "10": 0.4, + "20": 0.6, + "50": 0.85, + "100": 1.0, + }, "expected_acv_capture_at_k": {"50": 0.55, "100": 0.80}, "brier_score": 0.12, "log_loss": 0.34, From 286a633e90f77a4a4169f3f7bdeb39ea6a848237 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 01:09:39 +0300 Subject: [PATCH 4/4] refactor(validation): address Copilot review on PR 3.2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Triage of 6 Copilot comments: 4 real, 2 false-positives / already-treated. Real fixes: * COPILOT-2 — ``measure_cohort_shift_from_bundle`` now class-checks ``y_train`` / ``y_test`` and raises a clear ``ValueError`` on degeneracy, matching ``measure_tier_from_bundle``'s posture instead of letting sklearn's ``roc_auc_score`` raise from inside the call. NaN-return is still reserved for *missing* inputs (no timestamp, unparseable timestamps); a degenerate label is a structural bundle problem and fails loudly. * COPILOT-4 — markdown baseline-cell citations now use the ``$.tiers.{tier}.per_seed[{i}].baselines.{name}`` list-index form that matches the actual JSON shape. The previous ``[seed={seed}]`` selector was invented and unverifiable. * COPILOT-5 — ``Path.write_text`` calls in ``render_report`` pin ``encoding="utf-8"``. 
The default ``locale.getpreferredencoding(False)`` would mojibake the markdown's em-dashes / minus signs on legacy ANSI-locale Windows systems and break the byte-identical-text-artefact promise. * COPILOT-6 — module docstring softened. We no longer claim figures are deterministic byte-for-byte across environments; the guarantee is now JSON+markdown only, and figures are stable within an environment. Pinning matplotlib rcParams + fonts for cross-environment determinism is fragile in practice (font availability / hinting / antialiasing) and out of scope for v1. Resolved without changes: * COPILOT-1 — ``isinstance(obj, list | tuple)`` is valid in Python 3.10+ (PEP 604); leadforge requires-python is ``>=3.11``. Code is correct; verified by every passing test. * COPILOT-3 — the ``COHORT_SEED`` / ``seed`` divergence was already fixed in the previous self-review commit (``6c0f835``). Copilot reviewed the merge-base diff which still showed the original code. New tests (3): - ``test_markdown_baseline_citations_use_list_index`` — citation format must match the actual JSON shape; the invented ``[seed=...]`` selector is gone. - ``test_text_outputs_are_utf8_encoded`` — UTF-8 round-trip on JSON+markdown including the em-dash. - ``test_cohort_shift_raises_on_degenerate_train`` — clear ``ValueError`` on degenerate task splits. Acceptance: 1104/1104 tests pass; ruff + mypy clean. 
Co-Authored-By: Claude Opus 4.7 --- leadforge/validation/release_quality.py | 18 +++++++++++++ leadforge/validation/reporting.py | 34 +++++++++++++++++++----- tests/validation/test_release_quality.py | 14 ++++++++++ tests/validation/test_reporting.py | 25 +++++++++++++++++ 4 files changed, 84 insertions(+), 7 deletions(-) diff --git a/leadforge/validation/release_quality.py b/leadforge/validation/release_quality.py index 2768bd4..87b6883 100644 --- a/leadforge/validation/release_quality.py +++ b/leadforge/validation/release_quality.py @@ -436,12 +436,30 @@ def measure_cohort_shift_from_bundle( train = pd.read_parquet(bundle_dir / f"tasks/{primary_task}/train.parquet") test = pd.read_parquet(bundle_dir / f"tasks/{primary_task}/test.parquet") + if LABEL_COLUMN not in train.columns or LABEL_COLUMN not in test.columns: + raise ValueError(f"task splits must contain the {LABEL_COLUMN!r} label column") + cat_cols, num_cols = _partition_columns(train, exclude={LABEL_COLUMN}) x_train = _sanitize_categoricals(train[cat_cols + num_cols], cat_cols) x_test = _sanitize_categoricals(test[cat_cols + num_cols], cat_cols) y_train = train[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values y_test = test[LABEL_COLUMN].astype("boolean").fillna(False).astype(int).values + # Match the posture of ``measure_tier_from_bundle`` — surface a + # clear ValueError on degenerate task splits rather than letting + # sklearn raise from inside ``roc_auc_score`` with a less + # informative message. The unsupported-cohort path below uses + # NaN, but that is reserved for *missing inputs* (no timestamp, + # unparseable timestamps); a degenerate label is a structural + # bundle problem and should fail loudly. 
+ if np.unique(y_train).size < 2: + raise ValueError( + "train split has fewer than two classes; refusing to fit " + "(a single-class regime breaks every downstream metric)" + ) + if np.unique(y_test).size < 2: + raise ValueError("test split has fewer than two classes; refusing to score") + rand_pipe = _build_pipeline(num_cols, cat_cols, model="gbm", seed=model_random_state, sk=sk) rand_pipe.fit(x_train, y_train) rand_probs = rand_pipe.predict_proba(x_test)[:, 1] diff --git a/leadforge/validation/reporting.py b/leadforge/validation/reporting.py index deaec6c..c3e37de 100644 --- a/leadforge/validation/reporting.py +++ b/leadforge/validation/reporting.py @@ -23,9 +23,17 @@ Matplotlib is the only figure dependency; we force the Agg backend before importing :mod:`matplotlib.pyplot` so this module is safe in -headless CI. Figures are deterministic byte-for-byte under the same -:class:`ReleaseQualityReport` input — the renderer does no sampling and -pins every text-source font option. +headless CI. Determinism guarantees: + +* JSON and markdown outputs are byte-identical across runs of the + renderer on the same :class:`ReleaseQualityReport` (UTF-8 encoded; + ``json.dumps`` uses ``sort_keys=True``). +* PNG figures are stable *within an environment* but may drift across + matplotlib versions and font caches; we deliberately do not pin + rcParams because cross-environment font determinism is fragile in + practice (font availability, hinting, antialiasing all vary). PR + 3.3's driver should regenerate figures fresh per release rather + than relying on hash-based equality. """ from __future__ import annotations @@ -99,12 +107,18 @@ def render_report(report: ReleaseQualityReport, output_dir: Path) -> dict[str, P written: dict[str, Path] = {} + # Pin UTF-8 explicitly — Path.write_text otherwise uses + # ``locale.getpreferredencoding(False)``, which mojibakes the + # markdown's em-dashes / minus signs on non-UTF-8 systems + # (notably Windows in legacy ANSI locales). 
The renderer claims + # byte-identical text artefacts across runs; that promise needs a + # pinned encoding to hold. json_path = output_dir / REPORT_JSON - json_path.write_text(report_to_json(report)) + json_path.write_text(report_to_json(report), encoding="utf-8") written["json"] = json_path md_path = output_dir / REPORT_MD - md_path.write_text(_render_markdown(report)) + md_path.write_text(_render_markdown(report), encoding="utf-8") written["md"] = md_path for tier_name in _LIFT_CURVE_TIERS: @@ -267,10 +281,16 @@ def _render_markdown(report: ReleaseQualityReport) -> str: out.append(header) out.append(sep) for tier_name, csm in sorted(report.tiers.items()): - for tm in csm.per_seed: + # ``per_seed`` is serialised as a plain JSON list, so the + # citation must be by list index (``[i]``) rather than the + # invented ``[seed=]`` selector. Index follows the + # same ordering the orchestrator builds — sorted ascending + # by seed — which the meta-test ``test_orchestrator...`` + # asserts. + for idx, tm in enumerate(csm.per_seed): cells = [tier_name, str(tm.seed)] for bn in baseline_names: - cell_path = f"$.tiers.{tier_name}.per_seed[seed={tm.seed}].baselines.{bn}" + cell_path = f"$.tiers.{tier_name}.per_seed[{idx}].baselines.{bn}" cells.append(_fmt(tm.baselines.get(bn), cell_path)) out.append("| " + " | ".join(cells) + " |") out.append("") diff --git a/tests/validation/test_release_quality.py b/tests/validation/test_release_quality.py index a7a0730..e387038 100644 --- a/tests/validation/test_release_quality.py +++ b/tests/validation/test_release_quality.py @@ -529,6 +529,20 @@ def test_measure_tier_raises_when_train_single_class(self, tmp_path: Path) -> No with pytest.raises(ValueError, match="train split has fewer than two classes"): measure_tier_from_bundle(bundle, seed=42) + def test_cohort_shift_raises_on_degenerate_train(self, tmp_path: Path) -> None: + """Match ``measure_tier_from_bundle``'s posture — surface a + clear ``ValueError`` rather than letting sklearn 
raise from + inside ``roc_auc_score`` with a less informative message. + """ + pytest.importorskip("sklearn") + bundle = _write_minimal_bundle(tmp_path / "degenerate", n=200, seed=42) + train_path = bundle / "tasks/converted_within_90_days/train.parquet" + df = pd.read_parquet(train_path) + df[LABEL_COLUMN] = pd.Series([False] * len(df), dtype="boolean") + df.to_parquet(train_path, index=False) + with pytest.raises(ValueError, match="train split has fewer than two classes"): + measure_cohort_shift_from_bundle(bundle, seed=42) + def test_measure_cohort_shift_returns_random_auc_when_no_timestamp( self, tmp_path: Path ) -> None: diff --git a/tests/validation/test_reporting.py b/tests/validation/test_reporting.py index 0064030..1390297 100644 --- a/tests/validation/test_reporting.py +++ b/tests/validation/test_reporting.py @@ -186,6 +186,31 @@ def test_json_is_well_formed(self, tmp_path: Path) -> None: # Ordering booleans round-trip as JSON true/false (not Python str). assert d["cross_tier_ordering"]["gbm_minus_lr_positive_in_every_tier"] is True + def test_markdown_baseline_citations_use_list_index(self, tmp_path: Path) -> None: + """Per-seed citations must use ``[]`` list-index syntax that + matches the actual JSON shape (``per_seed`` is a list); the + previous ``[seed=]`` selector was invented and unverifiable + against the JSON.""" + report = _build_report() + out = tmp_path / "v" + render_report(report, out) + md = (out / REPORT_MD).read_text() + assert "$.tiers.intro.per_seed[0].baselines." in md + # The invented selector must be gone. + assert "[seed=" not in md + + def test_text_outputs_are_utf8_encoded(self, tmp_path: Path) -> None: + """Markdown contains em-dashes / minus signs; pinning UTF-8 + explicitly is the contract the renderer promises.""" + report = _build_report() + out = tmp_path / "v" + render_report(report, out) + # Decoding as UTF-8 must succeed for both artefacts. 
+ (out / REPORT_JSON).read_text(encoding="utf-8") + md = (out / REPORT_MD).read_text(encoding="utf-8") + # Em-dash in the title section survives. + assert "—" in md + def test_markdown_cites_json_paths_for_every_metric_cell(self, tmp_path: Path) -> None: """G10.6 — every claim has a backing JSON reference.""" report = _build_report()