From 564aaa8899b188dd64045a03b0428cc3c17e7209 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 10:58:56 +0300 Subject: [PATCH 1/7] feat(scripts,docs): channel-signal audit (PR 4.1 deliverable 1+2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/audit_channel_signal.py audits how strongly source channel signals conversion across the release tier family. For each tier we compute per-channel conversion rates and the univariate AUC of channel against converted_within_90_days (scored as the empirical positive rate per channel — a 1-D Bayes classifier equivalent to a saturated logistic regression on one-hot channel features). Outputs JSON + Markdown to docs/release/channel_signal_audit.{json,md}. Tests guard determinism against the committed release/ bundles (a double-run produces byte-identical output) plus per-channel rollup, univariate AUC closed-form, single-class fallback, error paths, and the CLI wiring. The audit confirms what the v1 DGP predicts: channel signal in v1 is weak — across all three tiers the largest per-channel rate spread is 0.043 and the largest univariate AUC is 0.521, well below the G2 / Gemini v2 industry MQL→SQL band (SEO ~51% vs Email <1%). v1 drives conversion through motif-family hazards keyed off latent traits, not channel-conditional probabilities; channel-conditional encoding is tracked as post-v1 work in docs/release/post_v1_roadmap.md. Roadmap: docs/release/v1_release_roadmap.md §"Phase 4 — PR 4.1". 
Co-Authored-By: Claude Opus 4.7 --- docs/release/channel_signal_audit.json | 217 +++++++++ docs/release/channel_signal_audit.md | 91 ++++ scripts/audit_channel_signal.py | 510 +++++++++++++++++++++ tests/scripts/test_audit_channel_signal.py | 237 ++++++++++ 4 files changed, 1055 insertions(+) create mode 100644 docs/release/channel_signal_audit.json create mode 100644 docs/release/channel_signal_audit.md create mode 100644 scripts/audit_channel_signal.py create mode 100644 tests/scripts/test_audit_channel_signal.py diff --git a/docs/release/channel_signal_audit.json b/docs/release/channel_signal_audit.json new file mode 100644 index 0000000..31885a8 --- /dev/null +++ b/docs/release/channel_signal_audit.json @@ -0,0 +1,217 @@ +{ + "channel_columns": [ + "lead_source", + "first_touch_channel" + ], + "industry_mql_to_sql_benchmarks": { + "Email": 0.005, + "PPC": 0.26, + "SEO": 0.51 + }, + "label_column": "converted_within_90_days", + "release_dir": "release", + "task": "converted_within_90_days", + "tiers": [ + { + "columns": [ + { + "channels": [ + { + "conversion_rate": 0.43439490445859874, + "n": 1570, + "n_converted": 682, + "name": "inbound_marketing", + "share": 0.44857142857142857 + }, + { + "conversion_rate": 0.39111747851002865, + "n": 698, + "n_converted": 273, + "name": "partner_referral", + "share": 0.19942857142857143 + }, + { + "conversion_rate": 0.4025974025974026, + "n": 1232, + "n_converted": 496, + "name": "sdr_outbound", + "share": 0.352 + } + ], + "column": "lead_source", + "n_total": 3500, + "overall_conversion_rate": 0.4145714285714286, + "rate_spread": 0.04327742594857009, + "univariate_auc": 0.5199794894149169 + }, + { + "channels": [ + { + "conversion_rate": 0.43439490445859874, + "n": 1570, + "n_converted": 682, + "name": "inbound_marketing", + "share": 0.44857142857142857 + }, + { + "conversion_rate": 0.39111747851002865, + "n": 698, + "n_converted": 273, + "name": "partner_referral", + "share": 0.19942857142857143 + }, + { + 
"conversion_rate": 0.4025974025974026, + "n": 1232, + "n_converted": 496, + "name": "sdr_outbound", + "share": 0.352 + } + ], + "column": "first_touch_channel", + "n_total": 3500, + "overall_conversion_rate": 0.4145714285714286, + "rate_spread": 0.04327742594857009, + "univariate_auc": 0.5199794894149169 + } + ], + "conversion_rate_overall": 0.4145714285714286, + "n_leads": 3500, + "tier": "intro" + }, + { + "columns": [ + { + "channels": [ + { + "conversion_rate": 0.21273885350318472, + "n": 1570, + "n_converted": 334, + "name": "inbound_marketing", + "share": 0.44857142857142857 + }, + { + "conversion_rate": 0.17621776504297995, + "n": 698, + "n_converted": 123, + "name": "partner_referral", + "share": 0.19942857142857143 + }, + { + "conversion_rate": 0.2012987012987013, + "n": 1232, + "n_converted": 248, + "name": "sdr_outbound", + "share": 0.352 + } + ], + "column": "lead_source", + "n_total": 3500, + "overall_conversion_rate": 0.20142857142857143, + "rate_spread": 0.03652108846020477, + "univariate_auc": 0.5212431012826857 + }, + { + "channels": [ + { + "conversion_rate": 0.21273885350318472, + "n": 1570, + "n_converted": 334, + "name": "inbound_marketing", + "share": 0.44857142857142857 + }, + { + "conversion_rate": 0.17621776504297995, + "n": 698, + "n_converted": 123, + "name": "partner_referral", + "share": 0.19942857142857143 + }, + { + "conversion_rate": 0.2012987012987013, + "n": 1232, + "n_converted": 248, + "name": "sdr_outbound", + "share": 0.352 + } + ], + "column": "first_touch_channel", + "n_total": 3500, + "overall_conversion_rate": 0.20142857142857143, + "rate_spread": 0.03652108846020477, + "univariate_auc": 0.5212431012826857 + } + ], + "conversion_rate_overall": 0.20142857142857143, + "n_leads": 3500, + "tier": "intermediate" + }, + { + "columns": [ + { + "channels": [ + { + "conversion_rate": 0.08152866242038216, + "n": 1570, + "n_converted": 128, + "name": "inbound_marketing", + "share": 0.44857142857142857 + }, + { + "conversion_rate": 
0.07593123209169055, + "n": 698, + "n_converted": 53, + "name": "partner_referral", + "share": 0.19942857142857143 + }, + { + "conversion_rate": 0.07792207792207792, + "n": 1232, + "n_converted": 96, + "name": "sdr_outbound", + "share": 0.352 + } + ], + "column": "lead_source", + "n_total": 3500, + "overall_conversion_rate": 0.07914285714285714, + "rate_spread": 0.005597430328691616, + "univariate_auc": 0.5083011208921436 + }, + { + "channels": [ + { + "conversion_rate": 0.08152866242038216, + "n": 1570, + "n_converted": 128, + "name": "inbound_marketing", + "share": 0.44857142857142857 + }, + { + "conversion_rate": 0.07593123209169055, + "n": 698, + "n_converted": 53, + "name": "partner_referral", + "share": 0.19942857142857143 + }, + { + "conversion_rate": 0.07792207792207792, + "n": 1232, + "n_converted": 96, + "name": "sdr_outbound", + "share": 0.352 + } + ], + "column": "first_touch_channel", + "n_total": 3500, + "overall_conversion_rate": 0.07914285714285714, + "rate_spread": 0.005597430328691616, + "univariate_auc": 0.5083011208921436 + } + ], + "conversion_rate_overall": 0.07914285714285714, + "n_leads": 3500, + "tier": "advanced" + } + ] +} diff --git a/docs/release/channel_signal_audit.md b/docs/release/channel_signal_audit.md new file mode 100644 index 0000000..a1f8fc8 --- /dev/null +++ b/docs/release/channel_signal_audit.md @@ -0,0 +1,91 @@ +# Channel-signal audit — leadforge-lead-scoring-v1 + +Audit produced by `scripts/audit_channel_signal.py`; see also `docs/release/channel_signal_audit.json` for the machine-readable form. + +**Scope.** For every tier we compute per-channel conversion rates and the univariate AUC of channel against `converted_within_90_days`, scored as the empirical positive rate per channel (a 1-D Bayes classifier, equivalent to a saturated logistic regression on one-hot channel features). 
Compared against the G2 / Gemini v2 industry MQL→SQL benchmark band (SEO ~51%, PPC ~26%, Email <1%, surfaced in `docs/external_review/summaries/recommendations_pass.md` recommendation #8). + +**Caveat.** Industry benchmarks are MQL→SQL rates, not 90-day closed-won rates. They are the closest public anchor for *how much* channel ought to matter; use them as a band of reference, not a hard target. + +## Industry benchmark band + +| Channel | MQL→SQL conversion rate | +|---|---| +| Email | 0.50% | +| PPC | 26.00% | +| SEO | 51.00% | + +## Tier: `intro` + +`n_leads = 3500`, overall 90-day conversion rate 41.46%. + +### Column: `lead_source` + +Univariate AUC: **0.5200** · Per-channel rate spread (max − min): **0.0433** · Verdict: **weak signal** + +| Channel | n | Share | Converted | Conversion rate | +|---|---:|---:|---:|---:| +| `inbound_marketing` | 1570 | 44.86% | 682 | 43.44% | +| `partner_referral` | 698 | 19.94% | 273 | 39.11% | +| `sdr_outbound` | 1232 | 35.20% | 496 | 40.26% | + +### Column: `first_touch_channel` + +Univariate AUC: **0.5200** · Per-channel rate spread (max − min): **0.0433** · Verdict: **weak signal** + +| Channel | n | Share | Converted | Conversion rate | +|---|---:|---:|---:|---:| +| `inbound_marketing` | 1570 | 44.86% | 682 | 43.44% | +| `partner_referral` | 698 | 19.94% | 273 | 39.11% | +| `sdr_outbound` | 1232 | 35.20% | 496 | 40.26% | + +## Tier: `intermediate` + +`n_leads = 3500`, overall 90-day conversion rate 20.14%. 
+ +### Column: `lead_source` + +Univariate AUC: **0.5212** · Per-channel rate spread (max − min): **0.0365** · Verdict: **weak signal** + +| Channel | n | Share | Converted | Conversion rate | +|---|---:|---:|---:|---:| +| `inbound_marketing` | 1570 | 44.86% | 334 | 21.27% | +| `partner_referral` | 698 | 19.94% | 123 | 17.62% | +| `sdr_outbound` | 1232 | 35.20% | 248 | 20.13% | + +### Column: `first_touch_channel` + +Univariate AUC: **0.5212** · Per-channel rate spread (max − min): **0.0365** · Verdict: **weak signal** + +| Channel | n | Share | Converted | Conversion rate | +|---|---:|---:|---:|---:| +| `inbound_marketing` | 1570 | 44.86% | 334 | 21.27% | +| `partner_referral` | 698 | 19.94% | 123 | 17.62% | +| `sdr_outbound` | 1232 | 35.20% | 248 | 20.13% | + +## Tier: `advanced` + +`n_leads = 3500`, overall 90-day conversion rate 7.91%. + +### Column: `lead_source` + +Univariate AUC: **0.5083** · Per-channel rate spread (max − min): **0.0056** · Verdict: **weak signal** + +| Channel | n | Share | Converted | Conversion rate | +|---|---:|---:|---:|---:| +| `inbound_marketing` | 1570 | 44.86% | 128 | 8.15% | +| `partner_referral` | 698 | 19.94% | 53 | 7.59% | +| `sdr_outbound` | 1232 | 35.20% | 96 | 7.79% | + +### Column: `first_touch_channel` + +Univariate AUC: **0.5083** · Per-channel rate spread (max − min): **0.0056** · Verdict: **weak signal** + +| Channel | n | Share | Converted | Conversion rate | +|---|---:|---:|---:|---:| +| `inbound_marketing` | 1570 | 44.86% | 128 | 8.15% | +| `partner_referral` | 698 | 19.94% | 53 | 7.59% | +| `sdr_outbound` | 1232 | 35.20% | 96 | 7.79% | + +## Verdict + +v1's channel signal is **weak**: across all tiers and both channel columns the largest per-channel conversion-rate spread is 0.043 and the largest univariate AUC is 0.521. That is well below the G2 / Gemini v2 industry MQL→SQL benchmark band, where SEO leads convert 50 percentage points more than Email leads. 
v1 drives conversion through motif-family hazards keyed off latent traits, not channel-conditional probabilities, so this is the expected outcome; channel-conditional encoding is tracked as post-v1 work in `docs/release/post_v1_roadmap.md`. diff --git a/scripts/audit_channel_signal.py b/scripts/audit_channel_signal.py new file mode 100644 index 0000000..e1d390a --- /dev/null +++ b/scripts/audit_channel_signal.py @@ -0,0 +1,510 @@ +#!/usr/bin/env python3 +"""Audit how strongly the lead-source channel signals conversion. + +Companion analysis for PR 4.1 (recommendation #8 v1 scope from +``docs/external_review/summaries/recommendations_pass.md``). For every +tier in a release bundle family we compute: + +* conversion rate by channel (``lead_source`` and ``first_touch_channel``) +* the univariate AUC of channel against ``converted_within_90_days``, + scored as the empirical positive rate per channel (a 1-D Bayes + classifier; equivalent to a saturated logistic regression on one-hot + channel features) + +and compare those to the G2 / Gemini v2 industry MQL→SQL benchmarks. + +Outputs (defaults are pinned via the v1 acceptance gates): + +* ``docs/release/channel_signal_audit.md`` — human-readable audit +* ``docs/release/channel_signal_audit.json`` — machine-readable sibling + +The script is deterministic given a fixed bundle: it reads +``train.parquet`` only, derives empirical rates, and uses +``sklearn.metrics.roc_auc_score`` with no fit-time randomness. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections.abc import Mapping, Sequence +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Final + +import pandas as pd +from sklearn.metrics import roc_auc_score + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +CHANNEL_COLUMNS: Final[tuple[str, ...]] = ("lead_source", "first_touch_channel") +LABEL_COLUMN: Final[str] = "converted_within_90_days" +DEFAULT_TIERS: Final[tuple[str, ...]] = ("intro", "intermediate", "advanced") +DEFAULT_TASK: Final[str] = "converted_within_90_days" + +#: G2 industry MQL→SQL conversion rates surfaced in +#: ``gemini_v2_summary.md`` (recommendation #8). These are not directly +#: comparable to v1's 90-day closed-won label, but they are the closest +#: public anchor for "how much should channel matter" and the audit +#: reports the comparison band rather than asserting a hard match. +INDUSTRY_MQL_TO_SQL_BENCHMARKS: Final[Mapping[str, float]] = { + "SEO": 0.51, + "PPC": 0.26, + "Email": 0.005, +} + +DEFAULT_RELEASE_DIR: Final[Path] = Path("release") +DEFAULT_OUT_MD: Final[Path] = Path("docs/release/channel_signal_audit.md") +DEFAULT_OUT_JSON: Final[Path] = Path("docs/release/channel_signal_audit.json") + +#: Bands used to label the verdict for each channel column. Tuned to +#: surface "weak / moderate / strong" against G2-style benchmarks where +#: SEO vs Email differs by ~50 percentage points. Bands operate on the +#: per-channel max-min conversion-rate spread. 
+SIGNAL_BAND_WEAK_MAX: Final[float] = 0.05 +SIGNAL_BAND_MODERATE_MAX: Final[float] = 0.15 +AUC_NEAR_CHANCE_MAX: Final[float] = 0.55 + + +# --------------------------------------------------------------------------- +# Result dataclasses +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class ChannelStats: + """Per-channel rollup for one channel column in one tier.""" + + name: str + n: int + share: float + n_converted: int + conversion_rate: float + + +@dataclass(frozen=True) +class ChannelAudit: + """Audit results for one channel column in one tier.""" + + column: str + n_total: int + overall_conversion_rate: float + channels: tuple[ChannelStats, ...] + rate_spread: float + univariate_auc: float + + +@dataclass(frozen=True) +class TierAudit: + """Audit results for one tier across every channel column.""" + + tier: str + n_leads: int + conversion_rate_overall: float + columns: tuple[ChannelAudit, ...] + + +@dataclass(frozen=True) +class AuditReport: + """Full audit: every requested tier × channel column.""" + + release_dir: str + task: str + label_column: str + channel_columns: tuple[str, ...] + tiers: tuple[TierAudit, ...] + industry_mql_to_sql_benchmarks: Mapping[str, float] + + +# --------------------------------------------------------------------------- +# Pure functions +# --------------------------------------------------------------------------- + + +def _label_to_int(series: pd.Series) -> pd.Series: + """Coerce a (possibly nullable boolean) label to ``int``.""" + + if series.dtype == "bool": + return series.astype(int) + return pd.to_numeric(series, errors="raise").astype(int) + + +def audit_channel( + df: pd.DataFrame, + channel_col: str, + label_col: str = LABEL_COLUMN, +) -> ChannelAudit: + """Per-channel stats + univariate AUC for a single channel column. + + ``univariate_auc`` is the AUC obtained by replacing each row's + channel value with that channel's empirical positive rate. 
This is + a 1-D Bayes classifier, equivalent (up to ties) to a saturated + logistic regression on one-hot channel features and stable across + sklearn versions. Returns ``0.5`` when the label has only one + class (AUC is undefined) or fewer than two channels are present. + """ + + if channel_col not in df.columns: + raise KeyError(f"channel column {channel_col!r} not present") + if label_col not in df.columns: + raise KeyError(f"label column {label_col!r} not present") + + y = _label_to_int(df[label_col]) + n_total = len(df) + n_converted_total = int(y.sum()) + overall_rate = float(n_converted_total / n_total) if n_total else 0.0 + + # Per-channel rollup, sorted by name for determinism. + grouped = df.assign(_y=y).groupby(channel_col, dropna=False) + rows: list[ChannelStats] = [] + for name, sub in sorted(grouped, key=lambda kv: str(kv[0])): + n = len(sub) + n_conv = int(sub["_y"].sum()) + rows.append( + ChannelStats( + name=str(name), + n=n, + share=float(n / n_total) if n_total else 0.0, + n_converted=n_conv, + conversion_rate=float(n_conv / n) if n else 0.0, + ) + ) + + rate_spread = ( + max(c.conversion_rate for c in rows) - min(c.conversion_rate for c in rows) if rows else 0.0 + ) + + if y.nunique() < 2 or len(rows) < 2: + univariate_auc = 0.5 + else: + rate_lookup = {c.name: c.conversion_rate for c in rows} + scores = df[channel_col].astype(str).map(rate_lookup).astype(float) + univariate_auc = float(roc_auc_score(y.to_numpy(), scores.to_numpy())) + + return ChannelAudit( + column=channel_col, + n_total=n_total, + overall_conversion_rate=overall_rate, + channels=tuple(rows), + rate_spread=float(rate_spread), + univariate_auc=univariate_auc, + ) + + +def audit_tier( + df: pd.DataFrame, + tier: str, + *, + channel_columns: Sequence[str] = CHANNEL_COLUMNS, + label_col: str = LABEL_COLUMN, +) -> TierAudit: + """Run :func:`audit_channel` for every channel column on one tier.""" + + y = _label_to_int(df[label_col]) + n = len(df) + overall_rate = float(int(y.sum()) / n) if n else 0.0 + + columns =
tuple(audit_channel(df, col, label_col=label_col) for col in channel_columns) + return TierAudit( + tier=tier, + n_leads=n, + conversion_rate_overall=overall_rate, + columns=columns, + ) + + +def load_train_df(release_dir: Path, tier: str, task: str = DEFAULT_TASK) -> pd.DataFrame: + """Load ``<release_dir>/<tier>/tasks/<task>/train.parquet``.""" + + path = release_dir / tier / "tasks" / task / "train.parquet" + if not path.exists(): + raise FileNotFoundError(f"missing train split for tier {tier!r}: {path}") + return pd.read_parquet(path) + + +def build_report( + release_dir: Path, + tiers: Sequence[str] = DEFAULT_TIERS, + *, + task: str = DEFAULT_TASK, + channel_columns: Sequence[str] = CHANNEL_COLUMNS, + label_col: str = LABEL_COLUMN, +) -> AuditReport: + """Run the audit across every requested tier.""" + + tier_audits: list[TierAudit] = [] + for tier in tiers: + df = load_train_df(release_dir, tier, task=task) + tier_audits.append( + audit_tier( + df, + tier=tier, + channel_columns=channel_columns, + label_col=label_col, + ) + ) + + return AuditReport( + release_dir=str(release_dir), + task=task, + label_column=label_col, + channel_columns=tuple(channel_columns), + tiers=tuple(tier_audits), + industry_mql_to_sql_benchmarks=dict(INDUSTRY_MQL_TO_SQL_BENCHMARKS), + ) + + +# --------------------------------------------------------------------------- +# Verdict +# --------------------------------------------------------------------------- + + +def _classify_signal(audit: ChannelAudit) -> str: + """Map (rate spread, univariate AUC) to one of weak/moderate/strong.""" + + if audit.univariate_auc < AUC_NEAR_CHANCE_MAX and audit.rate_spread < SIGNAL_BAND_WEAK_MAX: + return "weak" + if audit.rate_spread < SIGNAL_BAND_MODERATE_MAX: + return "moderate" + return "strong" + + +def _verdict_paragraph(report: AuditReport) -> str: + """One-paragraph human-readable verdict.""" + + rows = [ + (tier.tier, col.column, col.rate_spread, col.univariate_auc, _classify_signal(col)) + for tier in
report.tiers + for col in tier.columns + ] + strengths = {row[4] for row in rows} + max_spread = max((row[2] for row in rows), default=0.0) + max_auc = max((row[3] for row in rows), default=0.5) + + seo_minus_email = ( + INDUSTRY_MQL_TO_SQL_BENCHMARKS["SEO"] - INDUSTRY_MQL_TO_SQL_BENCHMARKS["Email"] + ) + + if strengths <= {"weak"}: + verdict = "weak" + intent = ( + "well below the G2 / Gemini v2 industry MQL→SQL benchmark band, where SEO leads " + f"convert {seo_minus_email * 100:.0f} percentage points more than Email leads." + ) + elif "strong" in strengths: + verdict = "strong" + intent = ( + "comparable to or stronger than the G2 / Gemini v2 industry benchmark band — " + "channel-conditional encoding may already be implicit in v1." + ) + else: + verdict = "moderate" + intent = ( + "below the G2 / Gemini v2 industry benchmark band — channel signal is present but " + "weaker than published MQL→SQL spreads." + ) + + return ( + f"v1's channel signal is **{verdict}**: across all tiers and both channel columns the " + f"largest per-channel conversion-rate spread is {max_spread:.3f} and the largest " + f"univariate AUC is {max_auc:.3f}. That is {intent} v1 drives conversion through " + "motif-family hazards keyed off latent traits, not channel-conditional probabilities, " + "so this is the expected outcome; channel-conditional encoding is tracked as post-v1 " + "work in `docs/release/post_v1_roadmap.md`." 
+ ) + + +# --------------------------------------------------------------------------- +# Rendering +# --------------------------------------------------------------------------- + + +def report_to_dict(report: AuditReport) -> dict[str, Any]: + """Convert the report to a JSON-primitive dict (deterministic).""" + + payload = asdict(report) + payload["industry_mql_to_sql_benchmarks"] = dict(report.industry_mql_to_sql_benchmarks) + return payload + + +def render_json(report: AuditReport) -> str: + """Render the audit report as a deterministic JSON string.""" + + return json.dumps(report_to_dict(report), indent=2, sort_keys=True) + "\n" + + +def _format_pct(x: float) -> str: + return f"{x * 100:.2f}%" + + +def render_markdown(report: AuditReport) -> str: + """Render the audit report as Markdown.""" + + lines: list[str] = [] + lines.append("# Channel-signal audit — leadforge-lead-scoring-v1") + lines.append("") + lines.append( + "Audit produced by `scripts/audit_channel_signal.py`; see also " + "`docs/release/channel_signal_audit.json` for the machine-readable form." + ) + lines.append("") + lines.append( + "**Scope.** For every tier we compute per-channel conversion rates and the univariate " + "AUC of channel against `converted_within_90_days`, scored as the empirical positive " + "rate per channel (a 1-D Bayes classifier, equivalent to a saturated logistic " + "regression on one-hot channel features). Compared against the G2 / Gemini v2 industry " + "MQL→SQL benchmark band (SEO ~51%, PPC ~26%, Email <1%, surfaced in " + "`docs/external_review/summaries/recommendations_pass.md` recommendation #8)." + ) + lines.append("") + lines.append( + "**Caveat.** Industry benchmarks are MQL→SQL rates, not 90-day closed-won rates. They " + "are the closest public anchor for *how much* channel ought to matter; use them as a " + "band of reference, not a hard target." 
+ ) + lines.append("") + + lines.append("## Industry benchmark band") + lines.append("") + lines.append("| Channel | MQL→SQL conversion rate |") + lines.append("|---|---|") + for name, rate in sorted(report.industry_mql_to_sql_benchmarks.items()): + lines.append(f"| {name} | {_format_pct(rate)} |") + lines.append("") + + for tier in report.tiers: + lines.append(f"## Tier: `{tier.tier}`") + lines.append("") + lines.append( + f"`n_leads = {tier.n_leads}`, overall 90-day conversion rate " + f"{_format_pct(tier.conversion_rate_overall)}." + ) + lines.append("") + + for col in tier.columns: + lines.append(f"### Column: `{col.column}`") + lines.append("") + lines.append( + f"Univariate AUC: **{col.univariate_auc:.4f}** · " + f"Per-channel rate spread (max − min): **{col.rate_spread:.4f}** · " + f"Verdict: **{_classify_signal(col)} signal**" + ) + lines.append("") + lines.append("| Channel | n | Share | Converted | Conversion rate |") + lines.append("|---|---:|---:|---:|---:|") + for ch in col.channels: + lines.append( + f"| `{ch.name}` | {ch.n} | {_format_pct(ch.share)} | " + f"{ch.n_converted} | {_format_pct(ch.conversion_rate)} |" + ) + lines.append("") + + lines.append("## Verdict") + lines.append("") + lines.append(_verdict_paragraph(report)) + lines.append("") + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def _parse_args(argv: Sequence[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Audit how strongly source channel signals conversion in a release " + "bundle family.", + ) + parser.add_argument( + "--release-dir", + type=Path, + default=DEFAULT_RELEASE_DIR, + help="release bundle root containing one subdirectory per tier (default: %(default)s)", + ) + parser.add_argument( + "--tier", + action="append", + dest="tiers", + default=None, + help="limit the audit to one tier 
(repeatable; default: intro/intermediate/advanced)", + ) + parser.add_argument( + "--task", + default=DEFAULT_TASK, + help="task subdirectory under each tier (default: %(default)s)", + ) + parser.add_argument( + "--channel-column", + action="append", + dest="channel_columns", + default=None, + help="channel column to audit (repeatable; default: lead_source + first_touch_channel)", + ) + parser.add_argument( + "--out-md", + type=Path, + default=DEFAULT_OUT_MD, + help="markdown output path (default: %(default)s)", + ) + parser.add_argument( + "--out-json", + type=Path, + default=DEFAULT_OUT_JSON, + help="JSON output path (default: %(default)s)", + ) + parser.add_argument( + "--print", + action="store_true", + help="print the markdown report to stdout in addition to writing it", + ) + return parser.parse_args(argv) + + +def main(argv: Sequence[str] | None = None) -> int: + args = _parse_args(argv) + release_dir: Path = args.release_dir + tiers: tuple[str, ...] = tuple(args.tiers) if args.tiers else DEFAULT_TIERS + channel_columns: tuple[str, ...] 
= ( + tuple(args.channel_columns) if args.channel_columns else CHANNEL_COLUMNS + ) + + if not release_dir.exists(): + print(f"error: release directory not found: {release_dir}", file=sys.stderr) + return 2 + + try: + report = build_report( + release_dir, + tiers, + task=args.task, + channel_columns=channel_columns, + ) + except FileNotFoundError as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + except KeyError as exc: + print(f"error: required column missing: {exc}", file=sys.stderr) + return 2 + + md = render_markdown(report) + js = render_json(report) + + args.out_md.parent.mkdir(parents=True, exist_ok=True) + args.out_json.parent.mkdir(parents=True, exist_ok=True) + args.out_md.write_text(md) + args.out_json.write_text(js) + + if args.print: + sys.stdout.write(md) + + print(f"wrote {args.out_md}", file=sys.stderr) + print(f"wrote {args.out_json}", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/scripts/test_audit_channel_signal.py b/tests/scripts/test_audit_channel_signal.py new file mode 100644 index 0000000..ff46dea --- /dev/null +++ b/tests/scripts/test_audit_channel_signal.py @@ -0,0 +1,237 @@ +"""Tests for ``scripts/audit_channel_signal.py``. + +Exercises the per-channel rollup, univariate-AUC scorer, and the JSON + +markdown rendering paths. A determinism guard ensures the script's +output is byte-identical across runs against the committed +``release/`` bundles. 
+""" + +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + +import pandas as pd +import pytest + +_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "audit_channel_signal.py" +_REPO_ROOT = Path(__file__).resolve().parents[2] +_spec = importlib.util.spec_from_file_location("audit_channel_signal", _SCRIPT_PATH) +assert _spec is not None +assert _spec.loader is not None +audit_module = importlib.util.module_from_spec(_spec) +sys.modules["audit_channel_signal"] = audit_module +_spec.loader.exec_module(audit_module) + + +# --------------------------------------------------------------------------- +# Synthetic fixture +# --------------------------------------------------------------------------- + + +def _toy_train(n_per_channel: int = 20) -> pd.DataFrame: + """Three channels with deliberately different conversion rates. + + Channel rates: ``A`` 100%, ``B`` 50%, ``C`` 0%. ``B`` is a tied middle + bucket (its positives and negatives share one score), so the + univariate AUC on this fixture is the 3-bucket 1-D Bayes AUC of + 17/18 rather than a perfectly separating 1.0.
+ """ + + rows = [] + for ch, rate in [("A", 1.0), ("B", 0.5), ("C", 0.0)]: + for i in range(n_per_channel): + rows.append( + { + "lead_source": ch, + "first_touch_channel": ch, + "converted_within_90_days": bool(i < int(rate * n_per_channel)), + } + ) + return pd.DataFrame(rows) + + +# --------------------------------------------------------------------------- +# Per-channel rollup +# --------------------------------------------------------------------------- + + +def test_audit_channel_returns_per_channel_stats() -> None: + df = _toy_train() + audit = audit_module.audit_channel(df, "lead_source") + assert audit.column == "lead_source" + assert audit.n_total == 60 + assert audit.overall_conversion_rate == pytest.approx(0.5) + names = [c.name for c in audit.channels] + assert names == ["A", "B", "C"] # sorted by name + by_name = {c.name: c for c in audit.channels} + assert by_name["A"].conversion_rate == pytest.approx(1.0) + assert by_name["B"].conversion_rate == pytest.approx(0.5) + assert by_name["C"].conversion_rate == pytest.approx(0.0) + assert audit.rate_spread == pytest.approx(1.0) + + +def test_audit_channel_univariate_auc_perfectly_separable() -> None: + df = _toy_train() + audit = audit_module.audit_channel(df, "lead_source") + # 20 pos from A (rate 1.0), 10 pos / 10 neg from B (rate 0.5, tied), + # 20 neg from C (rate 0.0). Pair-counting AUC: + # A_pos vs B_neg : 200 wins + # A_pos vs C_neg : 400 wins + # B_pos vs B_neg : 100 ties → +50 + # B_pos vs C_neg : 200 wins + # → 850 / 900 = 17/18. 
+ assert audit.univariate_auc == pytest.approx(17 / 18) + + +def test_audit_channel_handles_single_class_label() -> None: + df = _toy_train() + df["converted_within_90_days"] = False + audit = audit_module.audit_channel(df, "lead_source") + assert audit.univariate_auc == 0.5 # AUC undefined → reported as chance + + +def test_audit_channel_raises_on_missing_column() -> None: + df = _toy_train() + with pytest.raises(KeyError): + audit_module.audit_channel(df, "no_such_column") + + +def test_audit_tier_runs_every_channel_column() -> None: + df = _toy_train() + tier = audit_module.audit_tier(df, "intro") + cols = {c.column for c in tier.columns} + assert cols == {"lead_source", "first_touch_channel"} + assert tier.tier == "intro" + assert tier.n_leads == 60 + + +# --------------------------------------------------------------------------- +# Build / render +# --------------------------------------------------------------------------- + + +def test_build_report_round_trips_through_render_json() -> None: + df = _toy_train() + tier = audit_module.audit_tier(df, "intro") + report = audit_module.AuditReport( + release_dir="release", + task="converted_within_90_days", + label_column="converted_within_90_days", + channel_columns=audit_module.CHANNEL_COLUMNS, + tiers=(tier,), + industry_mql_to_sql_benchmarks=audit_module.INDUSTRY_MQL_TO_SQL_BENCHMARKS, + ) + js = audit_module.render_json(report) + parsed = json.loads(js) + assert parsed["tiers"][0]["tier"] == "intro" + assert parsed["industry_mql_to_sql_benchmarks"]["SEO"] == pytest.approx(0.51) + + +def test_render_markdown_includes_verdict_section() -> None: + df = _toy_train() + tier = audit_module.audit_tier(df, "intro") + report = audit_module.AuditReport( + release_dir="release", + task="converted_within_90_days", + label_column="converted_within_90_days", + channel_columns=audit_module.CHANNEL_COLUMNS, + tiers=(tier,), + industry_mql_to_sql_benchmarks=audit_module.INDUSTRY_MQL_TO_SQL_BENCHMARKS, + ) + md = 
audit_module.render_markdown(report) + assert "## Verdict" in md + assert "## Industry benchmark band" in md + assert "Tier: `intro`" in md + + +# --------------------------------------------------------------------------- +# CLI determinism (guards against accidental nondeterminism in either +# the audit functions or the rendering layer) +# --------------------------------------------------------------------------- + + +_INTRO_TRAIN = ( + _REPO_ROOT / "release" / "intro" / "tasks" / "converted_within_90_days" / "train.parquet" +) + + +@pytest.mark.skipif( + not _INTRO_TRAIN.exists(), + reason="release/intro bundle not present; skipping determinism guard", +) +def test_release_audit_is_deterministic(tmp_path: Path) -> None: + """Two back-to-back runs against the committed release bundle must + produce byte-identical JSON and markdown output.""" + + out_md_a = tmp_path / "a.md" + out_json_a = tmp_path / "a.json" + out_md_b = tmp_path / "b.md" + out_json_b = tmp_path / "b.json" + + rc_a = audit_module.main( + [ + "--release-dir", + str(_REPO_ROOT / "release"), + "--out-md", + str(out_md_a), + "--out-json", + str(out_json_a), + ] + ) + rc_b = audit_module.main( + [ + "--release-dir", + str(_REPO_ROOT / "release"), + "--out-md", + str(out_md_b), + "--out-json", + str(out_json_b), + ] + ) + assert rc_a == 0 + assert rc_b == 0 + assert out_md_a.read_bytes() == out_md_b.read_bytes() + assert out_json_a.read_bytes() == out_json_b.read_bytes() + + +def test_main_reports_missing_release_dir( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + rc = audit_module.main( + [ + "--release-dir", + str(tmp_path / "nope"), + "--out-md", + str(tmp_path / "audit.md"), + "--out-json", + str(tmp_path / "audit.json"), + ] + ) + captured = capsys.readouterr() + assert rc == 2 + assert "release directory not found" in captured.err + + +def test_main_reports_missing_train_split( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + # Empty release dir — tier 
subdirectory missing. + (tmp_path / "release").mkdir() + rc = audit_module.main( + [ + "--release-dir", + str(tmp_path / "release"), + "--tier", + "intro", + "--out-md", + str(tmp_path / "audit.md"), + "--out-json", + str(tmp_path / "audit.json"), + ] + ) + captured = capsys.readouterr() + assert rc == 2 + assert "missing train split" in captured.err From 6008e8402801b8a64b6460f9c4824c7c2b423c42 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 11:04:16 +0300 Subject: [PATCH 2/7] docs(release): release-grade dataset card + generation method + feature dictionary (PR 4.1 deliverable 3-5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs/release/generation_method.md (new) — standalone DGP summary for external readers. Reads alone, references the architecture spec. Covers the five generation layers (motif families → mechanism layer → population → simulation engine → snapshot rendering), the public- vs-instructor split, calibration / validation, and the explicit "what this is not" boundary. * docs/release/feature_dictionary.md (new) — narrative companion to the per-bundle feature_dictionary.csv. Groups the 32 public columns by analytical role (lead identity / firmographics / personographics / engagement / funnel / value) plus the deliberate trap and the target. Documents difficulty modulation parameters, modelling defaults, and pedagogical caveats. Satisfies G10.3. 
* release/README.md (substantial rewrite) — release-grade dataset card per Datasheets-for-Datasets / Data Cards Playbook checklist (G10.1): - macro framing paragraph (2024–2026 SaaS context, recommendation #19) - simulation simplifications section (chatgpt v2 §2.6 — modelled / approximate / not modelled) - calibration documentation linking to validation_report.md - public-vs-instructor redaction policy with concrete column lists citing BANNED_LEAD_COLUMNS / BANNED_OPP_COLUMNS / BANNED_TABLES / SNAPSHOT_FILTERED_TABLES from leakage_probes.py - intended use vs out-of-scope use - known limitations including the G7.4.4 GBM-vs-LR finding and the weak channel signal from the Phase 4 audit - composition section (entities / features / label / splits / provenance) per Datasheets format - adversarial-framing pointer (placeholder link to break-me guide that lands in PR 6.3) - maintenance plan All claims about realism, calibration, or difficulty are anchored to release/validation/validation_report.md per G10.6. Co-Authored-By: Claude Opus 4.7 --- docs/release/feature_dictionary.md | 180 +++++++++++ docs/release/generation_method.md | 178 +++++++++++ release/README.md | 459 +++++++++++++++++++++++------ 3 files changed, 719 insertions(+), 98 deletions(-) create mode 100644 docs/release/feature_dictionary.md create mode 100644 docs/release/generation_method.md diff --git a/docs/release/feature_dictionary.md b/docs/release/feature_dictionary.md new file mode 100644 index 0000000..823b676 --- /dev/null +++ b/docs/release/feature_dictionary.md @@ -0,0 +1,180 @@ +# Feature dictionary — `leadforge-lead-scoring-v1` + +Narrative companion to the per-tier `feature_dictionary.csv` shipped +inside each public bundle. 
The CSV is the authoritative +machine-readable spec (column / dtype / description / category / +target flag / leakage flag); this document groups features by +analytical role and adds the prose explanation, modelling +recommendations, and pedagogical caveats that don't fit a CSV row. + +The grouping below covers every feature in the public student-facing +snapshot — the same 32 columns ship in `intro`, `intermediate`, and +`advanced` bundles. The instructor companion adds the hidden truth +in `metadata/`; it does not change the feature list. + +| Category | Columns | Modelling default | +|---|---|---| +| Lead identity & timing | 4 | drop `lead_id`; keep `lead_created_at` for cohort splits, drop for production | +| Firmographics | 6 | keep all | +| Personographics | 4 | keep all (categorical encoders welcome) | +| Engagement (snapshot-window) | 10 | keep all | +| Funnel & sales-process | 4 | keep all | +| Value | 2 | keep all | +| Leakage trap | 1 | **drop** unless deliberately demonstrating leakage | +| Target | 1 | label — never used as a feature | + +## Lead identity and timing + +| Column | Dtype | Source | Modelling notes | +|---|---|---|---| +| `lead_id` | string | identity | Opaque, deterministic per run; not informative. Use as a join key or row index, never as a feature. | +| `account_id` | string | identity | Foreign key into `tables/accounts.parquet`. Out-of-sample accounts may appear in test; if you fit account-level features, watch for cold-start. | +| `contact_id` | string | identity | Foreign key into `tables/contacts.parquet`. Same warning. | +| `lead_created_at` | string (ISO-8601) | simulation clock | Lead birthday; useful for cohort/time-shift evaluation (see `docs/release/v1_acceptance_gates.md` G6.4). Drop or bin it for production models — feeding raw timestamps to a linear model is rarely what you want. | + +## Firmographics (account-level) + +These describe the buying organisation. 
They come from the recipe's +narrative spec (industry, region, employee bands, revenue bands) +and from latent traits sampled per account. Six columns; all are +fair to use. + +| Column | Dtype | Why it might matter | +|---|---|---| +| `industry` | string | Categorical mix is fixed by the recipe (`manufacturing`, `logistics`, `professional_services`, `healthcare_non_clinical`); motif-family latent biases create modest cross-industry conversion-rate differences. | +| `region` | string | `US` / `UK`. Currently a low-signal axis — the simulator does not model channel-by-region interactions. | +| `employee_band` | string | Bands are aligned with the ICP range (200–2,000 employees, plus tails). Larger accounts trend toward higher expected ACV. | +| `estimated_revenue_band` | string | Bands span `$1M-$10M` to `$200M+`; correlated with `employee_band` by design. | +| `process_maturity_band` | string | A banded (discretised) view of the latent `process_maturity` trait — *visible* signal of `motif_family.fit_dominant`'s "fit beats engagement" story. | + +## Personographics (contact-level) + +These describe the primary contact attached to the lead. +Three categorical features plus the buyer-role label; all four +are fair to use. + +| Column | Dtype | Why it might matter | +|---|---|---| +| `role_function` | string | Functional area: `finance`, `ops`, `it`, `procurement`. Drives demo-page views and the demo/trial path through `motif_family.demo_trial_mediated`. | +| `seniority` | string | `c_suite` / `vp` / `director` / `manager` / `individual_contributor`. Strongly correlated with the latent `contact_authority` trait that gates `motif_family.buying_committee_friction`. | +| `buyer_role` | string | `economic_buyer`, `champion`, `technical_evaluator`, `end_user`. Hand-mapped from `role_function` × `seniority`. | + +## Engagement (snapshot-window aggregates) + +Ten engagement features computed strictly over events on days +`[0, snapshot_day]` (with `snapshot_day = 30` for v1).
The simulator +emits touches, sessions, and page views every day from +`lead_created_at` onward; the renderer aggregates them up to but +not past day 30. The 90-day label window resolves separately, so +features cannot encode events that drove the late-window outcome. + +| Column | Dtype | What it captures | +|---|---|---| +| `touch_count` | Int64 | All marketing/sales touches in the snapshot window. | +| `inbound_touch_count` | Int64 | Inbound touches only. | +| `outbound_touch_count` | Int64 | Outbound touches only. | +| `session_count` | Int64 | Web/trial session count. | +| `pricing_page_views` | Int64 | Cumulative pricing-page views across sessions. | +| `demo_page_views` | Int64 | Cumulative demo-page views across sessions. | +| `total_session_duration_seconds` | Int64 | Cumulative seconds across all sessions. | +| `touches_week_1` | Int64 | Touches in days 0–6 (early urgency proxy). | +| `touches_last_7_days` | Int64 | Touches in days 24–30 (end-of-snapshot-window momentum proxy). | +| `days_since_first_touch` | Float64 | Days since the lead's first touch; NaN if the lead has had zero touches by snapshot day. | + +## Funnel and sales-process + +The funnel state at snapshot day, exposed via four columns. None of +these columns exposes a terminal stage — `current_stage` (which can encode +`closed_won` / `closed_lost`) is redacted from public bundles via +the exposure layer. + +| Column | Dtype | What it captures | +|---|---|---| +| `activity_count` | Int64 | Sales-activity events (calls, demos, follow-ups) in the snapshot window. | +| `days_since_last_touch` | Float64 | Recency of the most recent touch; NaN if zero touches. | +| `opportunity_created` | boolean | Whether *any* opportunity was created by snapshot day, regardless of state. | +| `has_open_opportunity` | boolean | Whether an opportunity existed in an open stage at snapshot day. | + +## Value + +Two value features. Both are useful as inputs to value-aware +ranking (`expected_acv × P(convert)`); see notebook 4 once Phase 6 +ships.
+ +| Column | Dtype | What it captures | +|---|---|---| +| `opportunity_estimated_acv` | Float64 | Estimated ACV of the most recent open opportunity at snapshot day; NaN if no opportunity. | +| `expected_acv` | Float64 | Falls back to a revenue-band midpoint heuristic when no opportunity exists, so it has fewer NaNs than `opportunity_estimated_acv`. | + +## Leakage trap (deliberate) + +| Column | Dtype | Why it ships | +|---|---|---| +| `total_touches_all` | Int64 | Counts touches across the full 90-day horizon — not the snapshot window. Flagged `leakage_risk=True` and `is_leakage_trap=True` in the CSV; documented in `release/README.md`. The gap `total_touches_all − touch_count` carries label-correlated signal because high-converting leads accumulate more late-window touches in the simulator. **Drop this column from your features unless you are explicitly demonstrating leakage detection.** | + +## Target + +| Column | Dtype | Definition | +|---|---|---| +| `converted_within_90_days` | boolean | True iff a `closed_won` event occurred within 90 days of `lead_created_at`. Derived from simulated events; never sampled directly. | + +## Difficulty modulation + +Difficulty profiles distort the same feature set with different +parameters; columns and dtypes are identical across tiers. The +distortions are applied in `leadforge/render/snapshots.py` via +`_apply_difficulty_distortions()`: + +- **Gaussian noise** on float features. `intro` 0.10, `intermediate` + 0.30, `advanced` 0.55 (multipliers applied to per-feature + standard deviations). +- **MCAR missingness.** `intro` 2%, `intermediate` 8%, + `advanced` 18%. +- **Outlier injection** at the same per-tier rate as missingness. +- **Signal strength.** Latent-score weights are multiplied by 0.90 + (`intro`), 0.70 (`intermediate`), and 0.50 (`advanced`), + weakening the link between latent traits and conversion as + difficulty rises. 
+ +The conversion-rate band for each tier is recipe-defined; observed +medians across the canonical seed sweep (42–46) are +0.4267 (`intro`), 0.2160 (`intermediate`), 0.0840 (`advanced`). +See `release/validation/validation_report.md` for the full +cross-seed × cross-tier metrics panel. + +## Recommended modelling defaults + +A short opinionated checklist for a first model: + +1. Drop `lead_id`. Drop or bin `lead_created_at`. Drop + `total_touches_all`. Drop `account_id` / `contact_id` unless + you're joining the relational tables on purpose. +2. One-hot or target-encode the categorical columns + (`industry`, `region`, `employee_band`, + `estimated_revenue_band`, `process_maturity_band`, + `role_function`, `seniority`, `buyer_role`, + `lead_source`, `first_touch_channel`). +3. Keep all snapshot-window engagement and funnel features; the + `Float64` columns carry NaN for "no event in window", which is + itself a signal — encode missingness explicitly rather than + imputing to zero blindly. +4. For value-aware ranking, use `expected_acv` over + `opportunity_estimated_acv` since the latter is missing for + leads without an opportunity. Multiply by your model's predicted + probability for a default value-weighted ranker. +5. For cohort/time-shift evaluation, sort by `lead_created_at` and + split chronologically; the random-split AUC is *not* the + right number to report if your downstream use is forecasting. + +## See also + +- `release/{intro,intermediate,advanced}/feature_dictionary.csv` — + the authoritative machine-readable spec, regenerated with each + bundle. +- `release/README.md` — the dataset card. +- `docs/release/generation_method.md` — how the underlying + events are generated. +- `docs/release/channel_signal_audit.md` — how strongly each + channel column signals conversion in v1. +- `release/validation/validation_report.md` — calibration, lift, + P@K, model-family deltas, cross-seed bands. 
diff --git a/docs/release/generation_method.md b/docs/release/generation_method.md new file mode 100644 index 0000000..d19ef18 --- /dev/null +++ b/docs/release/generation_method.md @@ -0,0 +1,178 @@ +# Generation method — `leadforge-lead-scoring-v1` + +A standalone summary of how the dataset is generated, written for +external readers. Read this before opening the bundle if you want to +know what the data is and how much you can trust each piece of it; for +the full architecture, see [`docs/leadforge_architecture_spec.md`]. + +## What the dataset is + +`leadforge-lead-scoring-v1` is a synthetic mid-market B2B SaaS +lead-scoring dataset generated by +[leadforge](https://github.com/leadforge-dev/leadforge), an +open-source Python framework. Every row, event, and edge is produced +by code in this repository — there is no real CRM behind the data. +The generator is deterministic given a fixed +`(recipe, configuration, seed, package version)` tuple, and the +recipe and seed are recorded in each bundle's `manifest.json`. + +The published family contains three difficulty tiers — `intro`, +`intermediate`, and `advanced` — sharing one fictional company +narrative ("Veridian Procure", a procurement / AP automation SaaS). +The tiers differ only in noise, missingness, and signal strength, +modulated by a difficulty profile that the simulator consumes; the +underlying causal structure is identical. A separate +`*_instructor` companion ships the full hidden truth (causal graph, +latent registry, mechanism summary, full-horizon relational tables). + +## Generation pipeline at a glance + +Generation runs in five layers, top to bottom. Every layer is +deterministic, every layer is seeded from a single root via named +substreams, and every layer is testable in isolation. + +1. 
**Hidden world structure.** A directed acyclic graph (DAG) of + latent traits, lead states, sales-process states, and the + `Converted within 90 days` outcome node, sampled from one of five + *motif families* and then perturbed by stochastic rewiring. The + motif families are intentionally non-uniform: `fit_dominant`, + `intent_dominant`, `sales_execution_sensitive`, + `demo_trial_mediated`, `buying_committee_friction`. Two + independently-sampled bundles share neither the exact graph nor + the edge weights, but they share the constraint that the graph is + acyclic, every node is reachable from a root, and the outcome + node is reachable from every non-root subgraph. +2. **Mechanism layer.** Every node in the sampled graph receives a + concrete mechanism — a logistic latent score, a Poisson intensity + for touch counts, a recency-decayed engagement intensity for + sessions, a categorical influence for source channel, a stage + transition hazard, a conversion hazard, etc. Mechanisms are + assigned by motif family, so a `fit_dominant` graph and an + `intent_dominant` graph end up with materially different + behavior at simulation time. Mechanism parameters are calibrated + so each tier hits its target conversion-rate band; the + `intermediate` tier is the canonical difficulty profile. +3. **Population layer.** Accounts (1,500), contacts (4,200), and + leads (5,000) are drawn with deterministic foreign keys and + ID-stable namespaces (`acct_000001`, `lead_000001`, …). Each + entity carries a vector of latent traits seeded from the world + graph: account fit, process maturity, contact authority, + problem awareness, urgency, etc. Industry, region, employee + band, role, and seniority are all drawn from the recipe's + narrative spec; firmographic correlations come from + motif-family latent biases applied during sampling. +4. 
**Simulation engine.** A 90-day discrete-time simulator + advances every lead day-by-day from MQL through the funnel + (`mql → sal → sql → demo_scheduled → demo_completed → + proposal_sent → negotiation → closed_won/closed_lost`). Each + day, hazards from the mechanism layer fire: stage transitions, + touches (inbound vs outbound, recency-decayed), web sessions + (pricing-page views, demo-page views), sales activities, + churn, and direct conversion for unusual fast paths. Once a + lead reaches `closed_won`, opportunities, customers, and + subscriptions materialise with deterministic foreign keys. + `converted_within_90_days` is *event-derived*: it is true iff + a `closed_won` event occurred within the configured label + window, never sampled directly. +5. **Snapshot rendering.** For every lead, the renderer freezes a + feature snapshot at `snapshot_day` (30 days for v1). + Aggregates such as `touch_count`, `session_count`, + `pricing_page_views`, `expected_acv`, and + `days_since_last_touch` only see events on days + `[0, snapshot_day]`; the label resolves over the full 90-day + horizon. The deliberate exception is `total_touches_all`, + which counts the full-horizon touch history and is flagged as + a pedagogical leakage trap in the feature dictionary. + +## Bundle output + +Each bundle writes a fixed directory layout — a manifest, dataset +card, feature dictionary, relational tables, and the +`converted_within_90_days` task split. The manifest records the +recipe, seed, package version, exposure mode, snapshot day, label +window, schema version, table inventory with row counts, SHA-256 +hashes for every file, and the exact set of redacted columns. Two +runs with the same `(recipe, seed, version)` produce byte-identical +bundles modulo the wall-clock `generation_timestamp` field; +`scripts/verify_hash_determinism.py` enforces this. 
+ +The public (`student_public`) bundle and the instructor companion +share the same generator run; they differ only in *what is +published*. Filtering happens during rendering, not during +simulation: + +- Public bundles route relational tables through + `to_dataframes_snapshot_safe`, which (a) filters event tables + per-lead by `lead_created_at + snapshot_day`, (b) drops + terminal-state columns from `leads` and `opportunities`, and + (c) omits `customers` and `subscriptions` entirely (their + presence is conversion-conditional). +- Instructor companions skip the snapshot-safe writer and ship + full-horizon tables plus a `metadata/` directory containing the + hidden world graph, latent registry, mechanism summary, and + full world spec. They are not appropriate input for the + student-facing task. + +The exact column lists are pinned by `BANNED_LEAD_COLUMNS`, +`BANNED_OPP_COLUMNS`, `BANNED_TABLES`, and +`SNAPSHOT_FILTERED_TABLES` in +`leadforge/validation/leakage_probes.py`; the validator imports the +same constants the writer uses, so the contract is single-sourced. + +## Calibration and validation + +Difficulty calibration is empirical, not analytic: the +intermediate tier is sampled, the conversion-rate band is checked, +and the signal-strength multiplier is tuned until five seeds +(42–46) hit the target band with stable variance. The intro and +advanced tiers reuse the same mechanism assignments with different +distortion parameters (Gaussian noise on float features, MCAR +missingness, outlier injection) calibrated the same way. + +Every claim made about realism, calibration, or difficulty is +backed by `release/validation/validation_report.md`, which is +regenerated by `scripts/validate_release_candidate.py`. 
The driver +runs the full release-quality panel — per-tier ROC-AUC, PR-AUC, log +loss, Brier, calibration bins, lift, P@K, top-decile rate, +expected-ACV capture, model-family deltas, cross-seed bands, +random-vs-cohort split degradation, and the full leakage probe +taxonomy — and exits non-zero if anything falls outside the bands +declared in `docs/release/v1_acceptance_gates_bands.yaml`. + +## What this is not + +- Not a substitute for real CRM data. The vertical, narrative, + and motif families are deliberate fictions chosen to teach + lead-scoring patterns without exposing real customer data. +- Not a benchmark. The difficulty tiers are calibrated for + pedagogy, not for cross-paper comparability. +- Not a temporally rich dataset. The simulator runs in + daily steps over a 90-day horizon. Sales-cycle distributions + are whatever falls out of the daily hazards, not log-normal / + Weibull tails. Demographic strings are clean (no + free-text-job-title messiness). Both are tracked as post-v1 + scope in `docs/release/post_v1_roadmap.md`. 
+ +## Where the code lives + +| Layer | Module | +|---|---| +| Recipe loader, config resolution | `leadforge/api/recipes.py` | +| Public API entry point | `leadforge/api/generator.py` | +| Hidden world DAG | `leadforge/structure/{graph,motifs,rewiring,sampler}.py` | +| Mechanism assignment | `leadforge/mechanisms/{policies,hazards,transitions,counts,categorical}.py` | +| Population draw | `leadforge/simulation/population.py` | +| 90-day daily simulator | `leadforge/simulation/engine.py` | +| Snapshot rendering | `leadforge/render/snapshots.py` | +| Snapshot-safe relational writer | `leadforge/render/relational_snapshot_safe.py` | +| Exposure-mode filtering | `leadforge/exposure/{modes,filters,metadata}.py` | +| Bundle writer | `leadforge/api/bundle.py` | +| Validation contract | `leadforge/validation/{bundle_checks,leakage_probes,release_quality,reporting,difficulty}.py` | + +For the deeper design rationale — why a DAG, why motif families, +why event-derived labels, why public-vs-instructor — see +[`docs/leadforge_design_doc.md`] and +[`docs/leadforge_architecture_spec.md`]. + +[`docs/leadforge_design_doc.md`]: ../leadforge_design_doc.md +[`docs/leadforge_architecture_spec.md`]: ../leadforge_architecture_spec.md diff --git a/release/README.md b/release/README.md index cb85b61..19aec97 100644 --- a/release/README.md +++ b/release/README.md @@ -1,93 +1,124 @@ -# LeadForge: Synthetic B2B Lead Scoring Dataset - -A relational, reproducible, multi-difficulty lead scoring dataset generated by [leadforge](https://github.com/leadforge-dev/leadforge) -- an open-source Python framework for synthetic CRM/funnel data. - -## Why this dataset? - -Most public lead scoring datasets are flat CSVs with opaque provenance. This one is different: - -1. **Relational structure.** 9 normalized tables (accounts, contacts, leads, touches, sessions, sales activities, opportunities, customers, subscriptions) plus ML-ready task splits. 
Practice feature engineering from raw tables, or grab the flat file and start modeling. - -2. **Three difficulty tiers.** Same company, same product, same buyer personas -- different difficulty profiles that produce meaningfully different conversion rates, noise levels, and missingness. - -3. **Reproducible and leakage-safe.** Deterministic generation from a fixed seed. SHA-256 hashes for every file in `manifest.json`. The label-encoding `current_stage` column is stripped from the public bundles in the exposure layer. Event-aggregate features (`touch_count`, `session_count`, `pricing_page_views`, ...) are computed over a 30-day window — they cannot encode events that happen *after* day 30, even though the label resolves over a 90-day window. The only leakage-flagged column that ships in `student_public` is the deliberately included pedagogical trap `total_touches_all`, which counts the full 90-day touch history and is marked `is_leakage_trap=True` in the feature dictionary. +# LeadForge: Synthetic B2B Lead Scoring Dataset (`leadforge-lead-scoring-v1`) + +A relational, reproducible, three-tier synthetic CRM dataset family for +teaching lead scoring at scale. Generated by +[leadforge](https://github.com/leadforge-dev/leadforge) — an +open-source Python framework for synthetic CRM/funnel data — and +released as `leadforge-lead-scoring-v1`. The framework version is +decoupled from the dataset version: the package stays at `1.x`; the +dataset is published under the explicit `…-v1` tag. + +## Why lead scoring matters in 2024–2026 + +The 2024–2026 SaaS environment punishes inefficient sales motions: +median public-SaaS growth has slid from roughly 30% (2023) to about +25% (2025), and the New CAC Ratio rose sharply in 2024 — companies +spent close to $2 of sales-and-marketing for every $1 of net new ARR. +Mid-market vendors can no longer afford to chase every MQL. Predicting +*which* leads convert within a fixed window is now a survival skill, +not a marketing nicety. 
This dataset is built to teach exactly that +skill on a relational substrate, with the realistic confusions +(snapshot-window discipline, leakage traps, channel signal that's +weaker than vendor blogs imply) that students will hit when they +finally get hands on real CRM data. ## What's inside ``` release/ -|-- README.md # This file -|-- LICENSE # MIT -|-- intro/ # Difficulty tier 1 -| |-- manifest.json # Provenance: seed, recipe, version, file hashes -| |-- dataset_card.md # Human-readable dataset summary -| |-- feature_dictionary.csv # Column descriptions, types, leakage flags -| |-- lead_scoring.csv # Flat convenience file (all splits + split column) -| |-- tables/ # 9 relational Parquet tables -| | |-- accounts.parquet -| | |-- contacts.parquet -| | |-- leads.parquet -| | |-- touches.parquet -| | |-- sessions.parquet -| | |-- sales_activities.parquet -| | |-- opportunities.parquet -| | |-- customers.parquet -| | |-- subscriptions.parquet -| |-- tasks/converted_within_90_days/ # Pre-split ML task -| |-- train.parquet # 70% of leads -| |-- valid.parquet # 15% of leads -| |-- test.parquet # 15% of leads -|-- intermediate/ # Difficulty tier 2 (same structure) -|-- advanced/ # Difficulty tier 3 (same structure) -|-- intermediate_instructor/ # Research companion (adds metadata/) -| |-- metadata/ # Hidden causal structure -| |-- graph.json # World graph (DAG) -| |-- graph.graphml # World graph (GraphML) -| |-- world_spec.json # Full generation config -| |-- latent_registry.json # Per-entity latent trait values -| |-- mechanism_summary.json # Causal mechanism assignments -|-- notebooks/ - |-- 01_baseline_lead_scoring.ipynb # Baseline modeling walkthrough +├── README.md # This file +├── LICENSE # MIT +├── intro/ # Difficulty tier 1 (highest signal, lowest noise) +│ ├── manifest.json # Provenance: seed, recipe, version, file hashes +│ ├── dataset_card.md # Per-bundle dataset card (auto-rendered) +│ ├── feature_dictionary.csv # Authoritative column spec +│ ├── 
lead_scoring.csv # Flat convenience CSV (all splits + split column) +│ ├── tables/ # 7 snapshot-safe relational tables +│ │ ├── accounts.parquet +│ │ ├── contacts.parquet +│ │ ├── leads.parquet # No `converted_within_90_days` / `conversion_timestamp` +│ │ ├── touches.parquet # Filtered to `<= lead_created_at + snapshot_day` +│ │ ├── sessions.parquet # Same window +│ │ ├── sales_activities.parquet # Same window +│ │ └── opportunities.parquet # Filtered + no `close_outcome` / `closed_at` +│ └── tasks/converted_within_90_days/ # Pre-split ML task +│ ├── train.parquet # 70% +│ ├── valid.parquet # 15% +│ ├── test.parquet # 15% +│ └── task_manifest.json +├── intermediate/ # Difficulty tier 2 (same shape) +├── advanced/ # Difficulty tier 3 (same shape) +├── intermediate_instructor/ # Research companion (full-horizon + metadata/) +│ ├── … same files plus all 9 relational tables +│ └── metadata/ # Hidden causal structure +│ ├── graph.{json,graphml} # World DAG +│ ├── world_spec.json # Full generation config +│ ├── latent_registry.json # Per-entity latent traits +│ └── mechanism_summary.json +├── notebooks/ +│ └── 01_baseline_lead_scoring.ipynb # Baseline modelling walkthrough (more notebooks land in Phase 6) +└── validation/ + ├── validation_report.{json,md} # Calibration / lift / leakage panel + └── figures/ # Lift curves, calibration, cohort shift, value capture ``` +`student_public` bundles ship the snapshot-safe relational view. +`research_instructor` companions ship the full-horizon view and the +hidden causal truth in `metadata/`. The exposure-mode contract is +enforced in code (see "Public vs instructor: what's redacted" below). 
+ ## Quick start -### Option 1: Flat CSV (simplest) +### Option 1 — flat CSV (simplest) ```python import pandas as pd df = pd.read_csv("intermediate/lead_scoring.csv") train = df[df["split"] == "train"].drop(columns=["split"]) -test = df[df["split"] == "test"].drop(columns=["split"]) +test = df[df["split"] == "test"].drop(columns=["split"]) ``` -### Option 2: Parquet task splits (recommended) +### Option 2 — Parquet task splits (recommended) ```python import pandas as pd train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet") -test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet") +test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet") ``` -**Note:** The label `converted_within_90_days` is evaluated over the full **90 days** from lead creation. Event-aggregate features (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) observe **only the first 30 days** of that window — so even when a lead converts on day 50, the features are frozen at day 30 and cannot encode the conversion event. The deliberate exception is `total_touches_all`, a leakage trap (flagged `leakage_risk=True` and `is_leakage_trap=True` in `feature_dictionary.csv`) that counts touches over the full 90-day horizon. Exclude it from your feature set unless you're explicitly demonstrating leakage detection. The label-encoding `current_stage` column is *not* present in `student_public` bundles -- it appears only in `intermediate_instructor/`. +Engagement features (`touch_count`, `session_count`, +`pricing_page_views`, `expected_acv`, `days_since_last_touch`, …) +are computed strictly over events on days `[0, snapshot_day]` with +`snapshot_day = 30`. The label `converted_within_90_days` resolves +over the full 90-day window, so even when a lead converts on day 50, +the features cannot encode the conversion event. 
The deliberate +exception is `total_touches_all`, which counts the full-horizon touch +history and is **flagged** in `feature_dictionary.csv` as +`leakage_risk=True`. Drop it from your feature set unless you are +demonstrating leakage detection. -### Option 3: Relational tables (feature engineering) +### Option 3 — relational tables (feature engineering) ```python import pandas as pd accounts = pd.read_parquet("intermediate/tables/accounts.parquet") -leads = pd.read_parquet("intermediate/tables/leads.parquet") -touches = pd.read_parquet("intermediate/tables/touches.parquet") +leads = pd.read_parquet("intermediate/tables/leads.parquet") +touches = pd.read_parquet("intermediate/tables/touches.parquet") -# Engineer your own features from raw event tables touch_counts = touches.groupby("lead_id").size().rename("my_touch_count") features = leads.merge(accounts, on="account_id").merge(touch_counts, on="lead_id", how="left") ``` -### Option 4: Reproduce from source +Public relational tables are *snapshot-safe*: terminal outcome columns +are dropped, event tables are filtered to events on or before the +snapshot day, and conversion-conditional entities (`customers`, +`subscriptions`) are absent. Joining the public tables cannot +reconstruct the label. 
+ +### Option 4 — reproduce from source ```bash pip install leadforge @@ -106,65 +137,297 @@ leadforge generate \ | Leads | 5,000 | 5,000 | 5,000 | | Accounts | 1,500 | 1,500 | 1,500 | | Contacts | 4,200 | 4,200 | 4,200 | -| Columns | 32 (student_public) / 34 (instructor) | 32 / 34 | 32 / 34 | +| Snapshot columns | 32 (`student_public`) / 34 (`research_instructor`) | 32 / 34 | 32 / 34 | | Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` | -| Conversion rate (target) | 30-45% | 18-28% | 8-15% | -| Conversion rate (observed) | 41.5% | 20.1% | 7.9% | +| Conversion rate (recipe band) | 24–61% | 12–31% | 4–12% | +| Conversion rate (median, seeds 42–46) | 42.67% | 21.60% | 8.40% | | Signal strength | 0.90 | 0.70 | 0.50 | | Noise scale | 0.10 | 0.30 | 0.55 | | Missing rate | 2% | 8% | 18% | -Higher difficulty means weaker signal, more noise, more missingness, and lower base conversion rate -- all modulated in the simulation engine. Target ranges are defined in `difficulty_profiles.yaml`. +Higher difficulty means weaker latent-to-feature signal, more noise, +more missingness, and lower base conversion rate — all modulated by +the simulation engine, not by post-hoc label-flipping. The full +calibration panel (per-tier ROC-AUC, AP, P@K, lift, calibration, +cross-seed bands, cohort-shift degradation) lives in +[`validation/validation_report.md`](validation/validation_report.md). ## The scenario -**Veridian Technologies** is a Series B startup (Austin, US) selling **Veridian Procure**, a cloud-based procurement and AP automation platform, to mid-market firms (200-2,000 employees) in the US and UK. - -The sales funnel runs through inbound marketing (45%), SDR outbound (35%), and partner referrals (20%). Four buyer personas drive deals: VP Finance (economic buyer), AP Manager (champion), IT Director (technical evaluator), and Procurement Manager (end user). 
- -**Task:** predict whether a lead will convert (closed-won) within 90 days of entering the funnel. - -## Feature dictionary - -Each bundle contains a `dataset_card.md` and a `feature_dictionary.csv` with the authoritative, auto-generated column list, descriptions, dtypes, and `leakage_risk` flags. Refer to those rather than mirroring counts here, which would drift. - -**Leakage handling (bundle schema v4)** - -Two separate mechanisms keep the published feature set leakage-safe: - -1. **Windowed snapshot.** Every event-aggregate feature is computed over a 30-day window (`manifest.snapshot_day == 30`); the label resolves over the full 90 days (`manifest.label_window_days == 90`). Features cannot see touches, sessions, or opportunities that occurred after day 30. The only feature that intentionally crosses this line is `total_touches_all`, the pedagogical trap. -2. **Column redaction.** A small set of columns that *would* encode the label structurally (`current_stage`, `is_sql`) are stripped from `student_public` bundles entirely — both from `tasks/` splits and from `tables/leads.parquet`, so feature engineering off the relational tables cannot recover them. - -| Column | Status in `student_public` | Status in `intermediate_instructor` | Why | -|---|---|---|---| -| `current_stage` | redacted (gone from task splits and `tables/leads.parquet`) | retained | At day 90 this contains terminal stages (`closed_won`/`closed_lost`) that encode the label directly. | -| `is_sql` | redacted | retained | `is_sql=False` predicts non-conversion with very high probability — measured across 5 seeds, P(conv \| is_sql=False) = 0.061 ± 0.026 (intro) / 0.020 ± 0.010 (intermediate) / 0.011 ± 0.004 (advanced). | -| `is_mql` | removed entirely (no mode has it) | removed entirely | Every lead is initialised at MQL stage in the simulator, so the field was constant `True` and carried no information. | -| `total_touches_all` | retained | retained | Deliberate pedagogical leakage trap. 
Counts touches over the full 90-day horizon while every other touch feature stops at day 30, so the gap (`total_touches_all - touch_count`) carries real signal. Flagged `leakage_risk=True` in `feature_dictionary.csv`. Train with and without it, compare AUC, explain the gap. | - -The `redacted_columns` and `snapshot_day` fields in each bundle's `manifest.json` record exactly what was stripped and at what window features were computed. - -## Research companion - -The `intermediate_instructor/` bundle includes the full hidden causal structure: - -- **World graph:** The DAG of causal relationships driving lead outcomes -- **Latent registry:** Per-entity latent trait values (account fit, contact authority, engagement propensity) -- **Mechanism summary:** How each node in the graph maps to simulation behavior - -This enables research on causal inference, model interpretability, and DGP-aware evaluation. +**Veridian Technologies** is a Series B startup (Austin, US) selling +**Veridian Procure**, a cloud-based procurement and AP-automation +platform, to mid-market firms (200–2,000 employees) in the US and UK. +The sales funnel runs through inbound marketing (45%), SDR outbound +(35%), and partner referrals (20%). Four buyer personas drive deals: +VP Finance (economic buyer), AP Manager (champion), IT Director +(technical evaluator), and Procurement Manager (end user). + +**Task:** predict whether a lead converts (`closed_won`) within 90 +days of entering the funnel. + +The scenario is fictional but the funnel structure, role mix, and +ACV bands ($18k–$120k) sit in mid-market B2B SaaS norms. See +[`docs/release/generation_method.md`](../docs/release/generation_method.md) +for how the data is actually produced. + +## Generation method (one-paragraph version) + +The full method is documented in +[`docs/release/generation_method.md`](../docs/release/generation_method.md). 
+Briefly: a hidden DAG of latent traits and lead states is sampled +from one of five motif families (`fit_dominant`, `intent_dominant`, +`sales_execution_sensitive`, `demo_trial_mediated`, +`buying_committee_friction`) and stochastically rewired per seed. +Mechanisms (logistic latent scores, Poisson and recency-decayed +intensities, hazards, and stage transitions) are assigned per node +based on the motif family, calibrated so each tier hits its target +conversion-rate band. Accounts, contacts, and leads are sampled with +deterministic foreign keys; a 90-day daily simulator advances every +lead through the funnel; opportunities, customers, and subscriptions +materialise from `closed_won` events. +`converted_within_90_days` is **event-derived**, never sampled +directly. The renderer freezes a feature snapshot at day 30; the +label resolves over the full 90 days. + +## Simulation simplifications (what's modelled, what's approximate, what's not) + +This dataset is a teaching artifact, not a digital twin. The list +below makes the abstraction boundary explicit so users don't read +realism into things that aren't there. + +**Modelled.** +- Five distinct motif families with motif-conditioned mechanism + assignments. +- 90-day daily-step simulation with stage transitions, conversion + hazards, churn, direct conversion, and post-conversion + opportunity / customer / subscription materialisation. +- Snapshot-window discipline: every public feature aggregates over + events on days `[0, 30]` only; the 90-day label window resolves + separately. +- Difficulty tiers as a bundle of (signal-strength, noise scale, + missingness rate, outlier rate) parameters tuned per tier. +- Recipe-driven narrative: industry mix, region mix, employee / + revenue bands, role / seniority distributions, channel split. + +**Approximate.** +- Lead-source channels (`inbound_marketing`, `sdr_outbound`, + `partner_referral`) are categorical labels, not channel-conditional + generative axes. 
The audit + [`docs/release/channel_signal_audit.md`](../docs/release/channel_signal_audit.md) + measures how strongly channel actually signals conversion in + v1: weak — univariate AUC ≤ 0.521 across all tiers, well below + the G2 / Gemini v2 industry MQL→SQL band (SEO ~51% vs Email <1%). + Real channel-conditional encoding is post-v1 work. +- Sales cycles. Whatever distribution falls out of the daily + hazards. Not log-normal / Weibull-tuned to reproduce the + industry-typical ~84-day median. +- Demographic strings. Job titles and roles are clean categorical + labels, not free-text variants ("VP of Operations" vs "Head of + Ops" vs "Operations VP"). No NLP cleanup is required. +- Industry calibration. Conversion-rate bands are tuned for v1's + fictional vertical, not anchored to per-vertical CRM data + (cybersecurity, fintech, etc.). + +**Not modelled.** +- Macroeconomic shocks, seasonality, fiscal-quarter close cycles. +- Real customer support / churn dynamics post-conversion (the + customer + subscription tables exist for relational completeness + but are not the modelling target in v1). +- Multi-product / cross-sell motions. One product, one task. +- Deliberate noise injection at the *string* level (typos, + capitalisation, encoding). Free-text-cleanup work is post-v1. + +The post-v1 roadmap +([`docs/release/post_v1_roadmap.md`](../docs/release/post_v1_roadmap.md)) +tracks each "approximate" / "not modelled" axis with an explicit +v2 candidate scope. + +## Public vs instructor: what's redacted + +Filtering happens **during rendering**, not during simulation, and +the redaction contract is single-sourced in +[`leadforge/validation/leakage_probes.py`](../leadforge/validation/leakage_probes.py). +The same constants are imported by the snapshot-safe writer +([`leadforge/render/relational_snapshot_safe.py`](../leadforge/render/relational_snapshot_safe.py)) +and by the validator that polices public bundles, so the writer +and the gate cannot drift apart. 
+ +| Constant | Public bundle treatment | +|---|---| +| `BANNED_LEAD_COLUMNS = ("converted_within_90_days", "conversion_timestamp")` | Dropped from `tables/leads.parquet` | +| `BANNED_OPP_COLUMNS = ("close_outcome", "closed_at")` | Dropped from `tables/opportunities.parquet` | +| `BANNED_TABLES = ("customers", "subscriptions")` | Omitted from public bundles entirely | +| `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` | +| Snapshot-feature redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` | +| `total_touches_all` (the deliberate trap) | **Retained** in both modes; flagged `leakage_risk=True` in `feature_dictionary.csv` | + +The `manifest.json` for each bundle records `relational_snapshot_safe` +(true for `student_public`, false for `research_instructor`), +`redacted_columns`, and `snapshot_day`; the bundle is +self-describing. + +The instructor companion (`intermediate_instructor/`) ships the +full-horizon view: all 9 relational tables, no column drops, plus +the hidden causal structure under `metadata/`. It is **not** +appropriate input for the student-facing task. + +## Calibration + +Every numeric claim about realism, calibration, or difficulty in +this README is backed by +[`validation/validation_report.md`](validation/validation_report.md), +which is regenerated by +[`scripts/validate_release_candidate.py`](../scripts/validate_release_candidate.py). +The driver runs an N=5 cross-seed sweep per tier (seeds 42–46), +applies the bands declared in +[`docs/release/v1_acceptance_gates_bands.yaml`](../docs/release/v1_acceptance_gates_bands.yaml), +and exits non-zero if anything falls outside band. The full gate +list is in +[`docs/release/v1_acceptance_gates.md`](../docs/release/v1_acceptance_gates.md); +every dataset-card claim cites the JSON path on the report so +machine-readable verification is possible. 
+ +Headline cross-seed medians for the canonical sweep: + +| Tier | LR AUC | AP | P@100 | Brier | +|---|---|---|---|---| +| intro | 0.879 | 0.761 | 0.80 | 0.130 | +| intermediate | 0.886 | 0.575 | 0.59 | 0.110 | +| advanced | 0.886 | 0.351 | 0.34 | 0.061 | + +AP, P@100, conversion-rate, and lift orderings hold across the +intended difficulty axis (intro > intermediate > advanced); see +[`validation/validation_report.md`](validation/validation_report.md) +for cross-seed spreads, calibration bins, lift curves, and +cohort-shift degradation. + +## Intended uses + +- Teaching baseline lead-scoring modelling on a flat snapshot. +- Teaching relational feature engineering against snapshot-safe + raw tables. +- Teaching leakage detection: the deliberate trap + (`total_touches_all`) is designed to be discoverable. +- Teaching calibration, lift, P@K, value-aware ranking + (`expected_acv × P(convert)`), and cohort-shift evaluation. +- Comparing model families (linear vs tree) under a controlled DGP. + +## Out-of-scope uses + +- **Production lead scoring.** This is synthetic data; the company, + product, and customers are fictional. Do not deploy a model + trained on `leadforge-lead-scoring-v1` against real leads. +- **Vendor benchmarking or paper baselines.** The difficulty tiers + are calibrated for pedagogy; cross-paper comparability is not a + design goal. +- **Causal inference research that requires recovery of the true + DGP.** The instructor companion exposes the hidden graph for + teaching purposes, but real causal-inference benchmarks need + designed counterfactuals, not a sampled DAG. +- **Demographic / fairness research.** v1 does not model protected + attributes or sensitive demographic axes; any "bias" you find is + a bug in the simulation, not a teaching artefact. + +## Known limitations + +- **Difficulty signal on raw AUC is flat.** Across the canonical + sweep, LR AUC is ~0.88 in every tier. 
Difficulty is visible in + AP / P@K / Brier / value capture, not in AUC alone — the + validation report uses AP and P@K as the headline difficulty + axis. Treat AUC as a sanity check, not a difficulty signal. +- **GBM does not consistently beat LR on this snapshot + (gate G7.4.4).** Across the canonical sweep, the GBM−LR AUC + delta is slightly negative in every tier + (intro −0.0045, intermediate −0.0072, advanced −0.0133). v1's + snapshot is dominated by linear features (engagement aggregates + + firmographics) and a HistGBM does not consistently beat a + regularised logistic regression at this signal level. The + cross-tier sign check is therefore *informational* in v1; the + per-tier `gbm_minus_lr_auc` bands gate the release. v2 will + introduce non-linear interactions in the simulator (saturation + curves, threshold effects) so the gate bites; tracked in + [`docs/release/post_v1_roadmap.md`](../docs/release/post_v1_roadmap.md). +- **Channel signal is weak versus published industry data.** + Per [`docs/release/channel_signal_audit.md`](../docs/release/channel_signal_audit.md), + the largest per-channel rate spread is 0.043 and the largest + univariate AUC is 0.521 — well below the G2 / Gemini v2 + MQL→SQL band. Channel-conditional encoding is post-v1 work. +- **Cohort-shift degradation is small.** v1's bundles are + roughly IID-balanced over the 90-day horizon (no time-of-year + drift baked in). The cohort-shift gate (G6.4) is informational + in v1 and will bite in v2 once seasonality is injected. +- **Calibration is noisy at small per-bin n** in the advanced + tier (low base rate × small calibration bins). The Brier score + is the more reliable calibration signal at advanced; per-bin + calibration error is bounded by the gate but should not be + read as a precise miscalibration claim. 
+ +## Composition (Datasheets-for-Datasets) + +- **Entities.** Accounts (the buying organisations), contacts (the + human stakeholders attached to accounts), leads (the funnel + unit; one per lead-creation event), touches (marketing/sales + contact events), sessions (web/trial sessions), + sales_activities (rep-logged activities), opportunities (sales + cycles attached to leads), and — instructor only — customers + and subscriptions (post-conversion entities). Per-row counts + are recorded in each bundle's `manifest.json` `tables` block. +- **Features.** 32 columns in the public student-facing snapshot, + grouped by category in + [`docs/release/feature_dictionary.md`](../docs/release/feature_dictionary.md). + The authoritative per-bundle CSV is + `feature_dictionary.csv`; it carries dtype, description, + category, target flag, and leakage flag for every column. +- **Label.** `converted_within_90_days` (boolean), event-derived + from the simulator. Never sampled directly. +- **Splits.** 70/15/15 train/valid/test, deterministic given the + bundle seed; recorded in `tasks/converted_within_90_days/task_manifest.json`. +- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, + package version stamped in `manifest.json`. The full hidden + DAG, latent registry, and mechanism summary are in the + instructor companion's `metadata/` directory. + +## Maintenance + +- **Versioning.** This is dataset version `v1`; the framework that + produced it is at package version `1.0.0+`. Future dataset + versions are tagged independently as `leadforge-lead-scoring-v2`, + etc.; the framework version is a separate axis. See + [`docs/release/v1_release_design.md`](../docs/release/v1_release_design.md) + for the rationale. +- **Issue templates.** `.github/ISSUE_TEMPLATE/` ships + `dataset_breakage_report.yml` (for "I broke the dataset") and + `realism_feedback.yml` (for realism critiques) once Phase 6 + lands. +- **Adversarial framing.** We *want* the dataset to be broken. 
+ See `docs/release/break_me_guide.md` (lands in Phase 6) for + explicit invitations to find direct leakage, reconstruct labels + through joins, beat the baseline lift legitimately, surface + unrealistic distributions, identify documentation ambiguity, + and propose better calibration sources. +- **v2 decision log.** Once Phase 6 ships, + `docs/release/v2_decision_log.md` will track every accepted + v1 finding and the design call that came from it. +- **Maintainers.** [leadforge-dev](https://github.com/leadforge-dev/leadforge) + on GitHub. File issues; PRs welcome. ## Provenance | Field | Value | |---|---| -| Generator | [leadforge](https://github.com/leadforge-dev/leadforge) v1.0.0 | +| Generator | [leadforge](https://github.com/leadforge-dev/leadforge) `1.0.0+` | | Recipe | `b2b_saas_procurement_v1` | -| Seed | 42 | -| Format | Parquet + CSV | +| Canonical seed | 42 | +| Cross-seed sweep | 42, 43, 44, 45, 46 (per tier) | +| Bundle schema version | 5 | +| Format | Parquet (canonical) + CSV (convenience) | | License | MIT | -Every bundle includes a `manifest.json` with the exact package version, recipe, seed, generation timestamp, and SHA-256 hashes for all data files. To verify integrity or regenerate, install leadforge and run the generation command above. +Every bundle includes a `manifest.json` with the exact package +version, recipe, seed, generation timestamp, snapshot day, label +window, table inventory with row counts, and SHA-256 hashes for +all data files. To verify integrity, install leadforge and run +`leadforge validate <bundle_dir>`.
## License From db4c4897e535c59e7b3b36a284edf8db2370668d Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 15:20:52 +0300 Subject: [PATCH 3/7] docs(plan): mark Phase 4 PR 4.1 complete in .agent-plan.md Co-Authored-By: Claude Opus 4.7 --- .agent-plan.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index 689505a..3c29ba0 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -40,9 +40,10 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family - [x] PR 3.3: `scripts/validate_release_candidate.py` (new) — release-candidate driver. Orchestrates `regenerate_tier_for_seeds(spec, seeds, workdir)` × N=5 (default) per tier, calls `measure_release_quality`, runs `run_split_probes` against each tier's canonical seed, renders the JSON / markdown / figure contract via `render_report`, and gates on YAML-declared bands. Flags: `--release-dir`, `--workdir`, `--out-dir`, `--bands`, `--seeds`, `--cohort-canonical-seed`, `--tiers`, `--quick` (N=2 with 500-lead populations; ~20s end-to-end), `--no-rebuild` (reuses workdir for fast band-tweak iteration). Exit codes: 0 pass / 1 gate failure / 2 pre-flight error. Driver vs `leadforge validate` boundary documented in the script docstring (one-bundle structural contract vs. cross-seed × cross-tier release-readiness panel — complementary, not merged). `leadforge/validation/difficulty.py` extended with `BandSpec` / `TierBands` / `LeakageProbeBands` / `AcceptanceBands` / `GateFailure` dataclasses and `load_bands` / `check_release_bands` (consumes `ReleaseQualityReport` + per-tier `LeakageReport`s, returns `list[GateFailure]`). G7.4.4 (cross-tier GBM−LR positivity) softened to follow per-tier `gbm_minus_lr_auc` bands rather than hard-fail on the boolean — the v1 dataset's snapshot is dominated by linear features and HistGBM does not consistently beat LR; documented as a known v1→v2 finding with the cross-tier check tracked as informational. 
`docs/release/v1_acceptance_gates_bands.yaml` (new) is the operational source of truth for numeric bands; `docs/release/v1_acceptance_gates.md` updated to remove every `TBD-*` placeholder and to record medians + rationale per gate. `release/_release_quality/` workdir gitignored; `release/validation/` (validation_report.{json,md} + 7 pinned figures: lift_curve_{intro,intermediate,advanced}, calibration_intermediate, leakage_delta, cohort_shift, value_capture) committed. New tests: `tests/validation/test_difficulty_bands.py` (29 tests over band parsing / per-tier checks / cross-seed spread / cohort shift / cross-tier ordering / leakage findings / GateFailure immutability) and `tests/scripts/test_validate_release_candidate.py` (19 tests over CLI helpers, mocked pipeline, end-to-end --quick run); 1152/1152 tests pass; ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67 files identical; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (purely additive driver+gating layer). First authentic full-release run baseline (seeds 42–46): intro AP 0.7608 / LR AUC 0.879 / GBM AUC 0.873; intermediate AP 0.5752 / LR AUC 0.886 / GBM AUC 0.876; advanced AP 0.3514 / LR AUC 0.886 / GBM AUC 0.873; cross-tier AP / P@100 / conversion-rate ordering all hold; GBM−LR delta is slightly negative in every tier (−0.0045 / −0.0072 / −0.0133 — the v1→v2 finding above). ### Phase 4 — Channel-signal audit + dataset card hardening -- [ ] `scripts/audit_channel_signal.py` → `docs/release/channel_signal_audit.md` -- [ ] `release/README.md` rewrite (release-grade dataset card; macro-framing paragraph; simulation-simplifications section) -- [ ] `docs/release/{generation_method,feature_dictionary}.md` +- [x] PR 4.1: `scripts/audit_channel_signal.py` (new) — analysis driver. 
For each tier (and each of `lead_source` / `first_touch_channel`), computes per-channel conversion rate + univariate AUC scored as the empirical positive rate per channel (a 1-D Bayes classifier, equivalent to a saturated LR on one-hot channel features). Writes `docs/release/channel_signal_audit.{md,json}`. CLI: `--release-dir`, `--tier`, `--task`, `--channel-column`, `--out-md`, `--out-json`, `--print`. Determinism guarded by `tests/scripts/test_audit_channel_signal.py` (10 tests: per-channel rollup, closed-form univariate AUC, single-class fallback, missing-column error, build/render round-trip, byte-identical re-run against the committed `release/` bundles, error paths). Audit verdict on the canonical PR 2.2 bundles: **weak channel signal** — across all three tiers and both channel columns the largest per-channel rate spread is 0.043 and the largest univariate AUC is 0.521, well below the G2 / Gemini v2 industry MQL→SQL band (SEO ~51%, PPC ~26%, Email <1%). v1 drives conversion through motif-family hazards keyed off latent traits, not channel-conditional probabilities; channel-conditional encoding is tracked in `docs/release/post_v1_roadmap.md`. +- [x] PR 4.1: `docs/release/generation_method.md` (new) — standalone DGP summary written for external readers (Kaggle/HF). Reads alone, references `docs/leadforge_architecture_spec.md`. Covers the five generation layers (motif families → mechanism layer → population → 90-day daily simulation → snapshot rendering), bundle output contract, public-vs-instructor split, calibration / validation, and an explicit "what this is not" boundary. Satisfies G10.2. +- [x] PR 4.1: `docs/release/feature_dictionary.md` (new) — narrative companion to the per-bundle `feature_dictionary.csv`. 
Groups every public-mode column by analytical role (lead identity / firmographics / personographics / engagement / funnel / value / leakage trap / target), documents difficulty modulation parameters, modelling defaults, and the deliberate `total_touches_all` trap. Satisfies G10.3. +- [x] PR 4.1: `release/README.md` (substantial rewrite) — release-grade dataset card per Datasheets-for-Datasets / Data Cards Playbook checklist (G10.1). New sections: macro framing paragraph (2024–2026 SaaS context, recommendation #19), simulation simplifications (modelled / approximate / not modelled, per chatgpt v2 §2.6), calibration documentation linking to `release/validation/validation_report.md`, public-vs-instructor redaction policy with concrete column lists citing `BANNED_LEAD_COLUMNS` / `BANNED_OPP_COLUMNS` / `BANNED_TABLES` / `SNAPSHOT_FILTERED_TABLES` from `leadforge/validation/leakage_probes.py`, intended-use vs out-of-scope-use, known limitations (G7.4.4 GBM−LR sign finding, weak channel signal from the Phase 4 audit, flat AUC across tiers, small cohort-shift gap), composition section per Datasheets format, adversarial-framing pointer (placeholder link to `docs/release/break_me_guide.md` that lands in PR 6.3), and a maintenance plan. Every realism / calibration / difficulty claim in the card is anchored to `validation_report.md` per G10.6. `BUNDLE_SCHEMA_VERSION` unchanged at 5 (documentation-only PR); 1167/1167 tests pass; ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0. 
### Phase 5 — Platform packaging - [ ] `scripts/package_kaggle_release.py` → `release/kaggle/dataset-metadata.json` From f6b274e29e9a3947e03aff38d615112ee9dab021 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 15:27:26 +0300 Subject: [PATCH 4/7] docs(release): cover lead_source / first_touch_channel in feature dictionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review caught a gap: the prior commit grouped 30 of 32 public columns; lead_source and first_touch_channel were referenced in the "recommended modelling defaults" checklist but did not appear in any category table. Adds a "Lead source & channel" subsection that describes both columns, calls out that they're identical in v1, and cross-references the channel-signal audit so readers don't expect top-tier feature importance from these columns. Updates the summary table to reflect 32 documented columns. Also corrects two minor wording issues (firmographics "Six" → "Five", personographics "all four" → "all three") and a typo ("bandage" → "discretisation"). Co-Authored-By: Claude Opus 4.7 --- docs/release/feature_dictionary.md | 38 +++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/docs/release/feature_dictionary.md b/docs/release/feature_dictionary.md index 823b676..f4f60c8 100644 --- a/docs/release/feature_dictionary.md +++ b/docs/release/feature_dictionary.md @@ -15,8 +15,9 @@ in `metadata/`; it does not change the feature list.
| Category | Columns | Modelling default | |---|---|---| | Lead identity & timing | 4 | drop `lead_id`; keep `lead_created_at` for cohort splits, drop for production | -| Firmographics | 6 | keep all | -| Personographics | 4 | keep all (categorical encoders welcome) | +| Lead source & channel | 2 | keep both | +| Firmographics | 5 | keep all | +| Personographics | 3 | keep all (categorical encoders welcome) | | Engagement (snapshot-window) | 10 | keep all | | Funnel & sales-process | 4 | keep all | | Value | 2 | keep all | @@ -32,12 +33,33 @@ in `metadata/`; it does not change the feature list. | `contact_id` | string | identity | Foreign key into `tables/contacts.parquet`. Same warning. | | `lead_created_at` | string (ISO-8601) | simulation clock | Lead birthday; useful for cohort/time-shift evaluation (see `docs/release/v1_acceptance_gates.md` G6.4). Drop or bin it for production models — feeding raw timestamps to a linear model is rarely what you want. | +## Lead source and channel + +Two columns describe how each lead entered the funnel. They are +populated from the recipe's GTM-motion mix +(`inbound_marketing` 45%, `sdr_outbound` 35%, `partner_referral` +20%) and are identical between the two columns in v1 — both encode +the same origination channel under different field names. + +| Column | Dtype | Why it might matter | +|---|---|---| +| `lead_source` | string | Origination channel; one of `inbound_marketing` / `sdr_outbound` / `partner_referral`. | +| `first_touch_channel` | string | Marketing channel of the first recorded touch. Always equals `lead_source` in v1; the field exists to support post-v1 work where origination and first-touch can diverge. | + +**Caveat.** Per [`docs/release/channel_signal_audit.md`](channel_signal_audit.md), +v1's channel signal is weak: per-channel rate spread ≤ 0.043 and +univariate AUC ≤ 0.521 across all tiers, well below the G2 / +Gemini v2 industry MQL→SQL band (SEO ~51%, PPC ~26%, Email <1%). 
+Expect modest feature importance from these columns; do not expect +channel to be a top-tier predictor in v1. + ## Firmographics (account-level) These describe the buying organisation. They come from the recipe's narrative spec (industry, region, employee bands, revenue bands) -and from latent traits sampled per account. Six columns; all are -fair to use. +and from latent traits sampled per account. Five columns plus the +`account_id` foreign key listed under "Lead identity and timing" +above; all five are fair to use as features. | Column | Dtype | Why it might matter | |---|---|---| @@ -45,13 +67,13 @@ fair to use. | `region` | string | `US` / `UK`. Currently a low-signal axis — the simulator does not model channel-by-region interactions. | | `employee_band` | string | Bands are aligned with the ICP range (200–2,000 employees, plus tails). Larger accounts trend toward higher expected ACV. | | `estimated_revenue_band` | string | Bands span `$1M-$10M` to `$200M+`; correlated with `employee_band` by design. | -| `process_maturity_band` | string | A bandage of the latent `process_maturity` trait — *visible* signal of `motif_family.fit_dominant`'s "fit beats engagement" story. | +| `process_maturity_band` | string | A discretisation of the latent `process_maturity` trait — *visible* signal of `motif_family.fit_dominant`'s "fit beats engagement" story. | ## Personographics (contact-level) -These describe the primary contact attached to the lead. -Three categorical features plus the buyer-role label; all four -are fair to use. +These describe the primary contact attached to the lead. Three +categorical features (the `contact_id` foreign key is listed +under "Lead identity and timing"); all three are fair to use. 
| Column | Dtype | Why it might matter | |---|---|---| From ccab336ab6bdd5fb55452cec637a060d4f51b3f3 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 16:19:22 +0300 Subject: [PATCH 5/7] =?UTF-8?q?fix(scripts):=20channel=20audit=20=E2=80=94?= =?UTF-8?q?=20out-of-sample=20AUC,=20no=20verdict=20bands,=20group=20ident?= =?UTF-8?q?ical=20columns?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review of the previous PR-4.1 commit surfaced four problems with audit_channel_signal.py: * The univariate AUC was computed in-sample (train rates → train labels), guaranteed >= 0.5 by construction and not directly comparable to the source_only baselines in release/validation/validation_report.json. * The "weak / moderate / strong" verdict made a hard comparison between v1's 90-day closed-won label and the G2 / Gemini v2 industry MQL→SQL benchmark band. The two metrics measure different funnel transitions; the comparison was a category error. * The verdict prose hard-coded a "50 percentage points" claim and a specific architectural narrative ("v1 drives conversion through motif-family hazards") inside the script — both would silently drift from the data and the codebase over time. * lead_source and first_touch_channel produce byte-identical audits in v1 yet were rendered as two parallel tables per tier. Fixes: * audit_channel now takes both train and test DataFrames and returns univariate_auc_in_sample (the historical 1-D Bayes interpretation, retained for transparency) plus univariate_auc_out_of_sample (train rates scored against held-out test labels). The OOS numbers reproduce the source_only HistGBM baselines in validation_report.json for seed 42 cell-for-cell (intro 0.5014, intermediate 0.5139, advanced 0.5226). * Verdict bands and the _classify_signal / _verdict_paragraph helpers are gone. 
The markdown report now ends with a Discussion section written by hand around the actual numbers, with an explicit caveat that the industry benchmarks measure MQL→SQL (not 90-day closed-won) and are reproduced for context only. * INDUSTRY_MQL_TO_SQL_BENCHMARKS is now a tuple of pairs (genuinely immutable; matches dataclass(frozen=True) semantics). report_to_dict converts it back to a {name: rate} dict for the JSON output. * render_markdown groups channel columns whose audits are byte-identical into one section with a header listing all columns ("Columns: lead_source, first_touch_channel (audit values identical)"). The JSON keeps per-column entries. New tests in tests/scripts/test_audit_channel_signal.py: * OOS AUC == in-sample AUC when test=train (sanity check) * OOS AUC stays well-defined when the test split contains channels unseen on train (train-base-rate fallback) * render_markdown collapses two identical columns into one section AND keeps two distinct columns in two sections * test_lead_source_equals_first_touch_channel_in_v1 (parametrized over intro/intermediate/advanced) — locks the feature-dictionary claim that the two channel columns are identical in v1. If the simulator ever diverges them, the doc must be updated. * test_committed_audit_artifacts_match_fresh_regeneration — re-runs the audit against the committed bundles and asserts byte-equality with the committed docs/release/channel_signal_audit.{md,json}. CI gate against bundles regenerated without re-running the audit. 
Co-Authored-By: Claude Opus 4.7 --- docs/release/channel_signal_audit.json | 78 ++-- docs/release/channel_signal_audit.md | 67 ++-- scripts/audit_channel_signal.py | 409 +++++++++++++-------- tests/scripts/test_audit_channel_signal.py | 238 +++++++++--- 4 files changed, 501 insertions(+), 291 deletions(-) diff --git a/docs/release/channel_signal_audit.json b/docs/release/channel_signal_audit.json index 31885a8..0b95903 100644 --- a/docs/release/channel_signal_audit.json +++ b/docs/release/channel_signal_audit.json @@ -39,10 +39,13 @@ } ], "column": "lead_source", - "n_total": 3500, - "overall_conversion_rate": 0.4145714285714286, + "n_test": 750, + "n_train": 3500, "rate_spread": 0.04327742594857009, - "univariate_auc": 0.5199794894149169 + "test_conversion_rate": 0.4266666666666667, + "train_conversion_rate": 0.4145714285714286, + "univariate_auc_in_sample": 0.5199794894149169, + "univariate_auc_out_of_sample": 0.5013517441860464 }, { "channels": [ @@ -69,15 +72,20 @@ } ], "column": "first_touch_channel", - "n_total": 3500, - "overall_conversion_rate": 0.4145714285714286, + "n_test": 750, + "n_train": 3500, "rate_spread": 0.04327742594857009, - "univariate_auc": 0.5199794894149169 + "test_conversion_rate": 0.4266666666666667, + "train_conversion_rate": 0.4145714285714286, + "univariate_auc_in_sample": 0.5199794894149169, + "univariate_auc_out_of_sample": 0.5013517441860464 } ], - "conversion_rate_overall": 0.4145714285714286, - "n_leads": 3500, - "tier": "intro" + "n_test": 750, + "n_train": 3500, + "test_conversion_rate": 0.4266666666666667, + "tier": "intro", + "train_conversion_rate": 0.4145714285714286 }, { "columns": [ @@ -106,10 +114,13 @@ } ], "column": "lead_source", - "n_total": 3500, - "overall_conversion_rate": 0.20142857142857143, + "n_test": 750, + "n_train": 3500, "rate_spread": 0.03652108846020477, - "univariate_auc": 0.5212431012826857 + "test_conversion_rate": 0.22266666666666668, + "train_conversion_rate": 0.20142857142857143, + 
"univariate_auc_in_sample": 0.5212431012826857, + "univariate_auc_out_of_sample": 0.5139326835180411 }, { "channels": [ @@ -136,15 +147,20 @@ } ], "column": "first_touch_channel", - "n_total": 3500, - "overall_conversion_rate": 0.20142857142857143, + "n_test": 750, + "n_train": 3500, "rate_spread": 0.03652108846020477, - "univariate_auc": 0.5212431012826857 + "test_conversion_rate": 0.22266666666666668, + "train_conversion_rate": 0.20142857142857143, + "univariate_auc_in_sample": 0.5212431012826857, + "univariate_auc_out_of_sample": 0.5139326835180411 } ], - "conversion_rate_overall": 0.20142857142857143, - "n_leads": 3500, - "tier": "intermediate" + "n_test": 750, + "n_train": 3500, + "test_conversion_rate": 0.22266666666666668, + "tier": "intermediate", + "train_conversion_rate": 0.20142857142857143 }, { "columns": [ @@ -173,10 +189,13 @@ } ], "column": "lead_source", - "n_total": 3500, - "overall_conversion_rate": 0.07914285714285714, + "n_test": 750, + "n_train": 3500, "rate_spread": 0.005597430328691616, - "univariate_auc": 0.5083011208921436 + "test_conversion_rate": 0.07866666666666666, + "train_conversion_rate": 0.07914285714285714, + "univariate_auc_in_sample": 0.5083011208921436, + "univariate_auc_out_of_sample": 0.5225784296892246 }, { "channels": [ @@ -203,15 +222,20 @@ } ], "column": "first_touch_channel", - "n_total": 3500, - "overall_conversion_rate": 0.07914285714285714, + "n_test": 750, + "n_train": 3500, "rate_spread": 0.005597430328691616, - "univariate_auc": 0.5083011208921436 + "test_conversion_rate": 0.07866666666666666, + "train_conversion_rate": 0.07914285714285714, + "univariate_auc_in_sample": 0.5083011208921436, + "univariate_auc_out_of_sample": 0.5225784296892246 } ], - "conversion_rate_overall": 0.07914285714285714, - "n_leads": 3500, - "tier": "advanced" + "n_test": 750, + "n_train": 3500, + "test_conversion_rate": 0.07866666666666666, + "tier": "advanced", + "train_conversion_rate": 0.07914285714285714 } ] } diff --git 
a/docs/release/channel_signal_audit.md b/docs/release/channel_signal_audit.md index a1f8fc8..4786ff8 100644 --- a/docs/release/channel_signal_audit.md +++ b/docs/release/channel_signal_audit.md @@ -1,12 +1,12 @@ # Channel-signal audit — leadforge-lead-scoring-v1 -Audit produced by `scripts/audit_channel_signal.py`; see also `docs/release/channel_signal_audit.json` for the machine-readable form. +Audit produced by `scripts/audit_channel_signal.py`; see `docs/release/channel_signal_audit.json` for the machine-readable form. -**Scope.** For every tier we compute per-channel conversion rates and the univariate AUC of channel against `converted_within_90_days`, scored as the empirical positive rate per channel (a 1-D Bayes classifier, equivalent to a saturated logistic regression on one-hot channel features). Compared against the G2 / Gemini v2 industry MQL→SQL benchmark band (SEO ~51%, PPC ~26%, Email <1%, surfaced in `docs/external_review/summaries/recommendations_pass.md` recommendation #8). +**Scope.** For every tier we compute per-channel conversion rates on the train split and the univariate AUC of channel against `converted_within_90_days`, scored as the empirical positive rate per channel (a 1-D Bayes classifier). Two AUCs are reported: an **in-sample** number (train rates → train labels — biased upward by construction) and an **out-of-sample** number (train rates → test labels — directly comparable to the `source_only` baselines in `release/validation/validation_report.json`). -**Caveat.** Industry benchmarks are MQL→SQL rates, not 90-day closed-won rates. They are the closest public anchor for *how much* channel ought to matter; use them as a band of reference, not a hard target. +**Caveat on the industry benchmark.** The G2 / Gemini v2 numbers below are single-step **MQL→SQL** rates (recommendation #8 in `docs/external_review/summaries/recommendations_pass.md`). v1's label is **90-day closed-won**, the entire funnel resolved. 
The two metrics are not directly comparable; the table is reproduced for context only. -## Industry benchmark band +## Industry benchmark (context, not target) | Channel | MQL→SQL conversion rate | |---|---| @@ -16,23 +16,13 @@ Audit produced by `scripts/audit_channel_signal.py`; see also `docs/release/chan ## Tier: `intro` -`n_leads = 3500`, overall 90-day conversion rate 41.46%. +`n_train = 3500` (90-day conversion rate 41.46%); `n_test = 750` (rate 42.67%). -### Column: `lead_source` +### Columns: `lead_source`, `first_touch_channel` (audit values identical) -Univariate AUC: **0.5200** · Per-channel rate spread (max − min): **0.0433** · Verdict: **weak signal** +Per-channel rate spread (max − min): **0.0433** · In-sample univariate AUC: **0.5200** · Out-of-sample univariate AUC: **0.5014** -| Channel | n | Share | Converted | Conversion rate | -|---|---:|---:|---:|---:| -| `inbound_marketing` | 1570 | 44.86% | 682 | 43.44% | -| `partner_referral` | 698 | 19.94% | 273 | 39.11% | -| `sdr_outbound` | 1232 | 35.20% | 496 | 40.26% | - -### Column: `first_touch_channel` - -Univariate AUC: **0.5200** · Per-channel rate spread (max − min): **0.0433** · Verdict: **weak signal** - -| Channel | n | Share | Converted | Conversion rate | +| Channel | n (train) | Share (train) | Converted (train) | Train rate | |---|---:|---:|---:|---:| | `inbound_marketing` | 1570 | 44.86% | 682 | 43.44% | | `partner_referral` | 698 | 19.94% | 273 | 39.11% | @@ -40,23 +30,13 @@ Univariate AUC: **0.5200** · Per-channel rate spread (max − min): **0.0433* ## Tier: `intermediate` -`n_leads = 3500`, overall 90-day conversion rate 20.14%. +`n_train = 3500` (90-day conversion rate 20.14%); `n_test = 750` (rate 22.27%). 
-### Column: `lead_source` +### Columns: `lead_source`, `first_touch_channel` (audit values identical) -Univariate AUC: **0.5212** · Per-channel rate spread (max − min): **0.0365** · Verdict: **weak signal** +Per-channel rate spread (max − min): **0.0365** · In-sample univariate AUC: **0.5212** · Out-of-sample univariate AUC: **0.5139** -| Channel | n | Share | Converted | Conversion rate | -|---|---:|---:|---:|---:| -| `inbound_marketing` | 1570 | 44.86% | 334 | 21.27% | -| `partner_referral` | 698 | 19.94% | 123 | 17.62% | -| `sdr_outbound` | 1232 | 35.20% | 248 | 20.13% | - -### Column: `first_touch_channel` - -Univariate AUC: **0.5212** · Per-channel rate spread (max − min): **0.0365** · Verdict: **weak signal** - -| Channel | n | Share | Converted | Conversion rate | +| Channel | n (train) | Share (train) | Converted (train) | Train rate | |---|---:|---:|---:|---:| | `inbound_marketing` | 1570 | 44.86% | 334 | 21.27% | | `partner_referral` | 698 | 19.94% | 123 | 17.62% | @@ -64,28 +44,23 @@ Univariate AUC: **0.5212** · Per-channel rate spread (max − min): **0.0365* ## Tier: `advanced` -`n_leads = 3500`, overall 90-day conversion rate 7.91%. +`n_train = 3500` (90-day conversion rate 7.91%); `n_test = 750` (rate 7.87%). 
-### Column: `lead_source` +### Columns: `lead_source`, `first_touch_channel` (audit values identical) -Univariate AUC: **0.5083** · Per-channel rate spread (max − min): **0.0056** · Verdict: **weak signal** +Per-channel rate spread (max − min): **0.0056** · In-sample univariate AUC: **0.5083** · Out-of-sample univariate AUC: **0.5226** -| Channel | n | Share | Converted | Conversion rate | +| Channel | n (train) | Share (train) | Converted (train) | Train rate | |---|---:|---:|---:|---:| | `inbound_marketing` | 1570 | 44.86% | 128 | 8.15% | | `partner_referral` | 698 | 19.94% | 53 | 7.59% | | `sdr_outbound` | 1232 | 35.20% | 96 | 7.79% | -### Column: `first_touch_channel` - -Univariate AUC: **0.5083** · Per-channel rate spread (max − min): **0.0056** · Verdict: **weak signal** +## Discussion -| Channel | n | Share | Converted | Conversion rate | -|---|---:|---:|---:|---:| -| `inbound_marketing` | 1570 | 44.86% | 128 | 8.15% | -| `partner_referral` | 698 | 19.94% | 53 | 7.59% | -| `sdr_outbound` | 1232 | 35.20% | 96 | 7.79% | +The numbers above answer one question: *how strongly does channel alone signal 90-day conversion in v1?* They do not answer *whether v1 matches industry channel performance*, since the benchmarks measure a different funnel transition (single MQL→SQL step) and v1 measures the entire funnel resolved over 90 days. Treat the v1 numbers as an internal description of the simulator's channel signal. -## Verdict +Two empirical observations a reader can make from the numbers above: -v1's channel signal is **weak**: across all tiers and both channel columns the largest per-channel conversion-rate spread is 0.043 and the largest univariate AUC is 0.521. That is well below the G2 / Gemini v2 industry MQL→SQL benchmark band, where SEO leads convert 50 percentage points more than Email leads. 
v1 drives conversion through motif-family hazards keyed off latent traits, not channel-conditional probabilities, so this is the expected outcome; channel-conditional encoding is tracked as post-v1 work in `docs/release/post_v1_roadmap.md`. +1. **The out-of-sample univariate AUC reproduces the `source_only` baseline** in `release/validation/validation_report.json` (HistGBM trained on `lead_source` + `first_touch_channel` against the same test split). For seed 42 the OOS numbers below match the report cell-for-cell. The in-sample number is biased upward by construction — small at v1's N but visible — so the OOS number is the one to compare against any external baseline. +2. **Out-of-sample univariate AUC is close to chance** in every tier and the per-channel conversion-rate spread is small (≤0.05). Channel alone is a weak feature in v1 — consistent with the design: the simulator drives conversion through motif-family hazards keyed off latent traits, not channel-conditional probabilities. Channel-conditional encoding is tracked as post-v1 work in `docs/release/post_v1_roadmap.md`. diff --git a/scripts/audit_channel_signal.py b/scripts/audit_channel_signal.py index e1d390a..feef6a1 100644 --- a/scripts/audit_channel_signal.py +++ b/scripts/audit_channel_signal.py @@ -3,15 +3,23 @@ Companion analysis for PR 4.1 (recommendation #8 v1 scope from ``docs/external_review/summaries/recommendations_pass.md``). For every -tier in a release bundle family we compute: - -* conversion rate by channel (``lead_source`` and ``first_touch_channel``) -* the univariate AUC of channel against ``converted_within_90_days``, - scored as the empirical positive rate per channel (a 1-D Bayes - classifier; equivalent to a saturated logistic regression on one-hot - channel features) - -and compare those to the G2 / Gemini v2 industry MQL→SQL benchmarks. 
+tier in a release bundle family we compute, separately for ``lead_source`` +and ``first_touch_channel``: + +* per-channel conversion rate, share, and counts on the **train** split +* the **in-sample** univariate AUC: per-channel rates derived on train + and scored against train labels (a 1-D Bayes classifier; biased upward + by construction, so never below 0.5) +* the **out-of-sample** univariate AUC: per-channel rates derived on + train and scored against **test** labels — directly comparable to the + ``source_only`` baselines in ``release/validation/validation_report.json`` + +The script does not assign a categorical "weak / moderate / strong" +verdict. Industry MQL→SQL benchmarks are surfaced for context only; +they measure a different funnel transition (single MQL→SQL step, not +the 90-day closed-won label v1 reports), so a hard comparison would be +a category error. The audit doc states the v1 numbers and an explicit +caveat; readers draw the comparison. Outputs (defaults are pinned via the v1 acceptance gates): @@ -19,8 +27,8 @@ * ``docs/release/channel_signal_audit.json`` — machine-readable sibling The script is deterministic given a fixed bundle: it reads -``train.parquet`` only, derives empirical rates, and uses -``sklearn.metrics.roc_auc_score`` with no fit-time randomness. +``train.parquet`` and ``test.parquet`` only, derives empirical rates, +and uses ``sklearn.metrics.roc_auc_score`` with no fit-time randomness. """ from __future__ import annotations @@ -28,7 +36,7 @@ import argparse import json import sys -from collections.abc import Mapping, Sequence +from collections.abc import Sequence from dataclasses import asdict, dataclass from pathlib import Path from typing import Any, Final @@ -46,28 +54,20 @@ DEFAULT_TASK: Final[str] = "converted_within_90_days" #: G2 industry MQL→SQL conversion rates surfaced in -#: ``gemini_v2_summary.md`` (recommendation #8). 
These are not directly -#: comparable to v1's 90-day closed-won label, but they are the closest -#: public anchor for "how much should channel matter" and the audit -#: reports the comparison band rather than asserting a hard match. -INDUSTRY_MQL_TO_SQL_BENCHMARKS: Final[Mapping[str, float]] = { - "SEO": 0.51, - "PPC": 0.26, - "Email": 0.005, -} +#: ``docs/external_review/summaries/gemini_v2_summary.md`` (recommendation #8). +#: They measure a single MQL→SQL transition, NOT v1's 90-day closed-won +#: label. Stored as a tuple of pairs so the dataclass field is genuinely +#: immutable; converted to a plain dict at JSON-render time. +INDUSTRY_MQL_TO_SQL_BENCHMARKS: Final[tuple[tuple[str, float], ...]] = ( + ("Email", 0.005), + ("PPC", 0.26), + ("SEO", 0.51), +) DEFAULT_RELEASE_DIR: Final[Path] = Path("release") DEFAULT_OUT_MD: Final[Path] = Path("docs/release/channel_signal_audit.md") DEFAULT_OUT_JSON: Final[Path] = Path("docs/release/channel_signal_audit.json") -#: Bands used to label the verdict for each channel column. Tuned to -#: surface "weak / moderate / strong" against G2-style benchmarks where -#: SEO vs Email differs by ~50 percentage points. Bands operate on the -#: per-channel max-min conversion-rate spread. -SIGNAL_BAND_WEAK_MAX: Final[float] = 0.05 -SIGNAL_BAND_MODERATE_MAX: Final[float] = 0.15 -AUC_NEAR_CHANCE_MAX: Final[float] = 0.55 - # --------------------------------------------------------------------------- # Result dataclasses @@ -76,7 +76,7 @@ @dataclass(frozen=True) class ChannelStats: - """Per-channel rollup for one channel column in one tier.""" + """Per-channel rollup for one channel column on the train split.""" name: str n: int @@ -87,14 +87,37 @@ class ChannelStats: @dataclass(frozen=True) class ChannelAudit: - """Audit results for one channel column in one tier.""" + """Audit results for one channel column in one tier. + + Per-channel statistics come from the train split. 
+ ``univariate_auc_in_sample`` re-uses train labels (bias-prone but + matches the historical 1-D Bayes-classifier interpretation); + ``univariate_auc_out_of_sample`` scores the train-derived rates + against the held-out test split. + """ column: str - n_total: int - overall_conversion_rate: float + n_train: int + n_test: int + train_conversion_rate: float + test_conversion_rate: float channels: tuple[ChannelStats, ...] rate_spread: float - univariate_auc: float + univariate_auc_in_sample: float + univariate_auc_out_of_sample: float + + +@dataclass(frozen=True) +class ChannelGroup: + """One or more channel columns with byte-identical audit values. + + v1's ``lead_source`` and ``first_touch_channel`` produce identical + numbers in every tier — this dataclass lets the markdown renderer + collapse them into one section without losing information. + """ + + columns: tuple[str, ...] + audit: ChannelAudit @dataclass(frozen=True) @@ -102,8 +125,10 @@ class TierAudit: """Audit results for one tier across every channel column.""" tier: str - n_leads: int - conversion_rate_overall: float + n_train: int + n_test: int + train_conversion_rate: float + test_conversion_rate: float columns: tuple[ChannelAudit, ...] @@ -116,7 +141,7 @@ class AuditReport: label_column: str channel_columns: tuple[str, ...] tiers: tuple[TierAudit, ...] - industry_mql_to_sql_benchmarks: Mapping[str, float] + industry_mql_to_sql_benchmarks: tuple[tuple[str, float], ...] 
# --------------------------------------------------------------------------- @@ -132,33 +157,50 @@ def _label_to_int(series: pd.Series) -> pd.Series: return pd.to_numeric(series, errors="raise").astype(int) +def _conversion_rate(df: pd.DataFrame, label_col: str) -> float: + if len(df) == 0: + return 0.0 + return float(int(_label_to_int(df[label_col]).sum()) / len(df)) + + +def _auc_or_chance(y: pd.Series, scores: pd.Series) -> float: + """ROC AUC, falling back to ``0.5`` when undefined (single class).""" + + if y.nunique() < 2: + return 0.5 + return float(roc_auc_score(y.to_numpy(), scores.to_numpy())) + + def audit_channel( - df: pd.DataFrame, + train: pd.DataFrame, channel_col: str, + *, + test: pd.DataFrame, label_col: str = LABEL_COLUMN, ) -> ChannelAudit: - """Per-channel stats + univariate AUC for a single channel column. - - ``univariate_auc`` is the AUC obtained by replacing each row's - channel value with that channel's empirical positive rate. This is - a 1-D Bayes classifier, equivalent (up to ties) to a saturated - logistic regression on one-hot channel features and stable across - sklearn versions. Returns ``0.5`` when the label has only one - class, since AUC is undefined. + """Per-channel stats and univariate AUCs (in-sample + OOS). + + Both AUCs use the same scoring function: the per-channel positive + rate derived from the train split. The "in-sample" AUC scores + that against train labels (biased upward by construction); the + "out-of-sample" AUC scores it against held-out test labels and + is directly comparable to the ``source_only`` baselines in + ``release/validation/validation_report.json``. 
""" - if channel_col not in df.columns: - raise KeyError(f"channel column {channel_col!r} not present") - if label_col not in df.columns: - raise KeyError(f"label column {label_col!r} not present") + for df_name, df in (("train", train), ("test", test)): + if channel_col not in df.columns: + raise KeyError(f"channel column {channel_col!r} not present in {df_name}") + if label_col not in df.columns: + raise KeyError(f"label column {label_col!r} not present in {df_name}") - y = _label_to_int(df[label_col]) - n_total = len(df) - n_converted_total = int(y.sum()) - overall_rate = float(n_converted_total / n_total) if n_total else 0.0 + y_train = _label_to_int(train[label_col]) + n_train = len(train) + n_test = len(test) + train_rate = float(int(y_train.sum()) / n_train) if n_train else 0.0 + test_rate = _conversion_rate(test, label_col) - # Per-channel rollup, sorted by name for determinism. - grouped = df.assign(_y=y).groupby(channel_col, dropna=False) + grouped = train.assign(_y=y_train).groupby(channel_col, dropna=False) rows: list[ChannelStats] = [] for name, sub in sorted(grouped, key=lambda kv: str(kv[0])): n = len(sub) @@ -167,7 +209,7 @@ def audit_channel( ChannelStats( name=str(name), n=n, - share=float(n / n_total) if n_total else 0.0, + share=float(n / n_train) if n_train else 0.0, n_converted=n_conv, conversion_rate=float(n_conv / n) if n else 0.0, ) @@ -177,51 +219,67 @@ def audit_channel( max(c.conversion_rate for c in rows) - min(c.conversion_rate for c in rows) if rows else 0.0 ) - if y.nunique() < 2 or len(rows) < 2: - univariate_auc = 0.5 + if len(rows) < 2: + in_sample_auc = 0.5 + oos_auc = 0.5 else: rate_lookup = {c.name: c.conversion_rate for c in rows} - scores = df[channel_col].astype(str).map(rate_lookup).astype(float) - univariate_auc = float(roc_auc_score(y.to_numpy(), scores.to_numpy())) + train_scores = train[channel_col].astype(str).map(rate_lookup).astype(float) + in_sample_auc = _auc_or_chance(y_train, train_scores) + + # Test-set channels 
are scored using the train-derived rates; + # any channel value unseen on train falls back to the train + # base rate so the AUC stays well-defined. + test_scores = ( + test[channel_col].astype(str).map(rate_lookup).fillna(train_rate).astype(float) + ) + y_test = _label_to_int(test[label_col]) + oos_auc = _auc_or_chance(y_test, test_scores) return ChannelAudit( column=channel_col, - n_total=n_total, - overall_conversion_rate=overall_rate, + n_train=n_train, + n_test=n_test, + train_conversion_rate=train_rate, + test_conversion_rate=test_rate, channels=tuple(rows), rate_spread=float(rate_spread), - univariate_auc=univariate_auc, + univariate_auc_in_sample=in_sample_auc, + univariate_auc_out_of_sample=oos_auc, ) def audit_tier( - df: pd.DataFrame, + train: pd.DataFrame, tier: str, *, + test: pd.DataFrame, channel_columns: Sequence[str] = CHANNEL_COLUMNS, label_col: str = LABEL_COLUMN, ) -> TierAudit: """Run :func:`audit_channel` for every channel column on one tier.""" - y = _label_to_int(df[label_col]) - n = len(df) - overall_rate = float(int(y.sum()) / n) if n else 0.0 - - columns = tuple(audit_channel(df, col, label_col=label_col) for col in channel_columns) + train_rate = _conversion_rate(train, label_col) + test_rate = _conversion_rate(test, label_col) + columns = tuple( + audit_channel(train, col, test=test, label_col=label_col) for col in channel_columns + ) return TierAudit( tier=tier, - n_leads=n, - conversion_rate_overall=overall_rate, + n_train=len(train), + n_test=len(test), + train_conversion_rate=train_rate, + test_conversion_rate=test_rate, columns=columns, ) -def load_train_df(release_dir: Path, tier: str, task: str = DEFAULT_TASK) -> pd.DataFrame: - """Load ``release_dir//tasks//train.parquet``.""" +def load_split(release_dir: Path, tier: str, split: str, task: str = DEFAULT_TASK) -> pd.DataFrame: + """Load ``release_dir//tasks//.parquet``.""" - path = release_dir / tier / "tasks" / task / "train.parquet" + path = release_dir / tier / "tasks" / task 
/ f"{split}.parquet" if not path.exists(): - raise FileNotFoundError(f"missing train split for tier {tier!r}: {path}") + raise FileNotFoundError(f"missing {split} split for tier {tier!r}: {path}") return pd.read_parquet(path) @@ -237,11 +295,13 @@ def build_report( tier_audits: list[TierAudit] = [] for tier in tiers: - df = load_train_df(release_dir, tier, task=task) + train = load_split(release_dir, tier, "train", task=task) + test = load_split(release_dir, tier, "test", task=task) tier_audits.append( audit_tier( - df, + train, tier=tier, + test=test, channel_columns=channel_columns, label_col=label_col, ) @@ -253,67 +313,7 @@ def build_report( label_column=label_col, channel_columns=tuple(channel_columns), tiers=tuple(tier_audits), - industry_mql_to_sql_benchmarks=dict(INDUSTRY_MQL_TO_SQL_BENCHMARKS), - ) - - -# --------------------------------------------------------------------------- -# Verdict -# --------------------------------------------------------------------------- - - -def _classify_signal(audit: ChannelAudit) -> str: - """Map (rate spread, univariate AUC) to one of weak/moderate/strong.""" - - if audit.univariate_auc < AUC_NEAR_CHANCE_MAX and audit.rate_spread < SIGNAL_BAND_WEAK_MAX: - return "weak" - if audit.rate_spread < SIGNAL_BAND_MODERATE_MAX: - return "moderate" - return "strong" - - -def _verdict_paragraph(report: AuditReport) -> str: - """One-paragraph human-readable verdict.""" - - rows = [ - (tier.tier, col.column, col.rate_spread, col.univariate_auc, _classify_signal(col)) - for tier in report.tiers - for col in tier.columns - ] - strengths = {row[4] for row in rows} - max_spread = max((row[2] for row in rows), default=0.0) - max_auc = max((row[3] for row in rows), default=0.5) - - seo_minus_email = ( - INDUSTRY_MQL_TO_SQL_BENCHMARKS["SEO"] - INDUSTRY_MQL_TO_SQL_BENCHMARKS["Email"] - ) - - if strengths <= {"weak"}: - verdict = "weak" - intent = ( - "well below the G2 / Gemini v2 industry MQL→SQL benchmark band, where SEO leads " - 
f"convert {seo_minus_email * 100:.0f} percentage points more than Email leads." - ) - elif "strong" in strengths: - verdict = "strong" - intent = ( - "comparable to or stronger than the G2 / Gemini v2 industry benchmark band — " - "channel-conditional encoding may already be implicit in v1." - ) - else: - verdict = "moderate" - intent = ( - "below the G2 / Gemini v2 industry benchmark band — channel signal is present but " - "weaker than published MQL→SQL spreads." - ) - - return ( - f"v1's channel signal is **{verdict}**: across all tiers and both channel columns the " - f"largest per-channel conversion-rate spread is {max_spread:.3f} and the largest " - f"univariate AUC is {max_auc:.3f}. That is {intent} v1 drives conversion through " - "motif-family hazards keyed off latent traits, not channel-conditional probabilities, " - "so this is the expected outcome; channel-conditional encoding is tracked as post-v1 " - "work in `docs/release/post_v1_roadmap.md`." + industry_mql_to_sql_benchmarks=INDUSTRY_MQL_TO_SQL_BENCHMARKS, ) @@ -323,7 +323,13 @@ def _verdict_paragraph(report: AuditReport) -> str: def report_to_dict(report: AuditReport) -> dict[str, Any]: - """Convert the report to a JSON-primitive dict (deterministic).""" + """Convert the report to a JSON-primitive dict. + + The dataclass stores ``industry_mql_to_sql_benchmarks`` as a tuple + of pairs (immutability); this helper converts it back into a + ``{name: rate}`` mapping for the JSON output, where a dict shape + is more ergonomic for downstream tooling. 
+ """ payload = asdict(report) payload["industry_mql_to_sql_benchmarks"] = dict(report.industry_mql_to_sql_benchmarks) @@ -340,6 +346,47 @@ def _format_pct(x: float) -> str: return f"{x * 100:.2f}%" +def _audit_signature(audit: ChannelAudit) -> tuple[Any, ...]: + """Hashable signature used to group columns whose audits are identical.""" + + return ( + audit.n_train, + audit.n_test, + audit.train_conversion_rate, + audit.test_conversion_rate, + tuple(_stats_signature(c) for c in audit.channels), + audit.rate_spread, + audit.univariate_auc_in_sample, + audit.univariate_auc_out_of_sample, + ) + + +def _stats_signature(stats: ChannelStats) -> tuple[Any, ...]: + """Hashable tuple representing one ``ChannelStats``.""" + + return (stats.name, stats.n, stats.share, stats.n_converted, stats.conversion_rate) + + +def _group_identical_columns(audits: Sequence[ChannelAudit]) -> list[ChannelGroup]: + """Collapse columns whose audit values are byte-identical.""" + + groups: list[ChannelGroup] = [] + seen_signatures: dict[tuple[Any, ...], int] = {} + for audit in audits: + sig = _audit_signature(audit) + if sig in seen_signatures: + idx = seen_signatures[sig] + existing = groups[idx] + groups[idx] = ChannelGroup( + columns=existing.columns + (audit.column,), + audit=existing.audit, + ) + else: + seen_signatures[sig] = len(groups) + groups.append(ChannelGroup(columns=(audit.column,), audit=audit)) + return groups + + def render_markdown(report: AuditReport) -> str: """Render the audit report as Markdown.""" @@ -347,31 +394,34 @@ def render_markdown(report: AuditReport) -> str: lines.append("# Channel-signal audit — leadforge-lead-scoring-v1") lines.append("") lines.append( - "Audit produced by `scripts/audit_channel_signal.py`; see also " + "Audit produced by `scripts/audit_channel_signal.py`; see " "`docs/release/channel_signal_audit.json` for the machine-readable form." 
) lines.append("") lines.append( - "**Scope.** For every tier we compute per-channel conversion rates and the univariate " - "AUC of channel against `converted_within_90_days`, scored as the empirical positive " - "rate per channel (a 1-D Bayes classifier, equivalent to a saturated logistic " - "regression on one-hot channel features). Compared against the G2 / Gemini v2 industry " - "MQL→SQL benchmark band (SEO ~51%, PPC ~26%, Email <1%, surfaced in " - "`docs/external_review/summaries/recommendations_pass.md` recommendation #8)." + "**Scope.** For every tier we compute per-channel conversion rates on the train " + "split and the univariate AUC of channel against `converted_within_90_days`, " + "scored as the empirical positive rate per channel (a 1-D Bayes classifier). Two " + "AUCs are reported: an **in-sample** number (train rates → train labels — biased " + "upward by construction) and an **out-of-sample** number (train rates → test labels " + "— directly comparable to the `source_only` baselines in " + "`release/validation/validation_report.json`)." ) lines.append("") lines.append( - "**Caveat.** Industry benchmarks are MQL→SQL rates, not 90-day closed-won rates. They " - "are the closest public anchor for *how much* channel ought to matter; use them as a " - "band of reference, not a hard target." + "**Caveat on the industry benchmark.** The G2 / Gemini v2 numbers below are " + "single-step **MQL→SQL** rates (recommendation #8 in " + "`docs/external_review/summaries/recommendations_pass.md`). v1's label is " + "**90-day closed-won**, the entire funnel resolved. The two metrics are not " + "directly comparable; the table is reproduced for context only." 
) lines.append("") - lines.append("## Industry benchmark band") + lines.append("## Industry benchmark (context, not target)") lines.append("") lines.append("| Channel | MQL→SQL conversion rate |") lines.append("|---|---|") - for name, rate in sorted(report.industry_mql_to_sql_benchmarks.items()): + for name, rate in report.industry_mql_to_sql_benchmarks: lines.append(f"| {name} | {_format_pct(rate)} |") lines.append("") @@ -379,32 +429,65 @@ def render_markdown(report: AuditReport) -> str: lines.append(f"## Tier: `{tier.tier}`") lines.append("") lines.append( - f"`n_leads = {tier.n_leads}`, overall 90-day conversion rate " - f"{_format_pct(tier.conversion_rate_overall)}." + f"`n_train = {tier.n_train}` (90-day conversion rate " + f"{_format_pct(tier.train_conversion_rate)}); " + f"`n_test = {tier.n_test}` (rate " + f"{_format_pct(tier.test_conversion_rate)})." ) lines.append("") - for col in tier.columns: - lines.append(f"### Column: `{col.column}`") + groups = _group_identical_columns(tier.columns) + for group in groups: + cols_label = ", ".join(f"`{c}`" for c in group.columns) + if len(group.columns) > 1: + heading = f"### Columns: {cols_label} (audit values identical)" + else: + heading = f"### Column: {cols_label}" + lines.append(heading) lines.append("") lines.append( - f"Univariate AUC: **{col.univariate_auc:.4f}** · " - f"Per-channel rate spread (max − min): **{col.rate_spread:.4f}** · " - f"Verdict: **{_classify_signal(col)} signal**" + f"Per-channel rate spread (max − min): **{group.audit.rate_spread:.4f}** · " + f"In-sample univariate AUC: **{group.audit.univariate_auc_in_sample:.4f}** · " + f"Out-of-sample univariate AUC: **{group.audit.univariate_auc_out_of_sample:.4f}**" ) lines.append("") - lines.append("| Channel | n | Share | Converted | Conversion rate |") + lines.append("| Channel | n (train) | Share (train) | Converted (train) | Train rate |") lines.append("|---|---:|---:|---:|---:|") - for ch in col.channels: + for ch in group.audit.channels: 
lines.append( f"| `{ch.name}` | {ch.n} | {_format_pct(ch.share)} | " f"{ch.n_converted} | {_format_pct(ch.conversion_rate)} |" ) lines.append("") - lines.append("## Verdict") + lines.append("## Discussion") lines.append("") - lines.append(_verdict_paragraph(report)) + lines.append( + "The numbers above answer one question: *how strongly does channel alone signal " + "90-day conversion in v1?* They do not answer *whether v1 matches industry channel " + "performance*, since the benchmarks measure a different funnel transition (single " + "MQL→SQL step) and v1 measures the entire funnel resolved over 90 days. Treat the " + "v1 numbers as an internal description of the simulator's channel signal." + ) + lines.append("") + lines.append("Two empirical observations a reader can make from the numbers above:") + lines.append("") + lines.append( + "1. **The out-of-sample univariate AUC reproduces the `source_only` baseline** in " + "`release/validation/validation_report.json` (HistGBM trained on `lead_source` + " + "`first_touch_channel` against the same test split). For seed 42 the OOS numbers " + "below match the report cell-for-cell. The in-sample number is biased upward by " + "construction — small at v1's N but visible — so the OOS number is the one to " + "compare against any external baseline." + ) + lines.append( + "2. **Out-of-sample univariate AUC is close to chance** in every tier and the " + "per-channel conversion-rate spread is small (≤0.05). Channel alone is a weak " + "feature in v1 — consistent with the design: the simulator drives conversion " + "through motif-family hazards keyed off latent traits, not channel-conditional " + "probabilities. Channel-conditional encoding is tracked as post-v1 work in " + "`docs/release/post_v1_roadmap.md`." 
+ ) lines.append("") return "\n".join(lines) diff --git a/tests/scripts/test_audit_channel_signal.py b/tests/scripts/test_audit_channel_signal.py index ff46dea..6bc5784 100644 --- a/tests/scripts/test_audit_channel_signal.py +++ b/tests/scripts/test_audit_channel_signal.py @@ -1,9 +1,17 @@ """Tests for ``scripts/audit_channel_signal.py``. -Exercises the per-channel rollup, univariate-AUC scorer, and the JSON + -markdown rendering paths. A determinism guard ensures the script's -output is byte-identical across runs against the committed -``release/`` bundles. +Exercises the per-channel rollup, in-sample / out-of-sample univariate +AUC scorers, the JSON + markdown rendering paths, and two integrity +properties against the committed ``release/`` bundles: + +1. ``lead_source`` and ``first_touch_channel`` carry identical values in + every tier (the feature dictionary's claim). +2. The committed ``docs/release/channel_signal_audit.{md,json}`` are + byte-identical to a fresh run of the audit script. + +Both properties fail loudly if the bundles are regenerated without +re-running the audit, or if the simulator ever diverges the two +channel columns. """ from __future__ import annotations @@ -26,18 +34,23 @@ _spec.loader.exec_module(audit_module) +_INTRO_TRAIN = ( + _REPO_ROOT / "release" / "intro" / "tasks" / "converted_within_90_days" / "train.parquet" +) +_RELEASE_BUNDLES_PRESENT = _INTRO_TRAIN.exists() + +_TIERS = ("intro", "intermediate", "advanced") + + # --------------------------------------------------------------------------- -# Synthetic fixture +# Synthetic fixtures # --------------------------------------------------------------------------- -def _toy_train(n_per_channel: int = 20) -> pd.DataFrame: +def _toy_split(n_per_channel: int = 20) -> pd.DataFrame: """Three channels with deliberately different conversion rates. - Channel rates: ``A`` 100%, ``B`` 50%, ``C`` 0%. 
Univariate AUC for - a perfectly separating saturated classifier on this is 1.0 only if - ``B`` is treated as a tied middle class — otherwise it's the - standard 1-D Bayes AUC against a 3-bucket score. + Channel rates: ``A`` 100%, ``B`` 50%, ``C`` 0%. """ rows = [] @@ -59,11 +72,11 @@ def _toy_train(n_per_channel: int = 20) -> pd.DataFrame: def test_audit_channel_returns_per_channel_stats() -> None: - df = _toy_train() - audit = audit_module.audit_channel(df, "lead_source") + train = _toy_split() + audit = audit_module.audit_channel(train, "lead_source", test=train) assert audit.column == "lead_source" - assert audit.n_total == 60 - assert audit.overall_conversion_rate == pytest.approx(0.5) + assert audit.n_train == 60 + assert audit.train_conversion_rate == pytest.approx(0.5) names = [c.name for c in audit.channels] assert names == ["A", "B", "C"] # sorted by name by_name = {c.name: c for c in audit.channels} @@ -73,39 +86,71 @@ def test_audit_channel_returns_per_channel_stats() -> None: assert audit.rate_spread == pytest.approx(1.0) -def test_audit_channel_univariate_auc_perfectly_separable() -> None: - df = _toy_train() - audit = audit_module.audit_channel(df, "lead_source") - # 20 pos from A (rate 1.0), 10 pos / 10 neg from B (rate 0.5, tied), - # 20 neg from C (rate 0.0). Pair-counting AUC: - # A_pos vs B_neg : 200 wins - # A_pos vs C_neg : 400 wins - # B_pos vs B_neg : 100 ties → +50 - # B_pos vs C_neg : 200 wins - # → 850 / 900 = 17/18. - assert audit.univariate_auc == pytest.approx(17 / 18) +def test_audit_channel_in_sample_auc_pair_counting() -> None: + """Closed-form check of the in-sample univariate AUC. + + 20 pos from A (rate 1.0), 10 pos / 10 neg from B (rate 0.5, tied), + 20 neg from C (rate 0.0). Pair-counting AUC: + A_pos vs B_neg : 200 wins + A_pos vs C_neg : 400 wins + B_pos vs B_neg : 100 ties → +50 + B_pos vs C_neg : 200 wins + → 850 / 900 = 17/18. 
+ """ + + train = _toy_split() + audit = audit_module.audit_channel(train, "lead_source", test=train) + assert audit.univariate_auc_in_sample == pytest.approx(17 / 18) + + +def test_audit_channel_oos_auc_matches_in_sample_when_test_is_train() -> None: + """When the test split is the train split, OOS AUC == in-sample AUC.""" + + train = _toy_split() + audit = audit_module.audit_channel(train, "lead_source", test=train) + assert audit.univariate_auc_out_of_sample == pytest.approx(audit.univariate_auc_in_sample) + + +def test_audit_channel_oos_auc_handles_unseen_test_categories() -> None: + """Test categories not present in train get the train base rate fallback.""" + + train = _toy_split() + test = pd.DataFrame( + { + "lead_source": ["A", "B", "C", "Z", "Z"], # Z is unseen + "first_touch_channel": ["A", "B", "C", "Z", "Z"], + "converted_within_90_days": [True, True, False, True, False], + } + ) + audit = audit_module.audit_channel(train, "lead_source", test=test) + # AUC is well-defined (no NaN) — the unseen categories fall back to + # the train base rate (0.5), which produces ties against any seen + # category whose rate also equals 0.5. 
+ assert 0.0 <= audit.univariate_auc_out_of_sample <= 1.0 def test_audit_channel_handles_single_class_label() -> None: - df = _toy_train() - df["converted_within_90_days"] = False - audit = audit_module.audit_channel(df, "lead_source") - assert audit.univariate_auc == 0.5 # AUC undefined → reported as chance + train = _toy_split() + train["converted_within_90_days"] = False + audit = audit_module.audit_channel(train, "lead_source", test=train) + assert audit.univariate_auc_in_sample == 0.5 + assert audit.univariate_auc_out_of_sample == 0.5 def test_audit_channel_raises_on_missing_column() -> None: - df = _toy_train() + train = _toy_split() with pytest.raises(KeyError): - audit_module.audit_channel(df, "no_such_column") + audit_module.audit_channel(train, "no_such_column", test=train) def test_audit_tier_runs_every_channel_column() -> None: - df = _toy_train() - tier = audit_module.audit_tier(df, "intro") + train = _toy_split() + tier = audit_module.audit_tier(train, "intro", test=train) cols = {c.column for c in tier.columns} assert cols == {"lead_source", "first_touch_channel"} assert tier.tier == "intro" - assert tier.n_leads == 60 + assert tier.n_train == 60 + assert tier.n_test == 60 # --------------------------------------------------------------------------- @@ -113,9 +158,9 @@ def test_audit_tier_runs_every_channel_column() -> None: # --------------------------------------------------------------------------- -def test_build_report_round_trips_through_render_json() -> None: - df = _toy_train() - tier = audit_module.audit_tier(df, "intro") +def test_render_json_round_trip() -> None: + train = _toy_split() + tier = audit_module.audit_tier(train, "intro", test=train) report = audit_module.AuditReport( release_dir="release", task="converted_within_90_days", @@ -127,12 +172,16 @@ def test_build_report_round_trips_through_render_json() -> None: js = audit_module.render_json(report) parsed = json.loads(js) assert parsed["tiers"][0]["tier"] == "intro" + # Industry 
benchmarks render as a {name: rate} dict in the JSON + # (renderer converts the immutable tuple-of-pairs back). assert parsed["industry_mql_to_sql_benchmarks"]["SEO"] == pytest.approx(0.51) -def test_render_markdown_includes_verdict_section() -> None: - df = _toy_train() - tier = audit_module.audit_tier(df, "intro") +def test_render_markdown_collapses_identical_columns() -> None: + """When two columns produce identical audits, the renderer groups them.""" + + train = _toy_split() # lead_source == first_touch_channel by construction + tier = audit_module.audit_tier(train, "intro", test=train) report = audit_module.AuditReport( release_dir="release", task="converted_within_90_days", @@ -142,26 +191,52 @@ def test_render_markdown_includes_verdict_section() -> None: industry_mql_to_sql_benchmarks=audit_module.INDUSTRY_MQL_TO_SQL_BENCHMARKS, ) md = audit_module.render_markdown(report) - assert "## Verdict" in md - assert "## Industry benchmark band" in md - assert "Tier: `intro`" in md + assert "audit values identical" in md + # Each tier should render the columns once, not twice. 
+ assert md.count("Per-channel rate spread") == 1 -# --------------------------------------------------------------------------- -# CLI determinism (guards against accidental nondeterminism in either -# the audit functions or the rendering layer) -# --------------------------------------------------------------------------- +def test_render_markdown_renders_distinct_columns_separately() -> None: + """When two columns differ, the renderer keeps them in separate sections.""" + + train = _toy_split() + train["first_touch_channel"] = "A" # force divergence from lead_source + tier = audit_module.audit_tier(train, "intro", test=train) + report = audit_module.AuditReport( + release_dir="release", + task="converted_within_90_days", + label_column="converted_within_90_days", + channel_columns=audit_module.CHANNEL_COLUMNS, + tiers=(tier,), + industry_mql_to_sql_benchmarks=audit_module.INDUSTRY_MQL_TO_SQL_BENCHMARKS, + ) + md = audit_module.render_markdown(report) + assert "audit values identical" not in md + assert md.count("Per-channel rate spread") == 2 -_INTRO_TRAIN = ( - _REPO_ROOT / "release" / "intro" / "tasks" / "converted_within_90_days" / "train.parquet" -) +def test_render_markdown_includes_discussion_section() -> None: + train = _toy_split() + tier = audit_module.audit_tier(train, "intro", test=train) + report = audit_module.AuditReport( + release_dir="release", + task="converted_within_90_days", + label_column="converted_within_90_days", + channel_columns=audit_module.CHANNEL_COLUMNS, + tiers=(tier,), + industry_mql_to_sql_benchmarks=audit_module.INDUSTRY_MQL_TO_SQL_BENCHMARKS, + ) + md = audit_module.render_markdown(report) + assert "## Discussion" in md + assert "## Industry benchmark (context, not target)" in md -@pytest.mark.skipif( - not _INTRO_TRAIN.exists(), - reason="release/intro bundle not present; skipping determinism guard", -) +# --------------------------------------------------------------------------- +# CLI determinism + error paths +# 
--------------------------------------------------------------------------- + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release/intro bundle not present") def test_release_audit_is_deterministic(tmp_path: Path) -> None: """Two back-to-back runs against the committed release bundle must produce byte-identical JSON and markdown output.""" @@ -218,7 +293,6 @@ def test_main_reports_missing_release_dir( def test_main_reports_missing_train_split( tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: - # Empty release dir — tier subdirectory missing. (tmp_path / "release").mkdir() rc = audit_module.main( [ @@ -235,3 +309,57 @@ def test_main_reports_missing_train_split( captured = capsys.readouterr() assert rc == 2 assert "missing train split" in captured.err + + +# --------------------------------------------------------------------------- +# Integrity properties against the committed release/ bundles +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release/ bundles not present") +@pytest.mark.parametrize("tier", _TIERS) +def test_lead_source_equals_first_touch_channel_in_v1(tier: str) -> None: + """Locks the feature-dict claim that the two channel columns are + identical in v1. 
If the simulator ever diverges them, this test + fails and ``docs/release/feature_dictionary.md`` must be updated.""" + + for split in ("train", "test", "valid"): + df = audit_module.load_split(_REPO_ROOT / "release", tier, split) + assert (df["lead_source"] == df["first_touch_channel"]).all(), ( + f"{tier}/{split}: lead_source diverges from first_touch_channel" + ) + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release/ bundles not present") +def test_committed_audit_artifacts_match_fresh_regeneration( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A fresh audit run against the committed bundles must match the + committed ``docs/release/channel_signal_audit.{md,json}`` exactly. + + If this fails, the bundles drifted without re-running the audit. + Regenerate via ``python scripts/audit_channel_signal.py`` from the + repo root. + """ + + # The committed JSON records ``release_dir`` as the literal path + # the developer passed on the command line. Re-run the audit + # exactly as the developer would: from the repo root, with the + # default (relative) ``release`` argument. 
+ monkeypatch.chdir(_REPO_ROOT) + + out_md = tmp_path / "audit.md" + out_json = tmp_path / "audit.json" + rc = audit_module.main( + [ + "--out-md", + str(out_md), + "--out-json", + str(out_json), + ] + ) + assert rc == 0 + committed_md = (_REPO_ROOT / "docs" / "release" / "channel_signal_audit.md").read_bytes() + committed_json = (_REPO_ROOT / "docs" / "release" / "channel_signal_audit.json").read_bytes() + assert out_md.read_bytes() == committed_md + assert out_json.read_bytes() == committed_json From 61964a99b793863a8b9d5a73383c2d25ad2972c5 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 16:28:36 +0300 Subject: [PATCH 6/7] =?UTF-8?q?docs(release):=20self-review=20fixes=20?= =?UTF-8?q?=E2=80=94=20README=20trim,=20citations,=20feature-dict=20consis?= =?UTF-8?q?tency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * release/README.md (~434 → ~228 lines): trimmed to a release-grade landing card. The full DGP, motif families, simulation simplifications, and module map move to docs/release/generation_method.md (linked). Macro-framing claim now cites docs/external_review/summaries/gemini_v2_summary.md as the source of the 30%→25% growth and CAC-ratio numbers (previously presented as if primary research). Composition + maintenance sections compressed into the table at the bottom. * docs/release/generation_method.md: dropped the "Where the code lives" module table. This doc is for external readers; module paths belong in the developer-facing design doc and architecture spec. Ends with a single short pointer to those. * docs/release/feature_dictionary.md: fixed a factually wrong claim about the leakage trap (the per-bundle CSV has columns ``name,dtype,description,category,is_target,leakage_risk`` — there is no ``is_leakage_trap`` column). 
Reworded the modelling-default checklist to acknowledge that the flat ``lead_scoring.csv`` and the Parquet task splits ship every column listed in the dictionary including the IDs — the recommendation says what to use as features, not what's in the file. Also notes that ``lead_source`` and ``first_touch_channel`` carry identical values in v1 (locked by the new test), so picking one is fine. Co-Authored-By: Claude Opus 4.7 --- docs/release/feature_dictionary.md | 50 +-- docs/release/generation_method.md | 20 +- release/README.md | 490 +++++++++-------------------- 3 files changed, 175 insertions(+), 385 deletions(-) diff --git a/docs/release/feature_dictionary.md b/docs/release/feature_dictionary.md index f4f60c8..feaff07 100644 --- a/docs/release/feature_dictionary.md +++ b/docs/release/feature_dictionary.md @@ -132,7 +132,7 @@ ships. | Column | Dtype | Why it ships | |---|---|---| -| `total_touches_all` | Int64 | Counts touches across the full 90-day horizon — not the snapshot window. Flagged `leakage_risk=True` and `is_leakage_trap=True` in the CSV; documented in `release/README.md`. The gap `total_touches_all − touch_count` carries label-correlated signal because high-converting leads accumulate more late-window touches in the simulator. **Drop this column from your features unless you are explicitly demonstrating leakage detection.** | +| `total_touches_all` | Int64 | Counts touches across the full 90-day horizon — not the snapshot window. Flagged `leakage_risk=True` in the CSV (the per-bundle dictionary has columns `name,dtype,description,category,is_target,leakage_risk`); documented in `release/README.md`. The gap `total_touches_all − touch_count` carries label-correlated signal because high-converting leads accumulate more late-window touches in the simulator. **Drop this column from your features unless you are explicitly demonstrating leakage detection.** | ## Target @@ -166,27 +166,35 @@ cross-seed × cross-tier metrics panel. 
## Recommended modelling defaults -A short opinionated checklist for a first model: - -1. Drop `lead_id`. Drop or bin `lead_created_at`. Drop - `total_touches_all`. Drop `account_id` / `contact_id` unless - you're joining the relational tables on purpose. -2. One-hot or target-encode the categorical columns - (`industry`, `region`, `employee_band`, - `estimated_revenue_band`, `process_maturity_band`, - `role_function`, `seniority`, `buyer_role`, - `lead_source`, `first_touch_channel`). -3. Keep all snapshot-window engagement and funnel features; the - `Float64` columns carry NaN for "no event in window", which is - itself a signal — encode missingness explicitly rather than - imputing to zero blindly. -4. For value-aware ranking, use `expected_acv` over - `opportunity_estimated_acv` since the latter is missing for - leads without an opportunity. Multiply by your model's predicted +A short opinionated checklist for a first model. Note: the flat +`lead_scoring.csv` and the per-task Parquet splits ship every column +in the table above, including the IDs — the recommendation is what to +**use as features**, not what's in the file. + +1. **Identifiers — drop before fitting.** `lead_id` is opaque and + carries no signal; drop it. `account_id` / `contact_id` are joinable + keys, useful only when you're computing cross-table aggregates; + drop from the feature matrix unless you actually use them. Drop or + bin `lead_created_at` — feeding raw timestamps to a linear model + is rarely what you want; use it as the cohort key for time-shift + evaluation instead. +2. **Trap — drop.** `total_touches_all` is the deliberate leakage + trap. Drop unless you're demonstrating leakage detection. +3. **Categoricals — encode.** One-hot or target-encode `industry`, + `region`, `employee_band`, `estimated_revenue_band`, + `process_maturity_band`, `role_function`, `seniority`, + `buyer_role`, `lead_source`, `first_touch_channel`. 
The two + channel columns carry identical values in v1; pick one. +4. **Engagement and funnel — keep all.** The `Float64` columns carry + NaN for "no event in window", which is itself a signal — encode + missingness explicitly rather than imputing to zero blindly. +5. **Value-aware ranking.** Use `expected_acv` over + `opportunity_estimated_acv`; the latter is missing for leads + without an opportunity. Multiply by your model's predicted probability for a default value-weighted ranker. -5. For cohort/time-shift evaluation, sort by `lead_created_at` and - split chronologically; the random-split AUC is *not* the - right number to report if your downstream use is forecasting. +6. **Cohort evaluation.** Sort by `lead_created_at` and split + chronologically; the random-split AUC is *not* the right number to + report if your downstream use is forecasting. ## See also diff --git a/docs/release/generation_method.md b/docs/release/generation_method.md index d19ef18..12029d3 100644 --- a/docs/release/generation_method.md +++ b/docs/release/generation_method.md @@ -153,26 +153,14 @@ declared in `docs/release/v1_acceptance_gates_bands.yaml`. free-text-job-title messiness). Both are tracked as post-v1 scope in `docs/release/post_v1_roadmap.md`. 
-## Where the code lives - -| Layer | Module | -|---|---| -| Recipe loader, config resolution | `leadforge/api/recipes.py` | -| Public API entry point | `leadforge/api/generator.py` | -| Hidden world DAG | `leadforge/structure/{graph,motifs,rewiring,sampler}.py` | -| Mechanism assignment | `leadforge/mechanisms/{policies,hazards,transitions,counts,categorical}.py` | -| Population draw | `leadforge/simulation/population.py` | -| 90-day daily simulator | `leadforge/simulation/engine.py` | -| Snapshot rendering | `leadforge/render/snapshots.py` | -| Snapshot-safe relational writer | `leadforge/render/relational_snapshot_safe.py` | -| Exposure-mode filtering | `leadforge/exposure/{modes,filters,metadata}.py` | -| Bundle writer | `leadforge/api/bundle.py` | -| Validation contract | `leadforge/validation/{bundle_checks,leakage_probes,release_quality,reporting,difficulty}.py` | +## Further reading For the deeper design rationale — why a DAG, why motif families, why event-derived labels, why public-vs-instructor — see [`docs/leadforge_design_doc.md`] and -[`docs/leadforge_architecture_spec.md`]. +[`docs/leadforge_architecture_spec.md`]. Both documents are aimed at +contributors and document the package internals; this doc stays at +the conceptual level external readers need. [`docs/leadforge_design_doc.md`]: ../leadforge_design_doc.md [`docs/leadforge_architecture_spec.md`]: ../leadforge_architecture_spec.md diff --git a/release/README.md b/release/README.md index 19aec97..e0a58d6 100644 --- a/release/README.md +++ b/release/README.md @@ -2,133 +2,75 @@ A relational, reproducible, three-tier synthetic CRM dataset family for teaching lead scoring at scale. Generated by -[leadforge](https://github.com/leadforge-dev/leadforge) — an -open-source Python framework for synthetic CRM/funnel data — and -released as `leadforge-lead-scoring-v1`. 
The framework version is -decoupled from the dataset version: the package stays at `1.x`; the -dataset is published under the explicit `…-v1` tag. +[leadforge](https://github.com/leadforge-dev/leadforge), an +open-source Python framework for synthetic CRM/funnel data. The +framework version is decoupled from the dataset version: the package +stays at `1.x`; the dataset is published under the explicit `…-v1` +tag. ## Why lead scoring matters in 2024–2026 -The 2024–2026 SaaS environment punishes inefficient sales motions: -median public-SaaS growth has slid from roughly 30% (2023) to about -25% (2025), and the New CAC Ratio rose sharply in 2024 — companies -spent close to $2 of sales-and-marketing for every $1 of net new ARR. -Mid-market vendors can no longer afford to chase every MQL. Predicting -*which* leads convert within a fixed window is now a survival skill, -not a marketing nicety. This dataset is built to teach exactly that -skill on a relational substrate, with the realistic confusions -(snapshot-window discipline, leakage traps, channel signal that's -weaker than vendor blogs imply) that students will hit when they -finally get hands on real CRM data. +Mid-market SaaS vendors entered 2024–2026 with growth slowing and +customer-acquisition costs rising[^macro], so predicting *which* leads +convert within a fixed window has moved from a marketing nicety to a +survival skill. This dataset teaches that skill on a relational +substrate, with the realistic confusions (snapshot-window discipline, +leakage traps, channel signal weaker than vendor blogs imply) that +students will hit when they finally get hands on real CRM data. + +[^macro]: Macroeconomic framing summarised in +[`docs/external_review/summaries/gemini_v2_summary.md`](../docs/external_review/summaries/gemini_v2_summary.md) +(median public-SaaS growth 30%→25% from 2023 to 2025; New CAC Ratio +rose materially in 2024). 
## What's inside ``` release/ -├── README.md # This file -├── LICENSE # MIT -├── intro/ # Difficulty tier 1 (highest signal, lowest noise) -│ ├── manifest.json # Provenance: seed, recipe, version, file hashes -│ ├── dataset_card.md # Per-bundle dataset card (auto-rendered) -│ ├── feature_dictionary.csv # Authoritative column spec -│ ├── lead_scoring.csv # Flat convenience CSV (all splits + split column) -│ ├── tables/ # 7 snapshot-safe relational tables -│ │ ├── accounts.parquet -│ │ ├── contacts.parquet -│ │ ├── leads.parquet # No `converted_within_90_days` / `conversion_timestamp` -│ │ ├── touches.parquet # Filtered to `<= lead_created_at + snapshot_day` -│ │ ├── sessions.parquet # Same window -│ │ ├── sales_activities.parquet # Same window -│ │ └── opportunities.parquet # Filtered + no `close_outcome` / `closed_at` -│ └── tasks/converted_within_90_days/ # Pre-split ML task -│ ├── train.parquet # 70% -│ ├── valid.parquet # 15% -│ ├── test.parquet # 15% -│ └── task_manifest.json -├── intermediate/ # Difficulty tier 2 (same shape) -├── advanced/ # Difficulty tier 3 (same shape) -├── intermediate_instructor/ # Research companion (full-horizon + metadata/) -│ ├── … same files plus all 9 relational tables -│ └── metadata/ # Hidden causal structure -│ ├── graph.{json,graphml} # World DAG -│ ├── world_spec.json # Full generation config -│ ├── latent_registry.json # Per-entity latent traits -│ └── mechanism_summary.json -├── notebooks/ -│ └── 01_baseline_lead_scoring.ipynb # Baseline modelling walkthrough (more notebooks land in Phase 6) -└── validation/ - ├── validation_report.{json,md} # Calibration / lift / leakage panel - └── figures/ # Lift curves, calibration, cohort shift, value capture +├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier +│ ├── manifest.json # provenance + file hashes +│ ├── dataset_card.md # auto-rendered per-bundle card +│ ├── feature_dictionary.csv # authoritative column spec +│ ├── lead_scoring.csv # flat 
convenience CSV (all splits) +│ ├── tables/*.parquet # 7 snapshot-safe relational tables +│ └── tasks/converted_within_90_days/{train,valid,test}.parquet +├── intermediate_instructor/ # research companion: full-horizon tables + metadata/ +├── notebooks/01_baseline_lead_scoring.ipynb +└── validation/ # validation_report.{json,md} + figures ``` -`student_public` bundles ship the snapshot-safe relational view. -`research_instructor` companions ship the full-horizon view and the -hidden causal truth in `metadata/`. The exposure-mode contract is -enforced in code (see "Public vs instructor: what's redacted" below). +`student_public` bundles ship the snapshot-safe relational view; +`research_instructor` companions ship the full-horizon view plus the +hidden causal structure (DAG, latent registry, mechanism summary) +under `metadata/`. The full layout is documented in each bundle's +`manifest.json`. ## Quick start -### Option 1 — flat CSV (simplest) - ```python -import pandas as pd - +# Flat CSV (assumes `import pandas as pd`) df = pd.read_csv("intermediate/lead_scoring.csv") -train = df[df["split"] == "train"].drop(columns=["split"]) -test = df[df["split"] == "test"].drop(columns=["split"]) -``` - -### Option 2 — Parquet task splits (recommended) - -```python -import pandas as pd +# Parquet task splits (recommended) train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet") test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet") -``` - -Engagement features (`touch_count`, `session_count`, -`pricing_page_views`, `expected_acv`, `days_since_last_touch`, …) -are computed strictly over events on days `[0, snapshot_day]` with -`snapshot_day = 30`. The label `converted_within_90_days` resolves -over the full 90-day window, so even when a lead converts on day 50, -the features cannot encode the conversion event. 
The deliberate -exception is `total_touches_all`, which counts the full-horizon touch -history and is **flagged** in `feature_dictionary.csv` as -`leakage_risk=True`. Drop it from your feature set unless you are -demonstrating leakage detection. -### Option 3 — relational tables (feature engineering) +# Relational tables (feature engineering) +leads = pd.read_parquet("intermediate/tables/leads.parquet") +touches = pd.read_parquet("intermediate/tables/touches.parquet") -```python -import pandas as pd - -accounts = pd.read_parquet("intermediate/tables/accounts.parquet") -leads = pd.read_parquet("intermediate/tables/leads.parquet") -touches = pd.read_parquet("intermediate/tables/touches.parquet") - -touch_counts = touches.groupby("lead_id").size().rename("my_touch_count") -features = leads.merge(accounts, on="account_id").merge(touch_counts, on="lead_id", how="left") +# Reproduce from source +# pip install leadforge +# leadforge generate --recipe b2b_saas_procurement_v1 --seed 42 \ +# --mode student_public --difficulty intermediate --out my_bundle ``` -Public relational tables are *snapshot-safe*: terminal outcome columns -are dropped, event tables are filtered to events on or before the -snapshot day, and conversion-conditional entities (`customers`, -`subscriptions`) are absent. Joining the public tables cannot -reconstruct the label. - -### Option 4 — reproduce from source - -```bash -pip install leadforge -leadforge generate \ - --recipe b2b_saas_procurement_v1 \ - --seed 42 \ - --mode student_public \ - --difficulty intermediate \ - --out my_bundle -``` +The label `converted_within_90_days` resolves over a 90-day window; +engagement features (`touch_count`, `session_count`, etc.) are +computed strictly over events on days `[0, 30]`. The deliberate +exception is `total_touches_all`, the leakage trap — flagged +`leakage_risk=True` in `feature_dictionary.csv`. Drop it from your +feature set unless you're demonstrating leakage detection. 
## Dataset summary @@ -137,7 +79,7 @@ leadforge generate \ | Leads | 5,000 | 5,000 | 5,000 | | Accounts | 1,500 | 1,500 | 1,500 | | Contacts | 4,200 | 4,200 | 4,200 | -| Snapshot columns | 32 (`student_public`) / 34 (`research_instructor`) | 32 / 34 | 32 / 34 | +| Snapshot columns | 32 / 34* | 32 / 34* | 32 / 34* | | Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` | | Conversion rate (recipe band) | 24–61% | 12–31% | 4–12% | | Conversion rate (median, seeds 42–46) | 42.67% | 21.60% | 8.40% | @@ -145,149 +87,56 @@ leadforge generate \ | Noise scale | 0.10 | 0.30 | 0.55 | | Missing rate | 2% | 8% | 18% | -Higher difficulty means weaker latent-to-feature signal, more noise, -more missingness, and lower base conversion rate — all modulated by -the simulation engine, not by post-hoc label-flipping. The full -calibration panel (per-tier ROC-AUC, AP, P@K, lift, calibration, -cross-seed bands, cohort-shift degradation) lives in -[`validation/validation_report.md`](validation/validation_report.md). +\* `student_public` / `research_instructor`. Difficulty is modulated +by the simulation engine — signal strength on latent-trait weights, +Gaussian noise on float features, MCAR missingness, outlier rate — +not post-hoc label flipping. ## The scenario -**Veridian Technologies** is a Series B startup (Austin, US) selling -**Veridian Procure**, a cloud-based procurement and AP-automation -platform, to mid-market firms (200–2,000 employees) in the US and UK. -The sales funnel runs through inbound marketing (45%), SDR outbound -(35%), and partner referrals (20%). Four buyer personas drive deals: -VP Finance (economic buyer), AP Manager (champion), IT Director -(technical evaluator), and Procurement Manager (end user). - -**Task:** predict whether a lead converts (`closed_won`) within 90 -days of entering the funnel. 
- -The scenario is fictional but the funnel structure, role mix, and -ACV bands ($18k–$120k) sit in mid-market B2B SaaS norms. See +**Veridian Technologies** is a fictional Series B startup (Austin, US) +selling **Veridian Procure**, a procurement / AP automation SaaS, to +mid-market firms (200–2,000 employees) in the US and UK. The funnel +runs through inbound marketing (45%), SDR outbound (35%), and +partner referrals (20%); four personas drive deals (VP Finance, AP +Manager, IT Director, Procurement Manager). **Task:** predict whether +a lead converts (`closed_won`) within 90 days. ACV bands are +$18k–$120k. See [`docs/release/generation_method.md`](../docs/release/generation_method.md) -for how the data is actually produced. - -## Generation method (one-paragraph version) - -The full method is documented in -[`docs/release/generation_method.md`](../docs/release/generation_method.md). -Briefly: a hidden DAG of latent traits and lead states is sampled -from one of five motif families (`fit_dominant`, `intent_dominant`, -`sales_execution_sensitive`, `demo_trial_mediated`, -`buying_committee_friction`) and stochastically rewired per seed. -Mechanisms (logistic latent scores, Poisson and recency-decayed -intensities, hazards, and stage transitions) are assigned per node -based on the motif family, calibrated so each tier hits its target -conversion-rate band. Accounts, contacts, and leads are sampled with -deterministic foreign keys; a 90-day daily simulator advances every -lead through the funnel; opportunities, customers, and subscriptions -materialise from `closed_won` events. -`converted_within_90_days` is **event-derived**, never sampled -directly. The renderer freezes a feature snapshot at day 30; the -label resolves over the full 90 days. - -## Simulation simplifications (what's modelled, what's approximate, what's not) - -This dataset is a teaching artifact, not a digital twin. 
The list -below makes the abstraction boundary explicit so users don't read -realism into things that aren't there. - -**Modelled.** -- Five distinct motif families with motif-conditioned mechanism - assignments. -- 90-day daily-step simulation with stage transitions, conversion - hazards, churn, direct conversion, and post-conversion - opportunity / customer / subscription materialisation. -- Snapshot-window discipline: every public feature aggregates over - events on days `[0, 30]` only; the 90-day label window resolves - separately. -- Difficulty tiers as a bundle of (signal-strength, noise scale, - missingness rate, outlier rate) parameters tuned per tier. -- Recipe-driven narrative: industry mix, region mix, employee / - revenue bands, role / seniority distributions, channel split. - -**Approximate.** -- Lead-source channels (`inbound_marketing`, `sdr_outbound`, - `partner_referral`) are categorical labels, not channel-conditional - generative axes. The audit - [`docs/release/channel_signal_audit.md`](../docs/release/channel_signal_audit.md) - measures how strongly channel actually signals conversion in - v1: weak — univariate AUC ≤ 0.521 across all tiers, well below - the G2 / Gemini v2 industry MQL→SQL band (SEO ~51% vs Email <1%). - Real channel-conditional encoding is post-v1 work. -- Sales cycles. Whatever distribution falls out of the daily - hazards. Not log-normal / Weibull-tuned to reproduce the - industry-typical ~84-day median. -- Demographic strings. Job titles and roles are clean categorical - labels, not free-text variants ("VP of Operations" vs "Head of - Ops" vs "Operations VP"). No NLP cleanup is required. -- Industry calibration. Conversion-rate bands are tuned for v1's - fictional vertical, not anchored to per-vertical CRM data - (cybersecurity, fintech, etc.). - -**Not modelled.** -- Macroeconomic shocks, seasonality, fiscal-quarter close cycles. 
-- Real customer support / churn dynamics post-conversion (the - customer + subscription tables exist for relational completeness - but are not the modelling target in v1). -- Multi-product / cross-sell motions. One product, one task. -- Deliberate noise injection at the *string* level (typos, - capitalisation, encoding). Free-text-cleanup work is post-v1. - -The post-v1 roadmap -([`docs/release/post_v1_roadmap.md`](../docs/release/post_v1_roadmap.md)) -tracks each "approximate" / "not modelled" axis with an explicit -v2 candidate scope. +for the full DGP, and the deeper "what's modelled / approximate / not +modelled" breakdown that this README only summarises. ## Public vs instructor: what's redacted -Filtering happens **during rendering**, not during simulation, and -the redaction contract is single-sourced in -[`leadforge/validation/leakage_probes.py`](../leadforge/validation/leakage_probes.py). -The same constants are imported by the snapshot-safe writer -([`leadforge/render/relational_snapshot_safe.py`](../leadforge/render/relational_snapshot_safe.py)) -and by the validator that polices public bundles, so the writer -and the gate cannot drift apart. +Filtering happens **during rendering**, not during simulation. The +redaction contract is single-sourced in +[`leadforge/validation/leakage_probes.py`](../leadforge/validation/leakage_probes.py); +the snapshot-safe writer and the validator import the same constants, +so they cannot drift apart. 
-| Constant | Public bundle treatment | +| Source-of-truth constant | Public bundle treatment | |---|---| | `BANNED_LEAD_COLUMNS = ("converted_within_90_days", "conversion_timestamp")` | Dropped from `tables/leads.parquet` | | `BANNED_OPP_COLUMNS = ("close_outcome", "closed_at")` | Dropped from `tables/opportunities.parquet` | -| `BANNED_TABLES = ("customers", "subscriptions")` | Omitted from public bundles entirely | +| `BANNED_TABLES = ("customers", "subscriptions")` | Omitted from public bundles | | `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` | -| Snapshot-feature redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` | -| `total_touches_all` (the deliberate trap) | **Retained** in both modes; flagged `leakage_risk=True` in `feature_dictionary.csv` | +| Snapshot redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` | +| `total_touches_all` (deliberate trap) | **Retained in both modes**; flagged `leakage_risk=True` | -The `manifest.json` for each bundle records `relational_snapshot_safe` -(true for `student_public`, false for `research_instructor`), -`redacted_columns`, and `snapshot_day`; the bundle is +Each bundle's `manifest.json` records `relational_snapshot_safe`, +`redacted_columns`, and `snapshot_day`, so the bundle is self-describing. -The instructor companion (`intermediate_instructor/`) ships the -full-horizon view: all 9 relational tables, no column drops, plus -the hidden causal structure under `metadata/`. It is **not** -appropriate input for the student-facing task. 
- ## Calibration -Every numeric claim about realism, calibration, or difficulty in -this README is backed by +Every realism / calibration / difficulty claim in this README is +backed by [`validation/validation_report.md`](validation/validation_report.md), -which is regenerated by -[`scripts/validate_release_candidate.py`](../scripts/validate_release_candidate.py). -The driver runs an N=5 cross-seed sweep per tier (seeds 42–46), -applies the bands declared in -[`docs/release/v1_acceptance_gates_bands.yaml`](../docs/release/v1_acceptance_gates_bands.yaml), -and exits non-zero if anything falls outside band. The full gate -list is in -[`docs/release/v1_acceptance_gates.md`](../docs/release/v1_acceptance_gates.md); -every dataset-card claim cites the JSON path on the report so -machine-readable verification is possible. - -Headline cross-seed medians for the canonical sweep: +regenerated by +[`scripts/validate_release_candidate.py`](../scripts/validate_release_candidate.py) +with bands declared in +[`docs/release/v1_acceptance_gates_bands.yaml`](../docs/release/v1_acceptance_gates_bands.yaml). +Headline cross-seed medians (seeds 42–46): | Tier | LR AUC | AP | P@100 | Brier | |---|---|---|---|---| @@ -296,139 +145,84 @@ Headline cross-seed medians for the canonical sweep: | advanced | 0.886 | 0.351 | 0.34 | 0.061 | AP, P@100, conversion-rate, and lift orderings hold across the -intended difficulty axis (intro > intermediate > advanced); see -[`validation/validation_report.md`](validation/validation_report.md) -for cross-seed spreads, calibration bins, lift curves, and -cohort-shift degradation. +intended difficulty axis (intro > intermediate > advanced). ## Intended uses -- Teaching baseline lead-scoring modelling on a flat snapshot. -- Teaching relational feature engineering against snapshot-safe - raw tables. -- Teaching leakage detection: the deliberate trap - (`total_touches_all`) is designed to be discoverable. +- Teaching baseline lead-scoring on a flat snapshot. 
+- Teaching relational feature engineering against snapshot-safe tables. +- Teaching leakage detection (the `total_touches_all` trap is + designed to be discoverable). - Teaching calibration, lift, P@K, value-aware ranking (`expected_acv × P(convert)`), and cohort-shift evaluation. -- Comparing model families (linear vs tree) under a controlled DGP. +- Comparing model families under a controlled DGP. ## Out-of-scope uses -- **Production lead scoring.** This is synthetic data; the company, - product, and customers are fictional. Do not deploy a model - trained on `leadforge-lead-scoring-v1` against real leads. -- **Vendor benchmarking or paper baselines.** The difficulty tiers - are calibrated for pedagogy; cross-paper comparability is not a - design goal. -- **Causal inference research that requires recovery of the true - DGP.** The instructor companion exposes the hidden graph for - teaching purposes, but real causal-inference benchmarks need - designed counterfactuals, not a sampled DAG. +- **Production lead scoring.** The company, product, and customers are + fictional. +- **Vendor benchmarking / paper baselines.** Difficulty tiers are + calibrated for pedagogy, not cross-paper comparability. +- **Causal-inference research that requires recovery of the true DGP.** + The instructor companion exposes the hidden graph for teaching, not + designed counterfactuals. - **Demographic / fairness research.** v1 does not model protected - attributes or sensitive demographic axes; any "bias" you find is - a bug in the simulation, not a teaching artefact. + attributes. ## Known limitations -- **Difficulty signal on raw AUC is flat.** Across the canonical - sweep, LR AUC is ~0.88 in every tier. Difficulty is visible in - AP / P@K / Brier / value capture, not in AUC alone — the - validation report uses AP and P@K as the headline difficulty - axis. Treat AUC as a sanity check, not a difficulty signal. 
-- **GBM does not consistently beat LR on this snapshot - (gate G7.4.4).** Across the canonical sweep, the GBM−LR AUC - delta is slightly negative in every tier - (intro −0.0045, intermediate −0.0072, advanced −0.0133). v1's - snapshot is dominated by linear features (engagement aggregates - + firmographics) and a HistGBM does not consistently beat a - regularised logistic regression at this signal level. The - cross-tier sign check is therefore *informational* in v1; the - per-tier `gbm_minus_lr_auc` bands gate the release. v2 will - introduce non-linear interactions in the simulator (saturation - curves, threshold effects) so the gate bites; tracked in - [`docs/release/post_v1_roadmap.md`](../docs/release/post_v1_roadmap.md). -- **Channel signal is weak versus published industry data.** - Per [`docs/release/channel_signal_audit.md`](../docs/release/channel_signal_audit.md), - the largest per-channel rate spread is 0.043 and the largest - univariate AUC is 0.521 — well below the G2 / Gemini v2 - MQL→SQL band. Channel-conditional encoding is post-v1 work. -- **Cohort-shift degradation is small.** v1's bundles are - roughly IID-balanced over the 90-day horizon (no time-of-year - drift baked in). The cohort-shift gate (G6.4) is informational - in v1 and will bite in v2 once seasonality is injected. -- **Calibration is noisy at small per-bin n** in the advanced - tier (low base rate × small calibration bins). The Brier score - is the more reliable calibration signal at advanced; per-bin - calibration error is bounded by the gate but should not be - read as a precise miscalibration claim. 
- -## Composition (Datasheets-for-Datasets) - -- **Entities.** Accounts (the buying organisations), contacts (the - human stakeholders attached to accounts), leads (the funnel - unit; one per lead-creation event), touches (marketing/sales - contact events), sessions (web/trial sessions), - sales_activities (rep-logged activities), opportunities (sales - cycles attached to leads), and — instructor only — customers - and subscriptions (post-conversion entities). Per-row counts - are recorded in each bundle's `manifest.json` `tables` block. -- **Features.** 32 columns in the public student-facing snapshot, - grouped by category in - [`docs/release/feature_dictionary.md`](../docs/release/feature_dictionary.md). - The authoritative per-bundle CSV is - `feature_dictionary.csv`; it carries dtype, description, - category, target flag, and leakage flag for every column. -- **Label.** `converted_within_90_days` (boolean), event-derived - from the simulator. Never sampled directly. -- **Splits.** 70/15/15 train/valid/test, deterministic given the - bundle seed; recorded in `tasks/converted_within_90_days/task_manifest.json`. -- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, - package version stamped in `manifest.json`. The full hidden - DAG, latent registry, and mechanism summary are in the - instructor companion's `metadata/` directory. - -## Maintenance - -- **Versioning.** This is dataset version `v1`; the framework that - produced it is at package version `1.0.0+`. Future dataset - versions are tagged independently as `leadforge-lead-scoring-v2`, - etc.; the framework version is a separate axis. See - [`docs/release/v1_release_design.md`](../docs/release/v1_release_design.md) - for the rationale. -- **Issue templates.** `.github/ISSUE_TEMPLATE/` ships - `dataset_breakage_report.yml` (for "I broke the dataset") and - `realism_feedback.yml` (for realism critiques) once Phase 6 - lands. -- **Adversarial framing.** We *want* the dataset to be broken. 
- See `docs/release/break_me_guide.md` (lands in Phase 6) for - explicit invitations to find direct leakage, reconstruct labels - through joins, beat the baseline lift legitimately, surface - unrealistic distributions, identify documentation ambiguity, - and propose better calibration sources. -- **v2 decision log.** Once Phase 6 ships, - `docs/release/v2_decision_log.md` will track every accepted - v1 finding and the design call that came from it. -- **Maintainers.** [leadforge-dev](https://github.com/leadforge-dev/leadforge) - on GitHub. File issues; PRs welcome. - -## Provenance +- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across + every tier. Difficulty is visible in AP, P@K, Brier, and value + capture. Treat AUC as a sanity check, not a difficulty signal. +- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta + is slightly negative in every tier (intro −0.0045, intermediate + −0.0072, advanced −0.0133); v1's snapshot is dominated by linear + features. v2 will inject non-linear interactions in the simulator. +- **Channel signal is weak.** Per + [`docs/release/channel_signal_audit.md`](../docs/release/channel_signal_audit.md), + out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across + all tiers and the per-channel rate spread is ≤0.05. The simulator + does not encode channel-conditional probabilities; channel-conditional + encoding is post-v1 work. +- **Cohort-shift degradation is small.** v1 has no time-of-year drift + baked in; the cohort-shift gate (G6.4) is informational and will + bite in v2. + +## Composition + +- **Entities.** Accounts, contacts, leads, touches, sessions, + sales_activities, opportunities (public); plus customers and + subscriptions (instructor only). Per-row counts per bundle live in + `manifest.json`. 
+- **Features.** 32 public columns grouped by analytical role in + [`docs/release/feature_dictionary.md`](../docs/release/feature_dictionary.md); + the per-bundle `feature_dictionary.csv` is the authoritative + machine-readable spec. +- **Label.** `converted_within_90_days` (boolean), event-derived from + the simulator. Never sampled directly. +- **Splits.** 70/15/15 train/valid/test, deterministic given seed; + recorded in `tasks/converted_within_90_days/task_manifest.json`. +- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package + version stamped in `manifest.json`. + +## Maintenance, adversarial framing, license + +We *want* the dataset to be broken. Issue templates ship under +`.github/ISSUE_TEMPLATE/` (Phase 6); the break-me guide lands as +`docs/release/break_me_guide.md` (PR 6.3). Once Phase 6 ships, +`docs/release/v2_decision_log.md` will track every accepted finding +and the design call that came from it. File issues at +[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge); +PRs welcome. | Field | Value | |---|---| -| Generator | [leadforge](https://github.com/leadforge-dev/leadforge) `1.0.0+` | +| Generator | leadforge `1.0.0+` | | Recipe | `b2b_saas_procurement_v1` | -| Canonical seed | 42 | -| Cross-seed sweep | 42, 43, 44, 45, 46 (per tier) | +| Canonical seed | 42 (cross-seed sweep: 42–46) | | Bundle schema version | 5 | | Format | Parquet (canonical) + CSV (convenience) | -| License | MIT | - -Every bundle includes a `manifest.json` with the exact package -version, recipe, seed, generation timestamp, snapshot day, label -window, table inventory with row counts, and SHA-256 hashes for -all data files. To verify integrity, install leadforge and run -`leadforge validate <bundle_dir>`. - -## License +| License | MIT — see [LICENSE](LICENSE) | -MIT. See [LICENSE](LICENSE). +Verify integrity with `leadforge validate <bundle_dir>`; every file +is hashed in `manifest.json`. 
From f84c8b6fff8c7b127e14a25119f41b9ad7f92a0b Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 17:02:59 +0300 Subject: [PATCH 7/7] fix(scripts,docs): address Copilot review threads on PR 4.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six fixes from the Copilot reviews on PR #69: * scripts/audit_channel_signal.py — _label_to_int now uses pd.api.types.is_bool_dtype() so it explicitly handles pandas nullable BooleanDtype (the actual parquet dtype on the v1 bundles) alongside numpy bool. Previously it worked via a coincidental pd.to_numeric fallback, with a comment that misled future readers. * scripts/audit_channel_signal.py — render_markdown now takes both md_path and json_path and emits the JSON link as a relative path to the markdown's directory, so a `--out-md`/`--out-json` override produces a markdown report whose link target is correct. Defaults to the canonical "channel_signal_audit.json" basename when called without paths (the unit-test path). * scripts/audit_channel_signal.py — main() pins encoding="utf-8" on both write_text() calls so the audit output is byte-identical across operating systems and locale configurations. * scripts/audit_channel_signal.py — Discussion section is no longer bundle-specific. The previous prose claimed "for seed 42 the OOS numbers below match the report cell-for-cell" — true for the committed bundle but wrong for any other --release-dir. The new prose talks about which AUC is comparable and what conclusion the numbers in the per-tier sections support, both bundle-agnostic. * release/README.md — fixed the relational-feature-engineering Quick start example. The previous snippet did `leads.merge(touch_counts, on="lead_id")` where touch_counts was a Series with lead_id in its index, not as a column — would error in modern pandas. The new snippet uses .reset_index() and merges the resulting DataFrame. 
* docs/release/feature_dictionary.md — touches_week_1 documented as "days 0–7 inclusive" (8 day values) and touches_last_7_days qualified with "for snapshot_day=30, days 24–30 inclusive". Previously claimed "days 0–6" for week_1, which mismatched the snapshot builder's _day <= 7 window. Test changes: * test_release_audit_is_deterministic now writes both runs to the same path (back-to-back overwrite) instead of distinct tmp paths, so the relative-link rendering doesn't make the two outputs differ. * test_committed_audit_artifacts_match_fresh_regeneration uses the canonical "channel_signal_audit.{md,json}" basenames in tmp_path, so the relative link in the regenerated markdown matches the committed file's link. Two stale Copilot threads (firmographics "Six columns" and "bandage" typo) were already addressed in commit f6b274e during the first self-review pass. 1175/1175 tests pass; ruff + mypy clean; the regenerated audit artifacts are byte-identical via the canonical-path mode. Co-Authored-By: Claude Opus 4.7 --- docs/release/channel_signal_audit.md | 6 +- docs/release/feature_dictionary.md | 4 +- release/README.md | 6 +- scripts/audit_channel_signal.py | 75 ++++++++++++++++------ tests/scripts/test_audit_channel_signal.py | 57 ++++++++-------- 5 files changed, 91 insertions(+), 57 deletions(-) diff --git a/docs/release/channel_signal_audit.md b/docs/release/channel_signal_audit.md index 4786ff8..2cc3d56 100644 --- a/docs/release/channel_signal_audit.md +++ b/docs/release/channel_signal_audit.md @@ -1,6 +1,6 @@ # Channel-signal audit — leadforge-lead-scoring-v1 -Audit produced by `scripts/audit_channel_signal.py`; see `docs/release/channel_signal_audit.json` for the machine-readable form. +Audit produced by `scripts/audit_channel_signal.py`; see `channel_signal_audit.json` for the machine-readable form. 
**Scope.** For every tier we compute per-channel conversion rates on the train split and the univariate AUC of channel against `converted_within_90_days`, scored as the empirical positive rate per channel (a 1-D Bayes classifier). Two AUCs are reported: an **in-sample** number (train rates → train labels — biased upward by construction) and an **out-of-sample** number (train rates → test labels — directly comparable to the `source_only` baselines in `release/validation/validation_report.json`). @@ -62,5 +62,5 @@ The numbers above answer one question: *how strongly does channel alone signal 9 Two empirical observations a reader can make from the numbers above: -1. **The out-of-sample univariate AUC reproduces the `source_only` baseline** in `release/validation/validation_report.json` (HistGBM trained on `lead_source` + `first_touch_channel` against the same test split). For seed 42 the OOS numbers below match the report cell-for-cell. The in-sample number is biased upward by construction — small at v1's N but visible — so the OOS number is the one to compare against any external baseline. -2. **Out-of-sample univariate AUC is close to chance** in every tier and the per-channel conversion-rate spread is small (≤0.05). Channel alone is a weak feature in v1 — consistent with the design: the simulator drives conversion through motif-family hazards keyed off latent traits, not channel-conditional probabilities. Channel-conditional encoding is tracked as post-v1 work in `docs/release/post_v1_roadmap.md`. +1. **The out-of-sample univariate AUC is the comparable number** for any external baseline. It uses train-derived rates scored against held-out test labels — the same shape as the `source_only` HistGBM baseline reported in `release/validation/validation_report.json`, which is built on the same task splits with `lead_source` + `first_touch_channel` as the only features. 
The in-sample number is biased upward by construction — small at v1's N but visible — and is reported here for transparency rather than comparison. +2. **The numerical conclusion is bundle-specific.** When the per-channel rate spread is small and the OOS univariate AUC is close to chance, channel alone is a weak feature for the bundle this audit was run against. v1's bundles currently produce that outcome (see the per-tier sections above) — consistent with the design: the simulator drives conversion through motif-family hazards keyed off latent traits, not channel-conditional probabilities. Channel-conditional encoding is tracked as post-v1 work in `docs/release/post_v1_roadmap.md`. diff --git a/docs/release/feature_dictionary.md b/docs/release/feature_dictionary.md index feaff07..790354a 100644 --- a/docs/release/feature_dictionary.md +++ b/docs/release/feature_dictionary.md @@ -99,8 +99,8 @@ features cannot encode events that drove the late-window outcome. | `pricing_page_views` | Int64 | Cumulative pricing-page views across sessions. | | `demo_page_views` | Int64 | Cumulative demo-page views across sessions. | | `total_session_duration_seconds` | Int64 | Cumulative seconds across all sessions. | -| `touches_week_1` | Int64 | Touches in days 0–6 (early urgency proxy). | -| `touches_last_7_days` | Int64 | Touches in days 24–30 (late-window momentum proxy). | +| `touches_week_1` | Int64 | Touches in days 0–7 inclusive (early urgency proxy; the snapshot builder uses `_day <= 7`, which is 8 day values). | +| `touches_last_7_days` | Int64 | Touches in the last 7 days of the snapshot window — for `snapshot_day=30`, days 24–30 inclusive (the snapshot builder uses `_day > snapshot_day - 7`). | | `days_since_first_touch` | Float64 | NaN if the lead has had zero touches by snapshot day. 
| ## Funnel and sales-process diff --git a/release/README.md b/release/README.md index e0a58d6..cb1329c 100644 --- a/release/README.md +++ b/release/README.md @@ -55,9 +55,13 @@ df = pd.read_csv("intermediate/lead_scoring.csv") train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet") test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet") -# Relational tables (feature engineering) +# Relational tables (feature engineering — example) leads = pd.read_parquet("intermediate/tables/leads.parquet") touches = pd.read_parquet("intermediate/tables/touches.parquet") +my_touch_count = ( + touches.groupby("lead_id").size().rename("my_touch_count").reset_index() +) +features = leads.merge(my_touch_count, on="lead_id", how="left") # Reproduce from source # pip install leadforge diff --git a/scripts/audit_channel_signal.py b/scripts/audit_channel_signal.py index feef6a1..12a50f4 100644 --- a/scripts/audit_channel_signal.py +++ b/scripts/audit_channel_signal.py @@ -150,10 +150,15 @@ class AuditReport: def _label_to_int(series: pd.Series) -> pd.Series: - """Coerce a (possibly nullable boolean) label to ``int``.""" + """Coerce a label column to ``int``. - if series.dtype == "bool": - return series.astype(int) + Handles three dtypes the v1 bundles actually carry: numpy ``bool``, + pandas nullable ``BooleanDtype`` (used by the parquet schema), and + plain numeric. Other dtypes raise via ``pd.to_numeric``. + """ + + if pd.api.types.is_bool_dtype(series): + return series.astype("Int64").astype(int) return pd.to_numeric(series, errors="raise").astype(int) @@ -387,15 +392,40 @@ def _group_identical_columns(audits: Sequence[ChannelAudit]) -> list[ChannelGrou return groups -def render_markdown(report: AuditReport) -> str: - """Render the audit report as Markdown.""" +def render_markdown( + report: AuditReport, + *, + md_path: Path | None = None, + json_path: Path | None = None, +) -> str: + """Render the audit report as Markdown. 
+ + The inline "see also" link to the machine-readable sibling adapts + to the actual output paths: when ``md_path`` and ``json_path`` are + given, the link is the JSON path expressed *relative to the + markdown file's directory* so it works whether the artifacts are + written to the canonical ``docs/release/`` location, a tmp + directory, or anywhere a CI script overrides. When either is + omitted, the link is the canonical ``channel_signal_audit.json`` + filename. + """ + + if md_path is not None and json_path is not None: + try: + json_link = str(Path(json_path).relative_to(Path(md_path).parent)) + except ValueError: + # json_path is not under the markdown's directory (or is on a + # different drive) — fall back to the caller's path verbatim. + json_link = str(json_path) + else: + json_link = DEFAULT_OUT_JSON.name lines: list[str] = [] lines.append("# Channel-signal audit — leadforge-lead-scoring-v1") lines.append("") lines.append( "Audit produced by `scripts/audit_channel_signal.py`; see " - "`docs/release/channel_signal_audit.json` for the machine-readable form." + f"`{json_link}` for the machine-readable form." ) lines.append("") lines.append( @@ -473,19 +503,22 @@ def render_markdown(report: AuditReport) -> str: lines.append("Two empirical observations a reader can make from the numbers above:") lines.append("") lines.append( - "1. **The out-of-sample univariate AUC reproduces the `source_only` baseline** in " - "`release/validation/validation_report.json` (HistGBM trained on `lead_source` + " - "`first_touch_channel` against the same test split). For seed 42 the OOS numbers " - "below match the report cell-for-cell. The in-sample number is biased upward by " - "construction — small at v1's N but visible — so the OOS number is the one to " - "compare against any external baseline." + "1. **The out-of-sample univariate AUC is the comparable number** for any " + "external baseline. 
It uses train-derived rates scored against held-out test " + "labels — the same shape as the `source_only` HistGBM baseline reported in " + "`release/validation/validation_report.json`, which is built on the same task " + "splits with `lead_source` + `first_touch_channel` as the only features. The " + "in-sample number is biased upward by construction — small at v1's N but " + "visible — and is reported here for transparency rather than comparison." ) lines.append( - "2. **Out-of-sample univariate AUC is close to chance** in every tier and the " - "per-channel conversion-rate spread is small (≤0.05). Channel alone is a weak " - "feature in v1 — consistent with the design: the simulator drives conversion " - "through motif-family hazards keyed off latent traits, not channel-conditional " - "probabilities. Channel-conditional encoding is tracked as post-v1 work in " + "2. **The numerical conclusion is bundle-specific.** When the per-channel rate " + "spread is small and the OOS univariate AUC is close to chance, channel alone " + "is a weak feature for the bundle this audit was run against. v1's bundles " + "currently produce that outcome (see the per-tier sections above) — consistent " + "with the design: the simulator drives conversion through motif-family hazards " + "keyed off latent traits, not channel-conditional probabilities. " + "Channel-conditional encoding is tracked as post-v1 work in " "`docs/release/post_v1_roadmap.md`." 
) lines.append("") @@ -573,13 +606,15 @@ def main(argv: Sequence[str] | None = None) -> int: print(f"error: required column missing: {exc}", file=sys.stderr) return 2 - md = render_markdown(report) + md = render_markdown(report, md_path=args.out_md, json_path=args.out_json) js = render_json(report) args.out_md.parent.mkdir(parents=True, exist_ok=True) args.out_json.parent.mkdir(parents=True, exist_ok=True) - args.out_md.write_text(md) - args.out_json.write_text(js) + # Pin UTF-8 explicitly so the audit output is byte-identical across + # operating systems and locale configurations. + args.out_md.write_text(md, encoding="utf-8") + args.out_json.write_text(js, encoding="utf-8") if args.print: sys.stdout.write(md) diff --git a/tests/scripts/test_audit_channel_signal.py b/tests/scripts/test_audit_channel_signal.py index 6bc5784..aad8ea9 100644 --- a/tests/scripts/test_audit_channel_signal.py +++ b/tests/scripts/test_audit_channel_signal.py @@ -241,35 +241,26 @@ def test_release_audit_is_deterministic(tmp_path: Path) -> None: """Two back-to-back runs against the committed release bundle must produce byte-identical JSON and markdown output.""" - out_md_a = tmp_path / "a.md" - out_json_a = tmp_path / "a.json" - out_md_b = tmp_path / "b.md" - out_json_b = tmp_path / "b.json" - - rc_a = audit_module.main( - [ - "--release-dir", - str(_REPO_ROOT / "release"), - "--out-md", - str(out_md_a), - "--out-json", - str(out_json_a), - ] - ) - rc_b = audit_module.main( - [ - "--release-dir", - str(_REPO_ROOT / "release"), - "--out-md", - str(out_md_b), - "--out-json", - str(out_json_b), - ] - ) - assert rc_a == 0 - assert rc_b == 0 - assert out_md_a.read_bytes() == out_md_b.read_bytes() - assert out_json_a.read_bytes() == out_json_b.read_bytes() + out_md = tmp_path / "audit.md" + out_json = tmp_path / "audit.json" + cli_args = [ + "--release-dir", + str(_REPO_ROOT / "release"), + "--out-md", + str(out_md), + "--out-json", + str(out_json), + ] + assert audit_module.main(cli_args) == 0 
+ bytes_md_a = out_md.read_bytes() + bytes_json_a = out_json.read_bytes() + + assert audit_module.main(cli_args) == 0 + bytes_md_b = out_md.read_bytes() + bytes_json_b = out_json.read_bytes() + + assert bytes_md_a == bytes_md_b + assert bytes_json_a == bytes_json_b def test_main_reports_missing_release_dir( @@ -348,8 +339,12 @@ def test_committed_audit_artifacts_match_fresh_regeneration( # default (relative) ``release`` argument. monkeypatch.chdir(_REPO_ROOT) - out_md = tmp_path / "audit.md" - out_json = tmp_path / "audit.json" + # The committed markdown links the JSON sibling by relative + # filename (rendered from --out-md and --out-json being siblings), + # so re-run with the same basenames so the byte comparison covers + # the full file including the link line. + out_md = tmp_path / "channel_signal_audit.md" + out_json = tmp_path / "channel_signal_audit.json" rc = audit_module.main( [ "--out-md",