leadforge-dev · shaypal5 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.agent-plan.md b/.agent-plan.md
@@ -395,6 +395,19 @@ From self-review of PR #50. Completed in a single follow-up PR.
 | Group followup params into dataclass | ✓ `FollowupRampConfig` frozen dataclass in `mechanisms/counts.py`. `LatentDecayIntensity` accepts `followup: FollowupRampConfig | None`. Legacy params still accepted for backward compat. |
 | Fix `subsample` silent short-return | ✓ `subsample()` now raises `ValueError` when insufficient negatives. |
 
+### mid-project: generation framework for 3-week pair mid-project dataset ✓
+
+Generator code only — no dataset artifacts committed here (public repo).
+Dataset artifacts live in `leadforge-datasets-private/lead_scoring_midproject/`.
+
+| Item | Status |
+|---|---|
+| `leadforge/pipelines/build_midproject.py` | ✓ Pipeline module (seed=100, SUBSAMPLE_N=1200) |
+| `scripts/build_midproject_lead_scoring.py` | ✓ Build CLI |
+| `scripts/validate_midproject_lead_scoring.py` | ✓ Validation script |
+| `scripts/quick_baseline_eval_midproject.py` | ✓ Baseline evaluation script |
+| Dataset artifacts | ✓ In `leadforge-datasets-private` (private repo) |
+
 ### From post-v1 list
 
 - Second vertical

diff --git a/leadforge/pipelines/build_midproject.py b/leadforge/pipelines/build_midproject.py
@@ -0,0 +1,79 @@
+"""Pipeline functions for building the mid-project lead scoring dataset.
+
+Produces a single student-safe CSV with 1,200 rows at ~30% conversion rate.
+No leakage trap column — this dataset is published directly to students.
+
+Key parameters vs v7:
+- SEED = 100  (different seed → different rows from v7's seed=42)
+- SUBSAMPLE_N = 1200 (slightly larger than v7's 1000)
+- No instructor/trap variant
+- Same schema, narrative, missingness patterns as v7 (missingness via `inject_missingness_v6`)
+"""
+
+from __future__ import annotations
+
+import pandas as pd
+
+from leadforge.pipelines.common import (
+    ACV_CAP,
+    ACV_FLOOR,
+    FINAL_COLUMNS_STUDENT,
+    RENAME_MAP,
+    TARGET_RATE,
+    assign_acquisition_wave,
+    derive_features,
+    softcap_expected_acv,
+    subsample,
+)
+from leadforge.pipelines.common import (
+    inject_missingness_v6 as inject_missingness,
+)
+from leadforge.pipelines.common import (
+    rename_and_select as _rename_and_select_generic,
+)
+
+__all__ = [  # shared helpers re-exported next to the mid-project constants and wrapper
+    "ACV_CAP",
+    "ACV_FLOOR",
+    "FINAL_COLUMNS_STUDENT",
+    "N_LEADS",
+    "RENAME_MAP",
+    "SEED",
+    "SNAPSHOT_DAY",
+    "SUBSAMPLE_N",
+    "TARGET_RATE",
+    "assign_acquisition_wave",
+    "derive_features",
+    "inject_missingness",
+    "rename_and_select",
+    "softcap_expected_acv",
+    "subsample",
+]
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+SEED = 100  # differs from v7's seed=42 so the generated rows differ
+N_LEADS = 5000  # leads generated before subsampling
+SNAPSHOT_DAY = 20  # day at which the build script renders the snapshot
+SUBSAMPLE_N = 1200  # rows kept after subsampling
+
+
+# ---------------------------------------------------------------------------
+# Version-specific pipeline steps
+# ---------------------------------------------------------------------------
+
+
+def rename_and_select(
+    df: pd.DataFrame, *, label_column: str = "converted_within_90_days"
+) -> pd.DataFrame:
+    """Map snapshot column names to the mid-project schema and keep the student columns.
+
+    Delegates to the shared rename/select helper with ``instructor=False``.
+    """
+    student_options = {
+        "rename_map": RENAME_MAP,
+        "final_columns": FINAL_COLUMNS_STUDENT,
+        "instructor": False,
+    }
+    return _rename_and_select_generic(df, label_column=label_column, **student_options)
diff --git a/scripts/build_midproject_lead_scoring.py b/scripts/build_midproject_lead_scoring.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""Build the mid-project lead scoring dataset.
+
+Usage:
+    python scripts/build_midproject_lead_scoring.py OUTPUT_DIR
+
+Produces one file in OUTPUT_DIR:
+    lead_scoring_midproject.csv   (student-safe, no leakage columns)
+
+1,200 rows at ~30% conversion rate, snapshot day 20.
+Seed: 100.  Schema identical to lead_scoring_intro_v7.csv.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pandas as pd
+
+from leadforge.api.generator import Generator
+from leadforge.pipelines.build_midproject import (
+    N_LEADS,
+    SEED,
+    SNAPSHOT_DAY,
+    SUBSAMPLE_N,
+    assign_acquisition_wave,
+    derive_features,
+    inject_missingness,
+    rename_and_select,
+    softcap_expected_acv,
+    subsample,
+)
+from leadforge.render.snapshots import build_snapshot
+
+# ---------------------------------------------------------------------------
+# Orchestration
+# ---------------------------------------------------------------------------
+
+
+def generate_bundle(seed: int = SEED, n_leads: int = N_LEADS, *, snapshot_day: int = SNAPSHOT_DAY):
+    """Generate a recipe bundle and return ``(snapshot, bundle)``.
+
+    The snapshot is built at ``snapshot_day`` (keyword-only; defaults to ``SNAPSHOT_DAY``), so
+    callers can render other days without editing the module constant.
+    """
+    gen = Generator.from_recipe(
+        "b2b_saas_procurement_v1",
+        seed=seed,
+        exposure_mode="research_instructor",
+        n_leads=n_leads,
+        difficulty="intro",
+    )
+    bundle = gen.generate(latent_touch_intensity=True)
+    snap = build_snapshot(bundle.simulation_result, bundle.population, snapshot_day=snapshot_day)
+    return snap, bundle
+
+
+def build_midproject_dataset(seed: int = SEED) -> pd.DataFrame:
+    """Build the student-facing frame: generate, derive, subsample, then add missingness.
+
+    Progress messages go to stderr; the returned frame is what the CLI writes to CSV.
+    """
+    print("Generating bundle...", file=sys.stderr)
+    snapshot, _ = generate_bundle(seed=seed)
+    conv = snapshot["converted_within_90_days"].mean()
+    print(f"  Raw snapshot: {len(snapshot)} rows, conversion={conv:.1%}", file=sys.stderr)
+
+    # Feature derivation and column selection run on the full snapshot, before subsampling.
+    df = derive_features(snapshot)
+    for seeded_step in (softcap_expected_acv, assign_acquisition_wave):
+        df = seeded_step(df, seed)
+    df = rename_and_select(df)
+
+    print(f"Subsampling to {SUBSAMPLE_N} rows...", file=sys.stderr)
+    df = subsample(df, seed, n=SUBSAMPLE_N)
+    print(
+        f"  Subsampled: {len(df)} rows, conversion={df['converted'].mean():.1%}",
+        file=sys.stderr,
+    )
+
+    # Missingness is injected last, on the subsampled rows only.
+    print("Injecting missingness...", file=sys.stderr)
+    return inject_missingness(df, seed)
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Write ``lead_scoring_midproject.csv`` into the directory given as argv[1]."""
+    args = sys.argv[1:]
+    if not args:
+        print(f"Usage: {sys.argv[0]} OUTPUT_DIR", file=sys.stderr)
+        sys.exit(1)
+
+    target_dir = Path(args[0])
+    target_dir.mkdir(parents=True, exist_ok=True)
+    out_path = target_dir / "lead_scoring_midproject.csv"
+
+    # Build first, then write without the pandas index column.
+    df = build_midproject_dataset()
+    df.to_csv(out_path, index=False)
+    summary = f"Midproject: {len(df)} rows x {len(df.columns)} cols → {out_path}"
+    print(summary, file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/quick_baseline_eval_midproject.py b/scripts/quick_baseline_eval_midproject.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""Quick baseline evaluation for the mid-project lead scoring dataset.
+
+Usage:
+    python scripts/quick_baseline_eval_midproject.py CSV_PATH
+
+Runs LR + RF + GBM baselines, value-aware scoring, and feature importance.
+"""
+
+from __future__ import annotations
+
+import sys
+
+import numpy as np
+import pandas as pd
+from sklearn.base import clone
+from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import average_precision_score, roc_auc_score
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+
+from leadforge.pipelines.common import BINARY_FEATURES, CAT_FEATURES, NUM_FEATURES, TARGET
+from leadforge.pipelines.ml import LEAKAGE_PREFIX, build_preprocessor, sanitize_categoricals
+
+_EVAL_NUM_FEATURES = NUM_FEATURES + BINARY_FEATURES  # binary features go in as numeric columns
+
+
+def main() -> None:  # CLI: evaluate baseline models on the CSV named by argv[1]
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} CSV_PATH", file=sys.stderr)
+        sys.exit(1)
+    # Load the data; leakage-prefixed columns are kept out of the feature lists.
+    df = sanitize_categoricals(pd.read_csv(sys.argv[1]), CAT_FEATURES)
+    leakage = {c for c in df.columns if c.startswith(LEAKAGE_PREFIX)}
+    cat_cols = [c for c in CAT_FEATURES if c in df.columns and c not in leakage]
+    num_cols = [c for c in _EVAL_NUM_FEATURES if c in df.columns and c not in leakage]
+    # Target is cast to int; features are categorical columns followed by numeric ones.
+    y = df[TARGET].astype(int)
+    x = df[cat_cols + num_cols]
+    # Dataset summary goes to stdout (usage errors above go to stderr).
+    print(f"Dataset: {len(df)} rows, {len(df.columns)} cols")
+    print(f"Conversion rate: {y.mean():.1%}")
+    print(f"Features: {len(cat_cols)} cat + {len(num_cols)} num = {len(cat_cols) + len(num_cols)}")
+
+    print("\n" + "=" * 60)
+    print("MODEL COMPARISON (5-seed average, 70/30 stratified)")
+    print("=" * 60)
+    # Each model is refit on five stratified splits (split seeds 42-46); AUCs are averaged.
+    models = {
+        "LR": LogisticRegression(max_iter=1000, solver="lbfgs", random_state=42),
+        "RF": RandomForestClassifier(n_estimators=100, random_state=42),
+        "GBM": GradientBoostingClassifier(n_estimators=100, random_state=42),
+    }
+    for name, clf in models.items():
+        aucs = []
+        for seed in range(42, 47):
+            x_tr, x_te, y_tr, y_te = train_test_split(
+                x, y, test_size=0.30, random_state=seed, stratify=y
+            )
+            pipe = Pipeline([("pre", build_preprocessor(num_cols, cat_cols)), ("clf", clone(clf))])
+            pipe.fit(x_tr, y_tr)
+            aucs.append(roc_auc_score(y_te, pipe.predict_proba(x_te)[:, 1]))
+        print(f"  {name:4s}: AUC = {np.mean(aucs):.4f} (std={np.std(aucs):.4f})")
+
+    print("\n" + "=" * 60)
+    print("DETAILED METRICS (seed 42)")
+    print("=" * 60)
+    # Single seed-42 split with logistic regression for the detailed numbers below.
+    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.30, random_state=42, stratify=y)
+    pipe = Pipeline(
+        [
+            ("pre", build_preprocessor(num_cols, cat_cols)),
+            ("clf", LogisticRegression(max_iter=1000, solver="lbfgs", random_state=42)),
+        ]
+    )
+    pipe.fit(x_tr, y_tr)
+    probs = pipe.predict_proba(x_te)[:, 1]
+    auc = roc_auc_score(y_te, probs)
+    pr_auc = average_precision_score(y_te, probs)
+    base = y_te.mean()
+    print(f"  AUC:    {auc:.4f}")
+    print(f"  PR-AUC: {pr_auc:.4f}")
+    print(f"  Base rate: {base:.1%}")
+    # Top-k precision / recall / lift, ranking test rows by predicted probability.
+    order = np.argsort(-probs)
+    y_sorted = y_te.values[order]
+    for k in [25, 50, 100]:
+        if k <= len(y_te):
+            prec = y_sorted[:k].mean()
+            rec = y_sorted[:k].sum() / y_te.sum()
+            lift = prec / base  # precision relative to the test base rate
+            print(f"  P@{k}={prec:.3f}  R@{k}={rec:.3f}  Lift@{k}={lift:.2f}x")
+    # Compare ranking by probability vs. by probability * expected_acv (expected value).
+    print("\nValue-aware ranking:")
+    test_acv = pd.to_numeric(df.loc[x_te.index, "expected_acv"], errors="coerce").fillna(0).values
+    test_conv = y_te.values
+    ev = probs * test_acv  # expected value per test lead
+    for k in [25, 50]:
+        top_prob = np.argsort(-probs)[:k]
+        cap_prob = np.sum(test_acv[top_prob] * test_conv[top_prob])  # ACV of converted leads
+        conv_prob = int(test_conv[top_prob].sum())
+        top_ev = np.argsort(-ev)[:k]
+        cap_ev = np.sum(test_acv[top_ev] * test_conv[top_ev])
+        conv_ev = int(test_conv[top_ev].sum())
+        uplift = (cap_ev - cap_prob) / cap_prob * 100 if cap_prob > 0 else 0.0  # in percent
+        print(
+            f"  K={k}: prob=${cap_prob:,.0f} (conv={conv_prob})  "
+            f"ev=${cap_ev:,.0f} (conv={conv_ev})  uplift={uplift:+.1f}%"
+        )
+    # Refit GBM on the seed-42 training split and list its 15 largest importances.
+    print("\nFeature importance (GBM):")
+    gbm_pipe = Pipeline(
+        [
+            ("pre", build_preprocessor(num_cols, cat_cols)),
+            ("clf", GradientBoostingClassifier(n_estimators=100, random_state=42)),
+        ]
+    )
+    gbm_pipe.fit(x_tr, y_tr)
+    importances = gbm_pipe.named_steps["clf"].feature_importances_
+    ohe = gbm_pipe.named_steps["pre"].named_transformers_["cat"].named_steps["encoder"]
+    cat_names = list(ohe.get_feature_names_out(cat_cols))
+    feature_names = num_cols + cat_names  # assumes numeric-then-one-hot order; TODO confirm
+    imp_df = pd.DataFrame({"feature": feature_names, "importance": importances})
+    imp_df = imp_df.sort_values("importance", ascending=False)
+    for _, row in imp_df.head(15).iterrows():
+        print(f"  {row['feature']:40s} {row['importance']:.4f}")
+    # Per-column count and share of missing values, printed only for columns that have any.
+    print("\nMissingness summary:")
+    for col in df.columns:
+        n_miss = df[col].isna().sum()
+        if n_miss > 0:
+            print(f"  {col}: {n_miss} ({n_miss / len(df):.1%})")
+
+
+if __name__ == "__main__":
+    main()