Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .agent-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,19 @@ From self-review of PR #50. Completed in a single follow-up PR.
| Group followup params into dataclass | ✓ `FollowupRampConfig` frozen dataclass in `mechanisms/counts.py`. `LatentDecayIntensity` accepts `followup: FollowupRampConfig | None`. Legacy params still accepted for backward compat. |
| Fix `subsample` silent short-return | ✓ `subsample()` now raises `ValueError` when insufficient negatives. |

### mid-project: generation framework for 3-week pair mid-project dataset ✓

Generator code only — no dataset artifacts committed here (public repo).
Dataset artifacts live in `leadforge-datasets-private/lead_scoring_midproject/`.

| Item | Status |
|---|---|
| `leadforge/pipelines/build_midproject.py` | ✓ Pipeline module (seed=100, SUBSAMPLE_N=1200) |
| `scripts/build_midproject_lead_scoring.py` | ✓ Build CLI |
| `scripts/validate_midproject_lead_scoring.py` | ✓ Validation script |
| `scripts/quick_baseline_eval_midproject.py` | ✓ Baseline evaluation script |
| Dataset artifacts | ✓ In `leadforge-datasets-private` (private repo) |

### From post-v1 list

- Second vertical
Expand Down
79 changes: 79 additions & 0 deletions leadforge/pipelines/build_midproject.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Pipeline functions for building the mid-project lead scoring dataset.

Produces a single student-safe CSV with 1,200 rows at ~30% conversion rate.
No leakage trap column — this dataset is published directly to students.

Key parameters vs v7:
- SEED = 100 (different seed → different rows from v7's seed=42)
- SUBSAMPLE_N = 1200 (slightly larger than v7's 1000)
- No instructor/trap variant
- Same schema, narrative, missingness patterns as v7
"""

from __future__ import annotations

import pandas as pd

from leadforge.pipelines.common import (
ACV_CAP,
ACV_FLOOR,
FINAL_COLUMNS_STUDENT,
RENAME_MAP,
TARGET_RATE,
assign_acquisition_wave,
derive_features,
softcap_expected_acv,
subsample,
)
from leadforge.pipelines.common import (
inject_missingness_v6 as inject_missingness,
)
from leadforge.pipelines.common import (
rename_and_select as _rename_and_select_generic,
)

# Public API of this module. Besides the midproject-specific constants and
# ``rename_and_select`` defined below, it re-exports the shared helpers and
# constants imported from ``leadforge.pipelines.common`` so that build scripts
# can pull every pipeline step from this single module.
__all__ = [
"ACV_CAP",
"ACV_FLOOR",
"FINAL_COLUMNS_STUDENT",
"N_LEADS",
"RENAME_MAP",
"SEED",
"SNAPSHOT_DAY",
"SUBSAMPLE_N",
"TARGET_RATE",
"assign_acquisition_wave",
"derive_features",
"inject_missingness",
"rename_and_select",
"softcap_expected_acv",
"subsample",
]

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Default random seed for the midproject build; per the module docstring it
# deliberately differs from v7's seed (42) so the generated rows differ.
SEED = 100
# Default number of leads the build script generates before subsampling.
N_LEADS = 5000
# Value the build script passes as ``snapshot_day`` when rendering the snapshot.
SNAPSHOT_DAY = 20
# Row count requested from ``subsample`` for the final dataset (v7 used 1000).
SUBSAMPLE_N = 1200


# ---------------------------------------------------------------------------
# Version-specific pipeline steps
# ---------------------------------------------------------------------------


def rename_and_select(
    df: pd.DataFrame,
    *,
    label_column: str = "converted_within_90_days",
) -> pd.DataFrame:
    """Apply the shared rename/select step with the midproject configuration.

    Delegates to the generic ``rename_and_select`` helper from
    ``leadforge.pipelines.common``, pinning the student-facing settings:
    ``RENAME_MAP`` for renaming, ``FINAL_COLUMNS_STUDENT`` as the output
    column set, and ``instructor=False``.

    Args:
        df: Snapshot-derived frame to rename and trim.
        label_column: Name of the label column, forwarded unchanged to the
            generic helper.

    Returns:
        Whatever the generic helper returns for the student configuration.
    """
    student_options = {
        "rename_map": RENAME_MAP,
        "final_columns": FINAL_COLUMNS_STUDENT,
        "instructor": False,
        "label_column": label_column,
    }
    return _rename_and_select_generic(df, **student_options)
111 changes: 111 additions & 0 deletions scripts/build_midproject_lead_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""Build the mid-project lead scoring dataset.

Usage:
python scripts/build_midproject_lead_scoring.py OUTPUT_DIR

Produces one file in OUTPUT_DIR:
lead_scoring_midproject.csv (student-safe, no leakage columns)

1,200 rows at ~30% conversion rate, snapshot day 20.
Seed: 100. Schema identical to lead_scoring_intro_v7.csv.
"""

from __future__ import annotations

import sys
from pathlib import Path

import pandas as pd

from leadforge.api.generator import Generator
from leadforge.pipelines.build_midproject import (
N_LEADS,
SEED,
SNAPSHOT_DAY,
SUBSAMPLE_N,
assign_acquisition_wave,
derive_features,
inject_missingness,
rename_and_select,
softcap_expected_acv,
subsample,
)
from leadforge.render.snapshots import build_snapshot

# ---------------------------------------------------------------------------
# Orchestration
# ---------------------------------------------------------------------------


def generate_bundle(seed: int = SEED, n_leads: int = N_LEADS):
    """Run the generator and render its snapshot at ``SNAPSHOT_DAY``.

    Args:
        seed: Seed handed to ``Generator.from_recipe``.
        n_leads: Number of leads requested from the recipe.

    Returns:
        A ``(snapshot, bundle)`` tuple: the output of ``build_snapshot`` and
        the full bundle it was built from.
    """
    recipe_options = {
        "seed": seed,
        "exposure_mode": "research_instructor",
        "n_leads": n_leads,
        "difficulty": "intro",
    }
    generator = Generator.from_recipe("b2b_saas_procurement_v1", **recipe_options)
    bundle = generator.generate(latent_touch_intensity=True)

    simulation_result, population = bundle.simulation_result, bundle.population
    snapshot = build_snapshot(simulation_result, population, snapshot_day=SNAPSHOT_DAY)
    return snapshot, bundle


def build_midproject_dataset(seed: int = SEED) -> pd.DataFrame:
    """Build the midproject frame end to end, logging progress to stderr.

    Steps, in order: generate the bundle snapshot, derive features, soft-cap
    expected ACV, assign acquisition waves, rename/select columns, subsample
    to ``SUBSAMPLE_N`` rows, then inject missingness. The same ``seed`` is
    passed to every seeded step.

    Args:
        seed: Seed forwarded to generation and to each seeded pipeline step.

    Returns:
        The frame returned by the final ``inject_missingness`` step.
    """

    def _log(message: str) -> None:
        # Progress goes to stderr so stdout stays free of diagnostics.
        print(message, file=sys.stderr)

    _log("Generating bundle...")
    snapshot, _unused_bundle = generate_bundle(seed=seed)
    raw_rate = snapshot["converted_within_90_days"].mean()
    _log(f" Raw snapshot: {len(snapshot)} rows, conversion={raw_rate:.1%}")

    featured = derive_features(snapshot)
    capped = softcap_expected_acv(featured, seed)
    waved = assign_acquisition_wave(capped, seed)
    selected = rename_and_select(waved)

    _log(f"Subsampling to {SUBSAMPLE_N} rows...")
    sampled = subsample(selected, seed, n=SUBSAMPLE_N)
    _log(f" Subsampled: {len(sampled)} rows, conversion={sampled['converted'].mean():.1%}")

    _log("Injecting missingness...")
    return inject_missingness(sampled, seed)


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------


def main() -> None:
    """CLI entry point: build the dataset and write it as CSV under OUTPUT_DIR.

    Reads the output directory from the first command-line argument, creating
    it (and any parents) if needed. Exits with status 1 and a usage message on
    stderr when no argument is given.
    """
    argv = sys.argv
    if len(argv) <= 1:
        print(f"Usage: {argv[0]} OUTPUT_DIR", file=sys.stderr)
        sys.exit(1)

    target_dir = Path(argv[1])
    target_dir.mkdir(parents=True, exist_ok=True)

    dataset = build_midproject_dataset()

    csv_path = target_dir / "lead_scoring_midproject.csv"
    dataset.to_csv(csv_path, index=False)

    n_rows, n_cols = dataset.shape
    print(f"Midproject: {n_rows} rows x {n_cols} cols → {csv_path}", file=sys.stderr)


if __name__ == "__main__":
main()
137 changes: 137 additions & 0 deletions scripts/quick_baseline_eval_midproject.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""Quick baseline evaluation for the mid-project lead scoring dataset.

Usage:
python scripts/quick_baseline_eval_midproject.py CSV_PATH

Runs LR + RF + GBM baselines, value-aware scoring, and feature importance.
"""

from __future__ import annotations

import sys

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from leadforge.pipelines.common import BINARY_FEATURES, CAT_FEATURES, NUM_FEATURES, TARGET
from leadforge.pipelines.ml import LEAKAGE_PREFIX, build_preprocessor, sanitize_categoricals

# Candidate columns handed to the preprocessor as "numeric" during evaluation:
# the binary features are grouped with the numeric ones rather than with the
# categorical features.
_EVAL_NUM_FEATURES = NUM_FEATURES + BINARY_FEATURES


def _print_banner(title: str) -> None:
    """Print *title* framed by 60-character ``=`` rules, preceded by a blank line."""
    rule = "=" * 60
    print("\n" + rule)
    print(title)
    print(rule)


def _make_pipeline(clf, num_cols: list[str], cat_cols: list[str]):
    """Return a Pipeline chaining a fresh preprocessor with the classifier *clf*."""
    return Pipeline([("pre", build_preprocessor(num_cols, cat_cols)), ("clf", clf)])


def _compare_models(
    x: pd.DataFrame, y: pd.Series, num_cols: list[str], cat_cols: list[str]
) -> None:
    """Print mean/std test AUC of LR, RF and GBM over five stratified 70/30 splits."""
    models = {
        "LR": LogisticRegression(max_iter=1000, solver="lbfgs", random_state=42),
        "RF": RandomForestClassifier(n_estimators=100, random_state=42),
        "GBM": GradientBoostingClassifier(n_estimators=100, random_state=42),
    }
    for name, clf in models.items():
        aucs = []
        for seed in range(42, 47):
            x_tr, x_te, y_tr, y_te = train_test_split(
                x, y, test_size=0.30, random_state=seed, stratify=y
            )
            # Clone so every split trains an unfitted copy of the template model.
            pipe = _make_pipeline(clone(clf), num_cols, cat_cols)
            pipe.fit(x_tr, y_tr)
            aucs.append(roc_auc_score(y_te, pipe.predict_proba(x_te)[:, 1]))
        print(f" {name:4s}: AUC = {np.mean(aucs):.4f} (std={np.std(aucs):.4f})")


def _report_ranking_metrics(y_te: pd.Series, probs: np.ndarray) -> None:
    """Print AUC, PR-AUC, base rate and precision/recall/lift at K=25/50/100."""
    auc = roc_auc_score(y_te, probs)
    pr_auc = average_precision_score(y_te, probs)
    base = y_te.mean()
    print(f" AUC: {auc:.4f}")
    print(f" PR-AUC: {pr_auc:.4f}")
    print(f" Base rate: {base:.1%}")

    # Labels ordered by descending predicted probability.
    y_sorted = y_te.values[np.argsort(-probs)]
    n_positive = y_te.sum()
    for k in [25, 50, 100]:
        if k > len(y_te):
            continue  # cutoff exceeds the test set size; skip it
        top_k = y_sorted[:k]
        prec = top_k.mean()
        rec = top_k.sum() / n_positive
        lift = prec / base
        print(f" P@{k}={prec:.3f} R@{k}={rec:.3f} Lift@{k}={lift:.2f}x")


def _report_value_ranking(
    test_acv: np.ndarray, test_conv: np.ndarray, probs: np.ndarray
) -> None:
    """Compare ACV captured in the top K when ranking by probability vs. expected value.

    Expected value is ``probs * test_acv``; captured ACV counts only rows that
    actually converted. Uplift is reported as 0.0 when the probability ranking
    captured nothing, to avoid dividing by zero.
    """
    ev = probs * test_acv
    realized = test_acv * test_conv
    # Both orderings are independent of K, so compute them once.
    by_prob = np.argsort(-probs)
    by_ev = np.argsort(-ev)
    for k in [25, 50]:
        top_prob = by_prob[:k]
        top_ev = by_ev[:k]
        cap_prob = np.sum(realized[top_prob])
        cap_ev = np.sum(realized[top_ev])
        conv_prob = int(test_conv[top_prob].sum())
        conv_ev = int(test_conv[top_ev].sum())
        uplift = (cap_ev - cap_prob) / cap_prob * 100 if cap_prob > 0 else 0.0
        print(
            f" K={k}: prob=${cap_prob:,.0f} (conv={conv_prob}) "
            f"ev=${cap_ev:,.0f} (conv={conv_ev}) uplift={uplift:+.1f}%"
        )


def _report_feature_importance(
    x_tr: pd.DataFrame, y_tr: pd.Series, num_cols: list[str], cat_cols: list[str]
) -> None:
    """Fit a GBM pipeline on the training split and print its 15 largest importances."""
    gbm_pipe = _make_pipeline(
        GradientBoostingClassifier(n_estimators=100, random_state=42), num_cols, cat_cols
    )
    gbm_pipe.fit(x_tr, y_tr)
    importances = gbm_pipe.named_steps["clf"].feature_importances_
    ohe = gbm_pipe.named_steps["pre"].named_transformers_["cat"].named_steps["encoder"]
    # NOTE(review): assumes the preprocessor emits numeric columns first, then the
    # one-hot encoded categoricals — confirm against build_preprocessor.
    feature_names = num_cols + list(ohe.get_feature_names_out(cat_cols))
    imp_df = pd.DataFrame({"feature": feature_names, "importance": importances})
    top = imp_df.sort_values("importance", ascending=False).head(15)
    for feature, importance in zip(top["feature"], top["importance"]):
        print(f" {feature:40s} {importance:.4f}")


def _report_missingness(df: pd.DataFrame) -> None:
    """Print the count and share of missing values for each column that has any."""
    n_rows = len(df)
    missing = df.isna().sum()
    for col, n_miss in missing[missing > 0].items():
        print(f" {col}: {n_miss} ({n_miss / n_rows:.1%})")


def main() -> None:
    """CLI entry point: evaluate baseline models on the CSV given as argv[1].

    Loads the CSV, excludes any column whose name starts with
    ``LEAKAGE_PREFIX`` from the feature set, then prints in order: a dataset
    summary, a 5-seed LR/RF/GBM AUC comparison, detailed logistic-regression
    metrics on the seed-42 split, value-aware ranking results, GBM feature
    importances, and a per-column missingness summary. Exits with status 1 and
    a usage message on stderr when no path is given.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} CSV_PATH", file=sys.stderr)
        sys.exit(1)

    df = sanitize_categoricals(pd.read_csv(sys.argv[1]), CAT_FEATURES)
    leakage = {c for c in df.columns if c.startswith(LEAKAGE_PREFIX)}
    # Keep only declared features that are present in the file and not leakage.
    cat_cols = [c for c in CAT_FEATURES if c in df.columns and c not in leakage]
    num_cols = [c for c in _EVAL_NUM_FEATURES if c in df.columns and c not in leakage]

    y = df[TARGET].astype(int)
    x = df[cat_cols + num_cols]

    print(f"Dataset: {len(df)} rows, {len(df.columns)} cols")
    print(f"Conversion rate: {y.mean():.1%}")
    print(f"Features: {len(cat_cols)} cat + {len(num_cols)} num = {len(cat_cols) + len(num_cols)}")

    _print_banner("MODEL COMPARISON (5-seed average, 70/30 stratified)")
    _compare_models(x, y, num_cols, cat_cols)

    _print_banner("DETAILED METRICS (seed 42)")
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.30, random_state=42, stratify=y)
    lr_pipe = _make_pipeline(
        LogisticRegression(max_iter=1000, solver="lbfgs", random_state=42), num_cols, cat_cols
    )
    lr_pipe.fit(x_tr, y_tr)
    probs = lr_pipe.predict_proba(x_te)[:, 1]
    _report_ranking_metrics(y_te, probs)

    print("\nValue-aware ranking:")
    # Non-numeric or missing ACV values count as zero value.
    test_acv = pd.to_numeric(df.loc[x_te.index, "expected_acv"], errors="coerce").fillna(0).values
    _report_value_ranking(test_acv, y_te.values, probs)

    print("\nFeature importance (GBM):")
    _report_feature_importance(x_tr, y_tr, num_cols, cat_cols)

    print("\nMissingness summary:")
    _report_missingness(df)


if __name__ == "__main__":
main()
Loading
Loading