diff --git a/.agent-plan.md b/.agent-plan.md index ff7933b..849a996 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -6,11 +6,51 @@ ## Current System State -**v1.0.0 released (2026-05-02).** All milestones (M0–M13) complete. Package version bumped to 1.0.0 in pyproject.toml and leadforge/version.py. README updated with `pip install leadforge`. CHANGELOG consolidated under v1.0.0 heading. +**v1.0.0 released (2026-05-02).** All milestones (M0–M13) complete. Teaching dataset series (v1–v7) approved by consumer. Package version bumped to 1.0.0 in pyproject.toml and leadforge/version.py. --- -## Next Up — v4 Lead Scoring Dataset +## Next Up — Public Kaggle/HuggingFace Release + +First public dataset release: `leadforge-b2b-lead-scoring`. Three difficulty tiers (intro/intermediate/advanced) as full relational bundles + flat CSV convenience exports, plus a research_instructor companion for intermediate. + +### Public release — Phase 1: Dataset card improvement ✓ (in PR) + +- [x] `render_dataset_card()` accepts `table_counts` dict → renders table inventory +- [x] Feature categories section rendered from `LEAD_SNAPSHOT_FEATURES` (category counts, examples, leakage flags) +- [x] `write_bundle()` passes `table_row_counts` to card renderer +- [x] 4 new tests (table inventory with/without counts, feature categories, leakage flags) + +### Public release — Phase 2: Build script + flat CSV ✓ (in PR) + +- [x] `scripts/build_public_release.py` — generates 4 bundles, validates, creates flat CSV exports +- [x] Flat CSV drops `current_stage` (contains terminal stages that encode the label at 90-day horizon) +- [x] All 4 bundles pass `validate_bundle()` + +### Public release — Phase 3: Platform README + HF card ✓ (in PR) + +- [x] `release/README.md` — landing page with directory structure, quick-start snippets, dataset summary, provenance +- [x] `release/HF_DATASET_CARD.md` — YAML frontmatter with configs for each difficulty tier + +### Public release — Phase 4: Baseline notebook ✓ 
(in PR) + +- [x] `release/notebooks/01_baseline_lead_scoring.ipynb` — LR + GBM baselines, P@K, value-aware ranking, feature importance +- [x] Excludes `current_stage` and leakage-flagged columns +- [x] Works from pre-generated Parquet files (no leadforge install needed) + +### Public release — Phase 5: Generate final release + upload (pending) + +- [ ] Run build script, verify SHA-256 hash determinism +- [ ] Upload to Kaggle and HuggingFace +- [ ] Announce + +### Known issue: `current_stage` leakage at 90-day horizon + +The full bundle snapshot includes `current_stage` which at day 90 contains terminal stages (`closed_won`/`closed_lost`). This perfectly encodes the label. The flat CSV export drops it; the Parquet task splits retain it with documentation. A proper fix (windowed snapshot or column redaction in the exposure layer) is deferred. + +--- + +## Previous Focus — v4–v7 Lead Scoring Datasets The primary focus is producing a v4 lead scoring dataset that fixes the issues found in v1–v3 datasets. This requires targeted engine changes + a build pipeline, followed by dataset release. diff --git a/.gitignore b/.gitignore index 064c41a..8347321 100644 --- a/.gitignore +++ b/.gitignore @@ -208,3 +208,11 @@ __marimo__/ # MacOS DS_Store files .DS_Store + +# Generated output bundles +out/ +release/intro/ +release/intermediate/ +release/advanced/ +release/intermediate_instructor/ +release/LICENSE diff --git a/leadforge/api/bundle.py b/leadforge/api/bundle.py index f4c97b4..2b33016 100644 --- a/leadforge/api/bundle.py +++ b/leadforge/api/bundle.py @@ -80,7 +80,9 @@ def write_bundle( # ------------------------------------------------------------------ # 3. 
Dataset card and feature dictionary # ------------------------------------------------------------------ - (root / "dataset_card.md").write_text(render_dataset_card(bundle.spec, task_manifest=task)) + (root / "dataset_card.md").write_text( + render_dataset_card(bundle.spec, task_manifest=task, table_counts=table_row_counts) + ) write_feature_dictionary(root / "feature_dictionary.csv") # ------------------------------------------------------------------ diff --git a/leadforge/narrative/dataset_card.py b/leadforge/narrative/dataset_card.py index 0e9d5b1..aad823e 100644 --- a/leadforge/narrative/dataset_card.py +++ b/leadforge/narrative/dataset_card.py @@ -6,8 +6,11 @@ from __future__ import annotations +from collections import Counter from typing import TYPE_CHECKING +from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES + if TYPE_CHECKING: from leadforge.core.models import WorldSpec from leadforge.schema.tasks import TaskManifest @@ -16,6 +19,7 @@ def render_dataset_card( world_spec: WorldSpec, task_manifest: TaskManifest | None = None, + table_counts: dict[str, int] | None = None, ) -> str: """Return a Markdown dataset card string for *world_spec*. @@ -24,17 +28,18 @@ def render_dataset_card( task_manifest: Optional task manifest whose ``description`` is used as the label definition prose. When ``None`` or when ``description`` is empty, a generic fallback is rendered. + table_counts: Optional mapping of table name → row count. When + provided, the table inventory section renders actual counts + instead of a placeholder. 
- Sections present at all milestones: + Sections: - Header (recipe id, version, seed, exposure mode) - Narrative summary (company, product, market, GTM) - Primary task and label definition - - Suggested use cases - - Caveats - - Sections populated in later milestones (rendered as stubs here): - Table inventory - Feature categories + - Suggested use cases + - Caveats """ cfg = world_spec.config narrative = world_spec.narrative @@ -122,24 +127,47 @@ def render_dataset_card( ] # ------------------------------------------------------------------ - # Table inventory (stub — populated in later milestones) + # Table inventory # ------------------------------------------------------------------ - lines += [ - "## Table inventory", - "", - "*Table counts will appear here once the simulation layer is implemented (v0.3.0+).*", - "", - ] + lines += ["## Table inventory", ""] + if table_counts is not None: + lines += [ + "| Table | Rows |", + "|---|---:|", + ] + for tbl, count in table_counts.items(): + lines.append(f"| {tbl} | {count:,} |") + lines.append("") + else: + lines += [ + "*Table counts not available (pass ``table_counts`` to populate).*", + "", + ] # ------------------------------------------------------------------ - # Feature categories (stub) + # Feature categories # ------------------------------------------------------------------ + lines += ["## Feature categories", ""] + category_counts: Counter[str] = Counter() + for feat in LEAD_SNAPSHOT_FEATURES: + category_counts[feat.category] += 1 lines += [ - "## Feature categories", - "", - "*Feature dictionary will appear here once the schema layer is implemented (v0.3.0+).*", - "", + "| Category | Count | Examples |", + "|---|---:|---|", ] + for cat, count in category_counts.items(): + examples = [ + f.name for f in LEAD_SNAPSHOT_FEATURES if f.category == cat and not f.is_target + ][:3] + lines.append(f"| {cat} | {count} | {', '.join(examples)} |") + leakage_cols = [f.name for f in LEAD_SNAPSHOT_FEATURES if 
f.leakage_risk] + if leakage_cols: + lines += [ + "", + f"**Leakage-flagged columns:** {', '.join(f'`{c}`' for c in leakage_cols)}. " + "See `feature_dictionary.csv` for details.", + ] + lines.append("") # ------------------------------------------------------------------ # Suggested use cases diff --git a/leadforge/schema/features.py b/leadforge/schema/features.py index 8dadaf7..41aab34 100644 --- a/leadforge/schema/features.py +++ b/leadforge/schema/features.py @@ -116,8 +116,12 @@ class FeatureSpec: FeatureSpec( "current_stage", "string", - "Funnel stage at snapshot anchor date.", + "Funnel stage at snapshot anchor date. WARNING: at full-horizon " + "(90-day) snapshots this contains terminal stages (closed_won / " + "closed_lost) that encode the label. Exclude from modeling or use " + "a windowed snapshot.", "lead_meta", + leakage_risk=True, ), FeatureSpec( "is_mql", diff --git a/release/HF_DATASET_CARD.md b/release/HF_DATASET_CARD.md new file mode 100644 index 0000000..529ef69 --- /dev/null +++ b/release/HF_DATASET_CARD.md @@ -0,0 +1,104 @@ +--- +language: + - en +license: mit +task_categories: + - tabular-classification +tags: + - lead-scoring + - b2b + - crm + - synthetic + - relational + - sales + - funnel + - binary-classification + - reproducible +size_categories: + - 1K-10K +configs: + - config_name: intro + data_files: + - split: train + path: intro/tasks/converted_within_90_days/train.parquet + - split: validation + path: intro/tasks/converted_within_90_days/valid.parquet + - split: test + path: intro/tasks/converted_within_90_days/test.parquet + - config_name: intermediate + data_files: + - split: train + path: intermediate/tasks/converted_within_90_days/train.parquet + - split: validation + path: intermediate/tasks/converted_within_90_days/valid.parquet + - split: test + path: intermediate/tasks/converted_within_90_days/test.parquet + - config_name: advanced + data_files: + - split: train + path: advanced/tasks/converted_within_90_days/train.parquet + 
- split: validation + path: advanced/tasks/converted_within_90_days/valid.parquet + - split: test + path: advanced/tasks/converted_within_90_days/test.parquet +--- + +# LeadForge: Synthetic B2B Lead Scoring Dataset + +A relational, reproducible, multi-difficulty lead scoring dataset generated by [leadforge](https://github.com/leadforge-dev/leadforge) -- an open-source Python framework for synthetic CRM/funnel data. + +## Why this dataset? + +1. **Relational structure.** 9 normalized tables plus ML-ready task splits. Practice feature engineering from raw tables, or grab the flat file and start modeling. +2. **Three difficulty tiers.** Same world, different signal-to-noise ratios. +3. **Reproducible and leakage-safe.** Deterministic generation (seed 42), SHA-256 hashes, explicit leakage trap. + +## Quick start + +```python +from datasets import load_dataset + +# Load intermediate difficulty +ds = load_dataset("leadforge/leadforge-b2b-lead-scoring", name="intermediate") +train = ds["train"].to_pandas() +valid = ds["validation"].to_pandas() # Note: file is valid.parquet, split name is "validation" +test = ds["test"].to_pandas() +``` + +Or use the flat CSV: + +```python +import pandas as pd +df = pd.read_csv("hf://datasets/leadforge/leadforge-b2b-lead-scoring/intermediate/lead_scoring.csv") +``` + +## Dataset summary + +| | Intro | Intermediate | Advanced | +|---|---|---|---| +| Leads | 5,000 | 5,000 | 5,000 | +| Features | 35 | 35 | 35 | +| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` | +| Signal strength | 0.90 | 0.70 | 0.50 | +| Noise scale | 0.10 | 0.30 | 0.55 | +| Missing rate | 2% | 8% | 18% | + +## The scenario + +**Veridian Technologies** sells cloud procurement automation to mid-market firms (200-2,000 employees). Sales channels: inbound (45%), SDR outbound (35%), partner referrals (20%). Four buyer personas. **Task:** predict conversion within 90 days. 
+ +## Relational tables + +Each difficulty tier includes 9 Parquet tables under `tables/`: accounts, contacts, leads, touches, sessions, sales_activities, opportunities, customers, subscriptions. These form a normalized CRM schema linked by foreign keys. + +## Leakage trap + +`total_touches_all` counts touches over the full 90-day window including post-snapshot events. Flagged as `leakage_risk=True` in `feature_dictionary.csv`. `current_stage` is flagged as well: at the 90-day horizon it contains terminal stages (`closed_won`/`closed_lost`) that encode the label, so exclude it when modeling from the Parquet splits. + +## Research companion + +`intermediate_instructor/` includes the full causal structure: world graph (DAG), latent trait registry, and mechanism assignments. + +## Provenance + +Generated by [leadforge](https://github.com/leadforge-dev/leadforge) v1.0.0, recipe `b2b_saas_procurement_v1`, seed 42. MIT license. See `manifest.json` in each bundle for SHA-256 hashes. diff --git a/release/README.md b/release/README.md new file mode 100644 index 0000000..0895879 --- /dev/null +++ b/release/README.md @@ -0,0 +1,173 @@ +# LeadForge: Synthetic B2B Lead Scoring Dataset + +A relational, reproducible, multi-difficulty lead scoring dataset generated by [leadforge](https://github.com/leadforge-dev/leadforge) -- an open-source Python framework for synthetic CRM/funnel data. + +## Why this dataset? + +Most public lead scoring datasets are flat CSVs with opaque provenance. This one is different: + +1. **Relational structure.** 9 normalized tables (accounts, contacts, leads, touches, sessions, sales activities, opportunities, customers, subscriptions) plus ML-ready task splits. Practice feature engineering from raw tables, or grab the flat file and start modeling. + +2. **Three difficulty tiers.** Same company, same product, same buyer personas -- different difficulty profiles. Each tier declares different signal strength, noise, and missingness parameters in its manifest. (See [Known limitations](#known-limitations) for current status.) + +3. **Reproducible and leakage-safe.** Deterministic generation from a fixed seed. 
SHA-256 hashes for every file in `manifest.json`. Leakage-prone columns (`total_touches_all`, `current_stage`) are explicitly flagged in the feature dictionary. All other features are anchored at the snapshot date -- once the flagged columns are excluded, no post-cutoff data leaks in. + +## What's inside + +``` +release/ +|-- README.md # This file +|-- LICENSE # MIT +|-- intro/ # Difficulty tier 1 +| |-- manifest.json # Provenance: seed, recipe, version, file hashes +| |-- dataset_card.md # Human-readable dataset summary +| |-- feature_dictionary.csv # Column descriptions, types, leakage flags +| |-- lead_scoring.csv # Flat convenience file (all splits + split column) +| |-- tables/ # 9 relational Parquet tables +| | |-- accounts.parquet +| | |-- contacts.parquet +| | |-- leads.parquet +| | |-- touches.parquet +| | |-- sessions.parquet +| | |-- sales_activities.parquet +| | |-- opportunities.parquet +| | |-- customers.parquet +| | |-- subscriptions.parquet +| |-- tasks/converted_within_90_days/ # Pre-split ML task +| |-- train.parquet # 70% of leads +| |-- valid.parquet # 15% of leads +| |-- test.parquet # 15% of leads +|-- intermediate/ # Difficulty tier 2 (same structure) +|-- advanced/ # Difficulty tier 3 (same structure) +|-- intermediate_instructor/ # Research companion (adds metadata/) +| |-- metadata/ # Hidden causal structure +| |-- graph.json # World graph (DAG) +| |-- graph.graphml # World graph (GraphML) +| |-- world_spec.json # Full generation config +| |-- latent_registry.json # Per-entity latent trait values +| |-- mechanism_summary.json # Causal mechanism assignments +|-- notebooks/ + |-- 01_baseline_lead_scoring.ipynb # Baseline modeling walkthrough +``` + +## Quick start + +### Option 1: Flat CSV (simplest) + +```python +import pandas as pd + +df = pd.read_csv("intermediate/lead_scoring.csv") +train = df[df["split"] == "train"].drop(columns=["split"]) +test = df[df["split"] == "test"].drop(columns=["split"]) +``` + +### Option 2: Parquet task splits (recommended) + +```python +import pandas 
as pd + +train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet") +test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet") +``` + +**Note:** The Parquet files contain `current_stage` and `total_touches_all`, both flagged as `leakage_risk` in `feature_dictionary.csv`. Exclude them from your feature set. The flat CSV (`lead_scoring.csv`) has `current_stage` pre-removed but still contains `total_touches_all`, so exclude that column there as well. + +### Option 3: Relational tables (feature engineering) + +```python +import pandas as pd + +accounts = pd.read_parquet("intermediate/tables/accounts.parquet") +leads = pd.read_parquet("intermediate/tables/leads.parquet") +touches = pd.read_parquet("intermediate/tables/touches.parquet") + +# Engineer your own features from raw event tables +touch_counts = touches.groupby("lead_id").size().rename("my_touch_count") +features = leads.merge(accounts, on="account_id").merge(touch_counts, on="lead_id", how="left") +``` + +### Option 4: Reproduce from source + +```bash +pip install leadforge +leadforge generate \ + --recipe b2b_saas_procurement_v1 \ + --seed 42 \ + --mode student_public \ + --difficulty intermediate \ + --out my_bundle +``` + +## Dataset summary + +| | Intro | Intermediate | Advanced | +|---|---|---|---| +| Leads | 5,000 | 5,000 | 5,000 | +| Accounts | 1,500 | 1,500 | 1,500 | +| Contacts | 4,200 | 4,200 | 4,200 | +| Columns | 35 (34 features + 1 target) | 35 | 35 | +| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` | +| Signal strength | 0.90 | 0.70 | 0.50 | +| Noise scale | 0.10 | 0.30 | 0.55 | +| Missing rate | 2% | 8% | 18% | + +## The scenario + +**Veridian Technologies** is a Series B startup (Austin, US) selling **Veridian Procure**, a cloud-based procurement and AP automation platform, to mid-market firms (200-2,000 employees) in the US and UK. + +The sales funnel runs through inbound marketing (45%), SDR outbound (35%), and partner referrals (20%). 
Four buyer personas drive deals: VP Finance (economic buyer), AP Manager (champion), IT Director (technical evaluator), and Procurement Manager (end user). + +**Task:** predict whether a lead will convert (closed-won) within 90 days of entering the funnel. + +## Feature dictionary + +34 features + 1 target across 6 categories: + +| Category | Count | Examples | +|---|---|---| +| Account | 6 | `industry`, `region`, `employee_band`, `estimated_revenue_band` | +| Contact | 4 | `role_function`, `seniority`, `buyer_role` | +| Lead metadata | 7 | `lead_source`, `first_touch_channel`, `is_mql`, `is_sql` | +| Engagement | 11 | `touch_count`, `session_count`, `pricing_page_views`, `touches_week_1` | +| Sales | 6 | `activity_count`, `opportunity_created`, `expected_acv` | +| Target | 1 | `converted_within_90_days` | + +See `feature_dictionary.csv` in each bundle for full descriptions and dtypes. + +**Leakage-flagged columns** (marked `leakage_risk=True` in the feature dictionary): + +- `total_touches_all` -- counts touches over the full 90-day window, including post-snapshot events. Can you spot why this leaks? +- `current_stage` -- at the 90-day horizon, contains terminal stages (`closed_won`/`closed_lost`) that encode the label directly. + +Only `current_stage` is dropped from the flat CSV (`lead_scoring.csv`); `total_touches_all` remains there, so exclude it from your feature set. If you load the Parquet task splits directly, exclude both columns. + +## Research companion + +The `intermediate_instructor/` bundle includes the full hidden causal structure: + +- **World graph:** The DAG of causal relationships driving lead outcomes +- **Latent registry:** Per-entity latent trait values (account fit, contact authority, engagement propensity) +- **Mechanism summary:** How each node in the graph maps to simulation behavior + +This enables research on causal inference, model interpretability, and DGP-aware evaluation. 
+ +## Known limitations + +- **Difficulty tiers share the same conversion rate.** The simulation engine does not yet modulate conversion rates by difficulty profile. All three tiers produce similar base rates (~70%). The difficulty profiles are declared in each bundle's manifest and will produce meaningfully different signal-to-noise ratios once the engine is updated. For now, the primary difference between tiers is the declared profile metadata. + +## Provenance + +| Field | Value | +|---|---| +| Generator | [leadforge](https://github.com/leadforge-dev/leadforge) v1.0.0 | +| Recipe | `b2b_saas_procurement_v1` | +| Seed | 42 | +| Format | Parquet + CSV | +| License | MIT | + +Every bundle includes a `manifest.json` with the exact package version, recipe, seed, generation timestamp, and SHA-256 hashes for all data files. To verify integrity or regenerate, install leadforge and run the generation command above. + +## License + +MIT. See [LICENSE](LICENSE). diff --git a/release/notebooks/01_baseline_lead_scoring.ipynb b/release/notebooks/01_baseline_lead_scoring.ipynb new file mode 100644 index 0000000..8861584 --- /dev/null +++ b/release/notebooks/01_baseline_lead_scoring.ipynb @@ -0,0 +1,368 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Baseline Lead Scoring Models\n", + "\n", + "This notebook trains baseline models on the **LeadForge B2B Lead Scoring** dataset.\n", + "It works directly from the pre-generated Parquet files -- no `leadforge` installation required.\n", + "\n", + "We'll cover:\n", + "1. Loading the task splits\n", + "2. Exploring the features\n", + "3. Training Logistic Regression and Gradient Boosting baselines\n", + "4. Evaluating with AUC, PR-AUC, and Precision@K\n", + "5. Value-aware ranking (probability vs. expected value)\n", + "6. Feature importance\n", + "\n", + "**Requirements:** `pandas`, `scikit-learn`, `matplotlib` (all available in Kaggle notebooks by default)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load the data\n", + "\n", + "We use the `intermediate` difficulty tier. Change the path to `intro/` or `advanced/` to try other tiers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# Adjust this path to your dataset location\n", + "BUNDLE = \"../intermediate\"\n", + "TASK = \"converted_within_90_days\"\n", + "\n", + "train = pd.read_parquet(f\"{BUNDLE}/tasks/{TASK}/train.parquet\")\n", + "valid = pd.read_parquet(f\"{BUNDLE}/tasks/{TASK}/valid.parquet\")\n", + "test = pd.read_parquet(f\"{BUNDLE}/tasks/{TASK}/test.parquet\")\n", + "\n", + "print(f\"Train: {len(train):,} rows\")\n", + "print(f\"Valid: {len(valid):,} rows\")\n", + "print(f\"Test: {len(test):,} rows\")\n", + "print(\"\\nConversion rates:\")\n", + "for name, df in [(\"train\", train), (\"valid\", valid), (\"test\", test)]:\n", + " print(f\" {name}: {df[TASK].mean():.1%}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature dictionary\n", + "feat_dict = pd.read_csv(f\"{BUNDLE}/feature_dictionary.csv\")\n", + "feat_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Explore the features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Identify feature types\n", + "TARGET = TASK\n", + "ID_COLS = [\"account_id\", \"contact_id\", \"lead_id\", \"lead_created_at\"]\n", + "LEAKAGE_COLS = [c for c in train.columns if feat_dict[feat_dict[\"name\"] == c][\"leakage_risk\"].any()]\n", + "\n", + "print(f\"Leakage-flagged columns (excluded): {LEAKAGE_COLS}\")\n", + "\n", + "feature_cols = [c for c in train.columns if c not in ID_COLS + [TARGET] + LEAKAGE_COLS]\n", + "cat_cols = [c for c in feature_cols if train[c].dtype == \"string\" or train[c].dtype == \"object\"]\n", + "bool_cols = [c for c in feature_cols if train[c].dtype == \"boolean\"]\n", + "num_cols = [c for c in feature_cols if c not in cat_cols + bool_cols]\n", + "\n", + "print(f\"\\nCategorical: {len(cat_cols)} -- {cat_cols}\")\n", + "print(f\"Boolean: {len(bool_cols)} -- {bool_cols}\")\n", + "print(f\"Numeric: {len(num_cols)} -- {num_cols}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Missing values\n", + "missing = train[feature_cols].isnull().sum()\n", + "missing = missing[missing > 0].sort_values(ascending=False)\n", + "if len(missing) > 0:\n", + " print(\"Missing values in training set:\")\n", + " for col, count in missing.items():\n", + " print(f\" {col}: {count} ({count / len(train):.1%})\")\n", + "else:\n", + " print(\"No missing values in training set.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Summary statistics for numeric features\n", + "train[num_cols].describe().T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Build preprocessing pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "\n", + "\n", + "# Convert boolean columns to int for sklearn\n", + "def prep_df(df):\n", + " out = df[feature_cols].copy()\n", + " for c in bool_cols:\n", + " out[c] = out[c].astype(\"Int64\")\n", + " return out\n", + "\n", + "\n", + "numeric_features = num_cols + bool_cols\n", + "categorical_features = cat_cols\n", + "\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\n", + " \"num\",\n", + " Pipeline(\n", + " [\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler()),\n", + " ]\n", + " ),\n", + " numeric_features,\n", + " ),\n", + " (\n", + " \"cat\",\n", + " Pipeline(\n", + " [\n", + " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", + " (\"encoder\", OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)),\n", + " ]\n", + " ),\n", + " categorical_features,\n", + " ),\n", + " ]\n", + ")\n", + "\n", + "X_train = prep_df(train)\n", + "y_train = train[TARGET].astype(int)\n", + "X_test = prep_df(test)\n", + "y_test = test[TARGET].astype(int)\n", + "\n", + "print(f\"X_train: {X_train.shape}, X_test: {X_test.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Train baselines and evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import GradientBoostingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import average_precision_score, roc_auc_score\n", + "\n", + "models = {\n", + " \"Logistic Regression\": LogisticRegression(max_iter=1000, solver=\"lbfgs\", random_state=42),\n", + " \"Gradient Boosting\": GradientBoostingClassifier(n_estimators=200, random_state=42),\n", + "}\n", + "\n", + "results = []\n", + "fitted_models = {}\n", + "\n", + "for name, model in models.items():\n", + " pipe = Pipeline([(\"preprocess\", preprocessor), (\"model\", model)])\n", + " pipe.fit(X_train, y_train)\n", + " y_prob = pipe.predict_proba(X_test)[:, 1]\n", + "\n", + " auc = roc_auc_score(y_test, y_prob)\n", + " pr_auc = average_precision_score(y_test, y_prob)\n", + "\n", + " # Precision@K\n", + " for k in [25, 50, 100]:\n", + " top_k_idx = np.argsort(-y_prob)[:k]\n", + " p_at_k = y_test.iloc[top_k_idx].mean()\n", + " base_rate = y_test.mean()\n", + " lift = p_at_k / base_rate\n", + " results.append(\n", + " {\n", + " \"Model\": name,\n", + " \"Metric\": f\"P@{k}\",\n", + " \"Value\": f\"{p_at_k:.3f}\",\n", + " \"Lift\": f\"{lift:.2f}x\",\n", + " }\n", + " )\n", + "\n", + " results.append({\"Model\": name, \"Metric\": \"ROC-AUC\", \"Value\": f\"{auc:.3f}\", \"Lift\": \"\"})\n", + " results.append({\"Model\": name, \"Metric\": \"PR-AUC\", \"Value\": f\"{pr_auc:.3f}\", \"Lift\": \"\"})\n", + " fitted_models[name] = pipe\n", + " print(f\"{name}: AUC={auc:.3f}, PR-AUC={pr_auc:.3f}\")\n", + "\n", + "pd.DataFrame(results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Value-aware ranking\n", + "\n", + "When deals have different sizes (`expected_acv`), ranking by probability alone leaves money on the table.\n", + "Ranking by expected value (P(convert) x ACV) captures more revenue in the top-K." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if \"expected_acv\" in test.columns:\n", + " best_pipe = fitted_models[\"Gradient Boosting\"]\n", + " y_prob = best_pipe.predict_proba(X_test)[:, 1]\n", + " acv = test[\"expected_acv\"].fillna(test[\"expected_acv\"].median()).values\n", + " ev = y_prob * acv\n", + "\n", + " true_acv = test[\"expected_acv\"].fillna(0).values\n", + " converted = y_test.values.astype(bool)\n", + "\n", + " for k in [25, 50, 100]:\n", + " # Probability ranking\n", + " prob_top_k = np.argsort(-y_prob)[:k]\n", + " prob_acv = true_acv[prob_top_k][converted[prob_top_k]].sum()\n", + "\n", + " # EV ranking\n", + " ev_top_k = np.argsort(-ev)[:k]\n", + " ev_acv = true_acv[ev_top_k][converted[ev_top_k]].sum()\n", + "\n", + " uplift = (ev_acv - prob_acv) / prob_acv * 100 if prob_acv > 0 else 0\n", + " print(\n", + " f\"K={k}: Prob ranking ${prob_acv:,.0f} | \"\n", + " f\"EV ranking ${ev_acv:,.0f} | Uplift: {uplift:+.1f}%\"\n", + " )\n", + "else:\n", + " print(\"No expected_acv column found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Feature importance (Gradient Boosting)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "gbm_pipe = fitted_models[\"Gradient Boosting\"]\n", + "gbm_model = gbm_pipe.named_steps[\"model\"]\n", + "preproc = gbm_pipe.named_steps[\"preprocess\"]\n", + "\n", + "# Get feature names after encoding\n", + "num_names = numeric_features\n", + "cat_encoder = preproc.named_transformers_[\"cat\"].named_steps[\"encoder\"]\n", + "cat_names = list(cat_encoder.get_feature_names_out(categorical_features))\n", + "all_names = num_names + cat_names\n", + "\n", + "importances = gbm_model.feature_importances_\n", + "feat_imp = pd.Series(importances, index=all_names).sort_values(ascending=False)\n", + "\n", + "top_n = 15\n", + "fig, ax = plt.subplots(figsize=(8, 5))\n", + "feat_imp.head(top_n).plot.barh(ax=ax)\n", + "ax.set_xlabel(\"Importance\")\n", + "ax.set_title(f\"Top {top_n} Features (Gradient Boosting)\")\n", + "ax.invert_yaxis()\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(f\"\\nTop {top_n} features:\")\n", + "for name, imp in feat_imp.head(top_n).items():\n", + " print(f\" {name}: {imp:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Try other difficulty tiers\n", + "\n", + "Change `BUNDLE` at the top of this notebook to point to `intro/` or `advanced/` and re-run all cells.\n", + "You should see:\n", + "- **Intro:** Higher AUC (cleaner signal, ~2% missing values)\n", + "- **Intermediate:** Moderate AUC (~8% missing values, more noise)\n", + "- **Advanced:** Lower AUC (~18% missing values, much noisier)\n", + "\n", + "## Explore the relational tables\n", + "\n", + "The flat task splits are derived from 9 relational tables under `tables/`. 
You can engineer your own features:\n", + "\n", + "```python\n", + "touches = pd.read_parquet(f\"{BUNDLE}/tables/touches.parquet\")\n", + "sessions = pd.read_parquet(f\"{BUNDLE}/tables/sessions.parquet\")\n", + "# ... join, aggregate, and build features from raw events\n", + "```\n", + "\n", + "See the [leadforge README](https://github.com/leadforge-dev/leadforge) for more details." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/scripts/build_public_release.py b/scripts/build_public_release.py new file mode 100644 index 0000000..e348453 --- /dev/null +++ b/scripts/build_public_release.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +"""Build the public release bundles for Kaggle/HuggingFace. + +Usage: + python scripts/build_public_release.py [OUTPUT_DIR] + +Generates four bundles: +- intro/ (student_public, intro difficulty) +- intermediate/ (student_public, intermediate difficulty) +- advanced/ (student_public, advanced difficulty) +- intermediate_instructor/ (research_instructor, intermediate difficulty) + +Each student_public bundle also gets a flat CSV convenience export +(lead_scoring.csv) merging train/valid/test with a ``split`` column. + +All bundles are validated with ``leadforge validate`` after generation. 
SEED = 42
RECIPE = "b2b_saas_procurement_v1"

# (directory_name, exposure_mode, difficulty)
BUNDLES = [
    ("intro", "student_public", "intro"),
    ("intermediate", "student_public", "intermediate"),
    ("advanced", "student_public", "advanced"),
    ("intermediate_instructor", "research_instructor", "intermediate"),
]

# Task directory under ``tasks/`` whose splits are exported and summarised,
# the label column read from its train split, and the split file stems.
_TASK_NAME = "converted_within_90_days"
_LABEL_COLUMN = "converted_within_90_days"
_SPLIT_NAMES = ("train", "valid", "test")


def generate_and_save(
    out_dir: Path,
    exposure_mode: str,
    difficulty: str,
    seed: int = SEED,
) -> None:
    """Generate a bundle and write it to *out_dir*.

    Builds a ``Generator`` from ``RECIPE`` with the given *seed*,
    *exposure_mode* and *difficulty*, generates a bundle, and saves it to
    *out_dir* (passed to ``bundle.save`` as a string).
    """
    gen = Generator.from_recipe(
        RECIPE,
        seed=seed,
        exposure_mode=exposure_mode,
        difficulty=difficulty,
    )
    bundle = gen.generate()
    bundle.save(str(out_dir))


# Columns to drop from the flat CSV convenience export.
# current_stage at the 90-day horizon contains terminal stages (closed_won /
# closed_lost) that perfectly encode the label — it is leakage.  The column
# remains in the Parquet task splits for completeness but must be excluded
# from modeling.  The flat CSV drops it to prevent accidental misuse.
_FLAT_CSV_DROP_COLS = {"current_stage"}


def write_flat_csv(bundle_dir: Path) -> Path:
    """Merge task splits into a single CSV with a ``split`` column.

    Reads ``tasks/converted_within_90_days/{train,valid,test}.parquet`` under
    *bundle_dir* (splits whose file is absent are skipped), prepends a
    ``split`` column naming the source split, concatenates the frames, and
    writes ``lead_scoring.csv`` into *bundle_dir*.

    Drops columns listed in ``_FLAT_CSV_DROP_COLS`` to prevent accidental
    leakage in the convenience export.

    Returns:
        Path of the CSV file that was written.

    Raises:
        ValueError: If none of the split Parquet files exist.
    """
    task_dir = bundle_dir / "tasks" / _TASK_NAME
    frames = []
    for split_name in _SPLIT_NAMES:
        path = task_dir / f"{split_name}.parquet"
        if not path.exists():
            continue
        df = pd.read_parquet(path)
        df.insert(0, "split", split_name)
        frames.append(df)

    # pd.concat([]) fails with "No objects to concatenate", which says nothing
    # about the cause; fail with the same exception type but name the directory.
    if not frames:
        raise ValueError(f"No task splits found under {task_dir}; cannot build flat CSV.")

    merged = pd.concat(frames, ignore_index=True)
    # errors="ignore" skips drop-list columns that are absent from the splits.
    merged = merged.drop(columns=sorted(_FLAT_CSV_DROP_COLS), errors="ignore")
    csv_path = bundle_dir / "lead_scoring.csv"
    merged.to_csv(csv_path, index=False)
    return csv_path


def print_summary(bundle_dir: Path, name: str) -> None:
    """Print row counts and conversion rate for a bundle.

    Reads ``manifest.json`` from *bundle_dir* and prints, to stdout, one line
    with ``table=row_count`` pairs and one line with the total task rows
    (sum of the manifest's ``train_rows``/``valid_rows``/``test_rows`` entries,
    each defaulting to 0).  When the train split Parquet exists, the mean of
    its label column is appended as ``train_conversion``.
    """
    manifest_path = bundle_dir / "manifest.json"
    # Explicit encoding: a bare open() decodes with the platform locale default.
    with open(manifest_path, encoding="utf-8") as f:
        manifest = json.load(f)

    table_summary = ", ".join(f"{t}={info['row_count']}" for t, info in manifest["tables"].items())
    task_info = manifest["tasks"].get(_TASK_NAME, {})
    total_task_rows = sum(task_info.get(f"{s}_rows", 0) for s in _SPLIT_NAMES)

    # Compute conversion rate from the train split Parquet (avoid re-reading CSV).
    conv_str = ""
    train_path = bundle_dir / "tasks" / _TASK_NAME / "train.parquet"
    if train_path.exists():
        train_df = pd.read_parquet(train_path, columns=[_LABEL_COLUMN])
        rate = train_df[_LABEL_COLUMN].mean()
        conv_str = f", train_conversion={rate:.1%}"

    print(f" {name}: {table_summary}")
    print(f" task rows={total_task_rows}{conv_str}")


def main() -> None:
    """Build every bundle in ``BUNDLES``, validate each, and print a summary.

    The output root is ``sys.argv[1]`` when given, otherwise ``release``.
    Progress messages go to stderr; the final summary goes to stdout.  The
    process exits with status 1 as soon as a bundle reports validation errors.
    """
    output_root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("release")
    output_root.mkdir(parents=True, exist_ok=True)

    # Copy LICENSE from the directory above this script's directory, if present.
    license_src = Path(__file__).resolve().parent.parent / "LICENSE"
    if license_src.exists():
        shutil.copy2(license_src, output_root / "LICENSE")

    for dir_name, exposure_mode, difficulty in BUNDLES:
        bundle_dir = output_root / dir_name
        print(f"Generating {dir_name} ({exposure_mode}, {difficulty})...", file=sys.stderr)
        generate_and_save(bundle_dir, exposure_mode, difficulty)

        # Flat CSV for student_public bundles
        if exposure_mode == "student_public":
            csv_path = write_flat_csv(bundle_dir)
            print(f" Flat CSV: {csv_path}", file=sys.stderr)

        # Validate
        print(f" Validating {dir_name}...", file=sys.stderr)
        errors = validate_bundle(bundle_dir)
        if errors:
            print(f" FAIL: {len(errors)} error(s):", file=sys.stderr)
            for e in errors:
                print(f" - {e}", file=sys.stderr)
            sys.exit(1)
        print(f" OK: {dir_name} passed validation.", file=sys.stderr)

    # Summary
    print("\n=== Release Summary ===")
    for dir_name, _, _ in BUNDLES:
        print_summary(output_root / dir_name, dir_name)

    print(f"\nRelease directory: {output_root.resolve()}")


if __name__ == "__main__":
    main()
# ---------------------------------------------------------------------------
# Table inventory
# ---------------------------------------------------------------------------


def test_card_table_inventory_with_counts() -> None:
    """Supplying table_counts yields thousands-separated row-count rows."""
    row_counts = {"accounts": 1500, "contacts": 4200, "leads": 5000}
    rendered = render_dataset_card(_make_world_spec(), table_counts=row_counts)
    for expected_row in ("| accounts | 1,500 |", "| leads | 5,000 |"):
        assert expected_row in rendered


def test_card_table_inventory_without_counts() -> None:
    """Omitting table_counts makes the card fall back to a placeholder."""
    rendered = render_dataset_card(_make_world_spec())
    lowered = rendered.lower()
    assert "not available" in lowered


def test_card_table_inventory_empty_dict_renders_empty_table() -> None:
    """An empty mapping still renders the table header rather than the placeholder."""
    rendered = render_dataset_card(_make_world_spec(), table_counts={})
    assert "| Table | Rows |" in rendered
    lowered = rendered.lower()
    assert "not available" not in lowered


# ---------------------------------------------------------------------------
# Feature categories
# ---------------------------------------------------------------------------


def test_card_feature_categories_rendered() -> None:
    """The feature-category table and its category names appear in every card."""
    rendered = render_dataset_card(_make_world_spec())
    assert "| Category | Count | Examples |" in rendered
    for category in ("account", "engagement", "sales", "target"):
        assert category in rendered


def test_card_leakage_flagged_columns() -> None:
    """Columns flagged as leakage are called out in the feature categories section."""
    rendered = render_dataset_card(_make_world_spec())
    for fragment in ("`total_touches_all`", "`current_stage`", "Leakage-flagged"):
        assert fragment in rendered