diff --git a/leadforge/narrative/dataset_card.py b/leadforge/narrative/dataset_card.py
index ea265cd..7363119 100644
--- a/leadforge/narrative/dataset_card.py
+++ b/leadforge/narrative/dataset_card.py
@@ -1,7 +1,10 @@
 """Dataset card renderer.
 
 Produces the ``dataset_card.md`` artifact from a :class:`WorldSpec`.
-The card follows the structure required by the architecture spec (§14.3).
+The card is written for a data scientist with no prior leadforge knowledge —
+it opens with a plain-English explanation of what the dataset is, what the
+prediction task is, and what the difficulty tier means before getting into
+technical metadata.
 """
 
 from __future__ import annotations
@@ -16,6 +19,38 @@
     from leadforge.schema.tasks import TaskManifest
 
 
+# Tier descriptions for the plain-English "this tier" callout.
+# Keys match the ``difficulty`` values used in difficulty_profiles.yaml.
+_TIER_DESCRIPTIONS: dict[str, str] = {
+    "intro": (
+        "The **intro tier is the easiest version of this task.** Signal is strong, "
+        "conversion rate is high, and missing values are minimal. A simple logistic "
+        "regression is competitive. Use this tier to prototype your pipeline and "
+        "sanity-check your approach before scaling up difficulty."
+    ),
+    "intermediate": (
+        "The **intermediate tier is the default benchmark.** Conversion rate is more "
+        "realistic for B2B SaaS than the intro tier, and noise is moderate enough "
+        "that feature engineering starts to matter. GBM does not consistently beat "
+        "logistic regression here (the snapshot is dominated by near-linear features). "
+        "Calibration becomes important at this prevalence level."
+    ),
+    "advanced": (
+        "The **advanced tier is a calibration and rare-event exercise.** Conversion "
+        "rate is low (~8%) and noise is heavy. AUC barely moves across tiers by "
+        "design; here you will want average precision, P@K, and value-weighted "
+        "ranking (``expected_acv × P(convert)``) to measure what matters. "
+        "Calibration is harder in this tier: a miscalibrated model can rank "
+        "correctly but still predict systematically wrong probabilities."
+    ),
+}
+
+_TIER_DESCRIPTION_FALLBACK = (
+    "See the difficulty profile YAML for signal_strength, noise_scale, "
+    "and missing_rate knobs for this tier."
+)
+
+
 def render_dataset_card(
     world_spec: WorldSpec,
     task_manifest: TaskManifest | None = None,
@@ -24,6 +59,10 @@ def render_dataset_card(
 ) -> str:
     """Return a Markdown dataset card string for *world_spec*.
 
+    The card is structured for a zero-prior-knowledge data scientist:
+    it opens with what the dataset is and what you are predicting, then
+    has a per-tier callout, then the technical inventory.
+
     Args:
         world_spec: The world specification containing config and narrative.
         task_manifest: Optional task manifest whose ``description`` is used
@@ -38,103 +77,88 @@ def render_dataset_card(
             describes only what is actually present.
 
     Sections:
-    - Header (recipe id, version, seed, exposure mode)
-    - Narrative summary (company, product, market, GTM)
-    - Primary task and label definition
+    - Title (tier-aware)
+    - What is this / what you are predicting
+    - This tier at a glance (difficulty callout)
     - Table inventory
     - Feature categories
-    - Suggested use cases
+    - Simulated world (company / product / market)
+    - How to load (Python snippet)
+    - Reproducibility (recipe, seed, version), then intended uses
     - Caveats
     """
     cfg = world_spec.config
     narrative = world_spec.narrative
+    difficulty = str(cfg.difficulty) if cfg.difficulty else "unknown"
 
     lines: list[str] = []
 
     # ------------------------------------------------------------------
-    # Header
+    # Title — tier-aware
     # ------------------------------------------------------------------
-    snapshot_label = (
-        f"{cfg.snapshot_day} days (windowed)"
-        if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days
-        else f"{cfg.horizon_days} days (full horizon)"
-    )
+    tier_title = difficulty.capitalize()
     lines += [
-        "# leadforge dataset card",
-        "",
-        "| Field | Value |",
-        "|---|---|",
-        f"| Recipe | `{cfg.recipe_id}` |",
-        f"| Package version | `{cfg.package_version}` |",
-        f"| Seed | `{cfg.seed}` |",
-        f"| Exposure mode | `{cfg.exposure_mode}` |",
-        f"| Difficulty | `{cfg.difficulty}` |",
-        f"| Horizon | {cfg.horizon_days} days |",
-        f"| Label window | {cfg.label_window_days} days |",
-        f"| Feature snapshot window | {snapshot_label} |",
+        f"# B2B Lead Scoring Dataset — {tier_title} Tier",
         "",
     ]
 
     # ------------------------------------------------------------------
-    # Narrative summary
-    # ------------------------------------------------------------------
-    lines.append("## Narrative summary")
-    lines.append("")
-    if narrative is not None:
-        c = narrative.company
-        p = narrative.product
-        m = narrative.market
-        gtm = narrative.gtm_motion
-        lines += [
-            f"**Vendor:** {c.name} ({c.stage}, founded {c.founded_year},"
-            f" {c.hq_city}, {c.hq_country})",
-            "",
-            f"**Product:** {p.name} — {p.category}. "
-            f"Deployment: {p.deployment}. "
-            f"Pricing: {p.pricing_model}. "
-            f"ACV range: ${p.acv_range_usd[0]:,}–${p.acv_range_usd[1]:,}.",
-            "",
-            f"**Target market:** {m.icp_employee_range[0]}–{m.icp_employee_range[1]}-employee"
-            f" firms in {', '.join(m.geographies)}. "
-            f"Key industries: {', '.join(m.icp_industries)}. "
-            f"Average deal size: ${m.avg_deal_size_usd:,}. "
-            f"Average sales cycle: {m.avg_sales_cycle_days} days.",
-            "",
-            f"**GTM motion:** {', '.join(gtm.channels)} "
-            f"({gtm.inbound_share:.0%} inbound / "
-            f"{gtm.outbound_share:.0%} outbound / "
-            f"{gtm.partner_share:.0%} partner).",
-            "",
-            "**Buyer personas:**",
-            "",
-        ]
-        for persona in narrative.personas:
-            ellipsis = "…" if len(persona.title_variants) > 2 else ""
-            lines.append(
-                f"- **{persona.role}** ({persona.decision_authority}) — "
-                f"{', '.join(persona.title_variants[:2])}{ellipsis}"
-            )
-        lines.append("")
-    else:
-        lines += ["*Narrative unavailable for this dataset.*", ""]
-
-    # ------------------------------------------------------------------
-    # Primary task
+    # What is this / prediction task intro
     # ------------------------------------------------------------------
+    snapshot_label = (
+        f"{cfg.snapshot_day} days"
+        if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days
+        else f"{cfg.horizon_days} days (full horizon)"
+    )
     if task_manifest is not None and task_manifest.description:
         label_def = task_manifest.description
     else:
         label_def = (
-            f"Binary label evaluated over a {cfg.label_window_days}-day window "
-            f"from the snapshot anchor date. The label is event-derived — never "
+            f"Binary label: did this lead close as a paid deal within "
+            f"{cfg.label_window_days} days? The label is event-derived — never "
             f"sampled directly."
         )
     lines += [
-        "## Primary task",
+        "**This is a synthetic dataset** for practicing B2B lead scoring. It was "
+        "generated by [leadforge](https://github.com/leadforge-dev/leadforge), an "
+        "open-source Python framework for producing realistic CRM/funnel training "
+        "data. No real company, customer, or transaction is represented.",
+        "",
+        "**What you are predicting:** Each row is a sales lead at a fictional B2B "
+        "SaaS company. The task is binary classification:",
+        "",
+        f"> `{cfg.primary_task}` — {label_def}",
+        "",
+        f"Features capture the first {snapshot_label} of CRM activity per lead "
+        "(email/call touches, product sessions, deal stage, account firmographics). "
+        "The label is derived from simulated events — never directly sampled — so "
+        "there is genuine causal structure behind the signal.",
+        "",
+        "---",
+        "",
+    ]
+
+    # ------------------------------------------------------------------
+    # This tier at a glance
+    # ------------------------------------------------------------------
+    tier_desc = _TIER_DESCRIPTIONS.get(difficulty, _TIER_DESCRIPTION_FALLBACK)
+    lines += [
+        f"## This tier: {difficulty}",
+        "",
+        "| Property | Value |",
+        "|---|---|",
+        f"| Signal strength | {cfg.signal_strength} / 1.0 |"
+        if hasattr(cfg, "signal_strength")
+        else "| Signal strength | see difficulty_profiles.yaml |",
+        "",
+        tier_desc,
         "",
-        f"**Task:** `{cfg.primary_task}`",
+        "This dataset ships in three tiers — **intro → intermediate → advanced** — "
+        "with decreasing signal, lower conversion rates, and heavier noise and "
+        "missingness. All three tiers share the same schema and simulate the same "
+        "fictional B2B world.",
         "",
-        f"**Label definition:** {label_def}",
+        "---",
         "",
     ]
 
@@ -144,22 +168,49 @@ def render_dataset_card(
     lines += ["## Table inventory", ""]
     if table_counts is not None:
         lines += [
-            "| Table | Rows |",
-            "|---|---:|",
+            "| Table | Rows | Description |",
+            "|---|---:|---|",
         ]
+        _table_descriptions = {
+            "accounts": "One row per company",
+            "contacts": "One row per buyer-side individual (multiple per account)",
+            "leads": "One row per lead — the prediction unit",
+            "touches": f"Marketing / SDR outreach events (first {snapshot_label} per lead)",
+            "sessions": f"Product demo or trial sessions (first {snapshot_label} per lead)",
+            "sales_activities": (
+                f"CRM activities: calls, emails, meetings (first {snapshot_label} per lead)"
+            ),
+            "opportunities": f"Deal records opened within the first {snapshot_label} per lead",
+        }
         for tbl, count in table_counts.items():
-            lines.append(f"| {tbl} | {count:,} |")
-        lines.append("")
-    else:
+            desc = _table_descriptions.get(tbl, "")
+            lines.append(f"| {tbl} | {count:,} | {desc} |")
         lines += [
-            "*Table counts not available (pass ``table_counts`` to populate).*",
             "",
+            "**Snapshot-safe:** event tables contain only rows with timestamps "
+            f"≤ {snapshot_label} from lead creation. Outcome columns "
+            f"(`{cfg.primary_task}`, `conversion_timestamp`, `close_outcome`) "
+            "are excluded from the public relational tables — they appear only in the "
+            "task splits.",
+        ]
+    else:
+        lines += [
+            "*Table counts not available (pass `table_counts` to populate).*",
         ]
+    lines += ["", "---", ""]
 
     # ------------------------------------------------------------------
     # Feature categories
     # ------------------------------------------------------------------
-    lines += ["## Feature categories", ""]
+    _category_labels: dict[str, str] = {
+        "account": "Account",
+        "contact": "Contact",
+        "lead_meta": "Lead metadata",
+        "engagement": "Engagement",
+        "sales": "Sales",
+        "target": "Target",
+    }
+    lines += ["## Features", ""]
     category_counts: Counter[str] = Counter()
     for feat in features:
         category_counts[feat.category] += 1
@@ -168,56 +219,194 @@ def render_dataset_card(
         "|---|---:|---|",
     ]
     for cat, count in category_counts.items():
-        examples = [f.name for f in features if f.category == cat and not f.is_target][:3]
-        lines.append(f"| {cat} | {count} | {', '.join(examples)} |")
+        label = _category_labels.get(cat, cat)
+        examples = [f"`{f.name}`" for f in features if f.category == cat and not f.is_target][:3]
+        lines.append(f"| {label} | {count} | {', '.join(examples)} |")
     leakage_cols = [f.name for f in features if f.leakage_risk]
     if leakage_cols:
         lines += [
             "",
-            f"**Leakage-flagged columns:** {', '.join(f'`{c}`' for c in leakage_cols)}. "
+            f"**Leakage-flagged columns:** "
+            f"{', '.join(f'`{c}`' for c in leakage_cols)}. "
+            f"{'This column aggregates' if len(leakage_cols) == 1 else 'These columns aggregate'} "
+            f"events over the full {cfg.horizon_days}-day window (not just the "
+            f"{snapshot_label} feature window) and "
+            f"{'is' if len(leakage_cols) == 1 else 'are'} deliberately retained as "
+            f"a leakage-detection teaching exercise. Drop "
+            f"{'it' if len(leakage_cols) == 1 else 'them'} from your feature set "
+            f"unless you are studying leakage. "
             "See `feature_dictionary.csv` for details.",
         ]
+    lines += [
+        "",
+        "See `feature_dictionary.csv` for the full column-by-column specification.",
+        "",
+        "---",
+        "",
+    ]
+
+    # ------------------------------------------------------------------
+    # Simulated world (company / product / market)
+    # ------------------------------------------------------------------
+    lines.append("## The simulated world")
     lines.append("")
+    if narrative is not None:
+        c = narrative.company
+        p = narrative.product
+        m = narrative.market
+        gtm = narrative.gtm_motion
+        lines += [
+            "The dataset simulates a fictional company — "
+            f"**{c.name}** — "
+            f"a {c.stage} startup ({c.hq_city}, {c.hq_country}, "
+            f"founded {c.founded_year}) selling **{p.name}**, a "
+            f"{p.deployment.replace('_', ' ')} {p.category}. "
+            "Everything below is invented:",
+            "",
+            f"- **Target customers:** {m.icp_employee_range[0]}–{m.icp_employee_range[1]}"
+            f"-employee firms in {', '.join(m.geographies)} "
+            f"({', '.join(m.icp_industries)})",
+            f"- **Deal range:** ${p.acv_range_usd[0]:,}–${p.acv_range_usd[1]:,} ACV; "
+            f"average deal ${m.avg_deal_size_usd:,}; "
+            f"average sales cycle {m.avg_sales_cycle_days} days",
+            f"- **Go-to-market:** {gtm.inbound_share:.0%} inbound marketing, "
+            f"{gtm.outbound_share:.0%} SDR outbound, "
+            f"{gtm.partner_share:.0%} partner referrals",
+        ]
+        if narrative.personas:
+            persona_strs = []
+            for persona in narrative.personas:
+                title = persona.title_variants[0] if persona.title_variants else persona.role
+                # Include the internal role key (e.g. vp_finance) as a machine-readable
+                # anchor alongside the human-readable title.
+                persona_strs.append(
+                    f"{title} / {persona.role} ({persona.decision_authority.replace('_', ' ')})"
+                )
+            lines.append(f"- **Buyer personas:** {', '.join(persona_strs)}")
+        lines += [
+            "",
+            "In this public version, the hidden causal graph, latent trait scores, "
+            "and mechanism parameters are withheld. The instructor companion bundle "
+            "includes them.",
+        ]
+    else:
+        lines += ["*Narrative unavailable for this dataset.*"]
+    lines += ["", "---", ""]
 
     # ------------------------------------------------------------------
-    # Suggested use cases
+    # How to load
     # ------------------------------------------------------------------
     lines += [
-        "## Suggested use cases",
+        "## How to load",
+        "",
+        "```python",
+        "import pandas as pd",
+        "",
+        "# Flat CSV — all leads, all splits combined (convenient for exploration)",
+        'df = pd.read_csv("lead_scoring.csv")',
+        f'X = df.drop(columns=["{cfg.primary_task}"])',
+        f'y = df["{cfg.primary_task}"]',
+        "",
+        "# Parquet task splits — recommended for model training",
+        f'train = pd.read_parquet("tasks/{cfg.primary_task}/train.parquet")',
+        f'valid = pd.read_parquet("tasks/{cfg.primary_task}/valid.parquet")',
+        f'test  = pd.read_parquet("tasks/{cfg.primary_task}/test.parquet")',
+        "",
+        "# Relational tables — for feature engineering",
+        'leads   = pd.read_parquet("tables/leads.parquet")',
+        'touches = pd.read_parquet("tables/touches.parquet")',
+        "```",
         "",
-        "- Teaching binary classification on realistic CRM data",
-        "- Portfolio projects demonstrating end-to-end ML pipelines",
-        "- Benchmarking lead-scoring models under controlled signal/noise conditions",
+        "Splits are 70 / 15 / 15 (train / valid / test), stratified on the target, "
+        f"deterministic given seed {cfg.seed}.",
+        "",
+        "**Note on account overlap:** ~93% of test-set accounts also appear in the "
+        "training set (splits are keyed on `lead_id`). Headline AUC overstates "
+        "generalisation to *unseen* accounts. For a faithful out-of-sample estimate, "
+        'use `GroupKFold().split(X, y, groups=df["account_id"])`.',
+        "",
+        "---",
+        "",
+    ]
+
+    # ------------------------------------------------------------------
+    # Reproducibility
+    # ------------------------------------------------------------------
+    lines += [
+        "## Reproducibility",
+        "",
+        f"Generated with **leadforge v{cfg.package_version}**, "
+        f"recipe `{cfg.recipe_id}`, seed {cfg.seed}, "
+        f"difficulty `{difficulty}`. To reproduce:",
+        "",
+        "```bash",
+        "pip install leadforge",
+        f"leadforge generate --recipe {cfg.recipe_id} --seed {cfg.seed} \\",
+        f"                   --mode {cfg.exposure_mode} --difficulty {difficulty} --out my_bundle",
+        "```",
+        "",
+        "Every file in this bundle is SHA-256 hashed in `manifest.json`. Run "
+        "`leadforge validate my_bundle` to verify integrity.",
+        "",
+        "**Author:** [Shay Palachy Affek](https://huggingface.co/shaypal5) "
+        "· [Kaggle](https://www.kaggle.com/derelictpanda) "
+        "· [GitHub](https://github.com/shaypalachy)",
+        "",
+        "---",
+        "",
+    ]
+
+    # ------------------------------------------------------------------
+    # Intended uses
+    # ------------------------------------------------------------------
+    lines += [
+        "## Intended uses",
+        "",
+        "- Teaching binary classification on realistic B2B CRM data",
+        "- Portfolio projects demonstrating end-to-end lead-scoring pipelines",
+        "- Benchmarking model families under controlled signal / noise / prevalence conditions",
+        "- Teaching leakage detection, calibration, lift, P@K, and value-weighted ranking",
         "- Research on causal structure in funnel conversion data",
         "",
+        "**Out of scope:** production lead scoring (the company is fictional), vendor "
+        "benchmarking / paper baselines, or causal-inference research that requires "
+        "recovery of the true DGP (use the instructor bundle for that).",
+        "",
+        "---",
+        "",
     ]
 
     # ------------------------------------------------------------------
     # Caveats
     # ------------------------------------------------------------------
     if cfg.snapshot_day is not None and cfg.snapshot_day < cfg.horizon_days:
-        feature_window_caveat = (
-            f"- The label is evaluated over the full {cfg.label_window_days}-day "
-            f"window from lead creation; event-aggregate features (e.g. "
-            f"`touch_count`, `session_count`, `expected_acv`) observe only the "
-            f"first {cfg.snapshot_day} days of that window. The deliberate "
-            f"exception is `total_touches_all`, which counts touches over the "
-            f"full {cfg.horizon_days}-day horizon as a pedagogical leakage trap."
+        window_caveat = (
+            f"- **Snapshot window.** Engagement features cover days 0–{cfg.snapshot_day} "
+            f"per lead; the label resolves at day {cfg.label_window_days}. "
+            f"`total_touches_all` is the intentional exception — it aggregates over the "
+            f"full {cfg.horizon_days}-day window and is a leakage trap."
         )
     else:
-        feature_window_caveat = (
-            "- Features are anchored at the snapshot date. No post-anchor data is "
-            "included (leakage-free by construction)."
+        window_caveat = (
+            "- **Snapshot window.** Features are anchored at the snapshot date. "
+            "No post-anchor data is included (leakage-free by construction), "
+            "except `total_touches_all` which is the intentional leakage trap."
         )
+
     lines += [
         "## Caveats",
         "",
-        "- This is **synthetic** data. It does not represent any real company, product, or market.",
-        "- The hidden world structure varies by motif family and stochastic rewiring; "
-        "no two seeds produce the same DGP.",
-        feature_window_caveat,
-        "- In `student_public` mode, the latent world graph, mechanism summary, "
-        "and full world spec are withheld.",
+        "- **Synthetic data only.** No real company, customer, or market is represented.",
+        "- **AUC does not distinguish tiers.** LR AUC is similar across all three tiers "
+        "by design. The tiers differ in conversion rate, noise, and missing values — not "
+        "in rank discrimination. Use average precision, P@K, and calibration metrics to "
+        "see the difficulty gradient.",
+        "- **~93% train/test account overlap.** Splits are keyed on `lead_id`; most test "
+        "accounts also appear in train. Headline metrics overstate generalisation to "
+        "unseen accounts.",
+        window_caveat,
+        "- **Public version.** The hidden causal graph, latent trait scores, and "
+        "mechanism parameters are withheld. The instructor companion bundle includes them.",
         "",
     ]
 
diff --git a/release/huggingface-instructor/README.md b/release/huggingface-instructor/README.md
index 61ac5ad..0a339ad 100644
--- a/release/huggingface-instructor/README.md
+++ b/release/huggingface-instructor/README.md
@@ -1,6 +1,8 @@
 ---
 pretty_name: 'LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion'
 license: mit
+authors:
+  - shaypal5
 language:
   - en
 task_categories:
diff --git a/release/huggingface/README.md b/release/huggingface/README.md
index 2cfce2c..e10360e 100644
--- a/release/huggingface/README.md
+++ b/release/huggingface/README.md
@@ -1,6 +1,8 @@
 ---
 pretty_name: 'LeadForge: Synthetic B2B Lead Scoring (v1)'
 license: mit
+authors:
+  - shaypal5
 language:
   - en
 task_categories:
diff --git a/release/kaggle/dataset-metadata.json b/release/kaggle/dataset-metadata.json
index 051a9c3..dcda826 100644
--- a/release/kaggle/dataset-metadata.json
+++ b/release/kaggle/dataset-metadata.json
@@ -1,5 +1,5 @@
 {
-  "collaborators": [],
+  "collaborators": [{"username": "derelictpanda", "role": "writer"}],
   "description": "# LeadForge: Synthetic B2B Lead Scoring Dataset (`leadforge-lead-scoring-v1`)\n\nA relational, reproducible, three-tier synthetic CRM dataset family for\nteaching lead scoring at scale. Generated by\n[leadforge](https://github.com/leadforge-dev/leadforge), an\nopen-source Python framework for synthetic CRM/funnel data. The\nframework version is decoupled from the dataset version: the package\nstays at `1.x`; the dataset is published under the explicit `…-v1`\ntag.\n\n## Why lead scoring matters in 2024–2026\n\nMid-market SaaS vendors entered 2024–2026 with growth slowing and\ncustomer-acquisition costs rising[^macro], so predicting *which* leads\nconvert within a fixed window has moved from a marketing nicety to a\nsurvival skill. This dataset teaches that skill on a relational\nsubstrate, with the realistic confusions (snapshot-window discipline,\nleakage traps, channel signal weaker than vendor blogs imply) that\nstudents will hit when they finally get hands on real CRM data.\n\n[^macro]: Macroeconomic framing summarised in\n[`docs/external_review/summaries/gemini_v2_summary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/external_review/summaries/gemini_v2_summary.md)\n(median public-SaaS growth 30%→25% from 2023 to 2025; New CAC Ratio\nrose materially in 2024).\n\n## What's inside\n\n```\n.\n├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier\n│   ├── manifest.json                 # provenance + file hashes\n│   ├── metrics.json                  # per-tier headline metrics (medians + spreads)\n│   ├── dataset_card.md               # auto-rendered per-bundle card\n│   ├── feature_dictionary.csv        # authoritative column spec\n│   ├── lead_scoring.csv              # flat convenience CSV (all splits)\n│   ├── tables/*.parquet              # 7 snapshot-safe relational tables\n│   └── tasks/converted_within_90_days/{train,valid,test}.parquet\n├── docs/                             # vendored 
DGP / leakage / break-me docs (agent-readable)\n├── metrics.json                      # top-level cross-tier metrics summary\n├── claims_register.{md,json}         # claims → backing-artifact map (agent-readable)\n├── dataset-metadata.json             # Kaggle dataset metadata\n├── dataset-cover-image.png           # Kaggle cover image\n├── README.md                         # Kaggle package README\n└── LICENSE\n```\n\n`student_public` bundles ship the snapshot-safe relational view;\n`research_instructor` companions ship the full-horizon view plus the\nhidden causal structure (DAG, latent registry, mechanism summary)\nunder `metadata/`. The full layout is documented in each bundle's\n`manifest.json`.\n\n### Agent-reviewable artifacts\n\nThe published bundle is self-contained for AI review and offline\nauditing — every numeric / structural claim on this page can be\nverified without following an external link:\n\n- **`metrics.json` (root) + `<tier>/metrics.json`** — deterministic\n  JSON view of the headline LR AUC / AP / P@100 / Brier / conversion\n  rate / cohort-shift / cross-tier-ordering medians, with JSON-path\n  back-references to `validation/validation_report.json` (the\n  source of truth).\n- **`claims_register.{md,json}`** — every numerical or structural\n  claim on this page paired with the artifact and path that backs it.\n  Rendered from `claims_register_source.yaml` by\n  `scripts/build_claims_register.py`.\n- **`docs/`** — vendored copies of `generation_method.md`,\n  `channel_signal_audit.md`, `break_me_guide.md`,\n  `feature_dictionary.md`, `v1_acceptance_gates_bands.yaml`,\n  `v2_decision_log.md`, plus a hand-authored\n  `relational_table_schemas.csv` documenting every column of every\n  relational table.  
These match the GitHub-blob links cited below but\n  ship inside the bundle so a reviewer never needs network access.\n- **`<tier>/manifest.json`** — SHA-256 hash for every file plus the\n  full redaction contract (`structural_redactions.columns`,\n  `omitted_tables`, `relational_snapshot_safe`, `snapshot_day`).\n- Kaggle / HuggingFace preview pages additionally inject a\n  `schema.org/Dataset` JSON-LD block in their `<head>` for agent\n  ingestion without HTML parsing.\n\n## Quick start\n\n```python\n# Flat CSV\ndf = pd.read_csv(\"intermediate/lead_scoring.csv\")\n\n# Parquet task splits (recommended)\ntrain = pd.read_parquet(\"intermediate/tasks/converted_within_90_days/train.parquet\")\ntest  = pd.read_parquet(\"intermediate/tasks/converted_within_90_days/test.parquet\")\n\n# Relational tables (feature engineering — example)\nleads   = pd.read_parquet(\"intermediate/tables/leads.parquet\")\ntouches = pd.read_parquet(\"intermediate/tables/touches.parquet\")\nmy_touch_count = (\n    touches.groupby(\"lead_id\").size().rename(\"my_touch_count\").reset_index()\n)\nfeatures = leads.merge(my_touch_count, on=\"lead_id\", how=\"left\")\n\n# Reproduce from source\n# pip install leadforge\n# leadforge generate --recipe b2b_saas_procurement_v1 --seed 42 \\\n#                    --mode student_public --difficulty intermediate --out my_bundle\n```\n\nThe label `converted_within_90_days` resolves over a 90-day window;\nengagement features (`touch_count`, `session_count`, etc.) are\ncomputed strictly over events on days `[0, 30]`. The deliberate\nexception is `total_touches_all`, the leakage trap — flagged\n`leakage_risk=True` in `feature_dictionary.csv`. Drop it from your\nfeature set unless you're demonstrating leakage detection.\n\n## Evaluation note — account and contact overlap\n\n**518 of 557 test accounts (≈93 %) appear in train** on the intermediate\nbundle; the other tiers are similar. 
Contact-level overlap is comparable\nin magnitude: most test contacts also have activity in the training set.\nThe random-split headline metrics therefore ride both account-level and\ncontact-level signal across the split boundary and over-estimate\ngeneralisation to unseen accounts and contacts. For a faithful\nout-of-sample number, retrain with `GroupKFold(account_id)` and report\nboth metrics. Notebook 02 demonstrates the detection recipe;\n[`break_me_guide.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/break_me_guide.md) §5 gives\nthe worked example.\n\n## Dataset summary\n\n**Tiers are prevalence and noise axes, not modelling-complexity axes.**\nLR AUC is ~0.88 in every tier by design. The tiers differ in conversion\nrate, missingness, and noise — not rank discrimination. Choose a tier\nbased on the teaching exercise, not on expected AUC:\n\n| | Intro | Intermediate | Advanced |\n|---|---|---|---|\n| **Tier purpose** | High-prevalence warm-up | Default benchmark | Low-prevalence · calibration · noise exercise |\n| Leads | 5,000 | 5,000 | 5,000 |\n| Accounts | 1,500 | 1,500 | 1,500 |\n| Contacts | 4,200 | 4,200 | 4,200 |\n| Snapshot columns | 31 / 34* | 31 / 34* | 31 / 34* |\n| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` |\n| Conversion rate (acceptance band, gate G7.\\*) | 24–61% | 12–31% | 4–12% |\n| Conversion rate (observed median, seeds 42–46) | 42.67% | 21.60% | 8.40% |\n| Signal strength | 0.90 | 0.70 | 0.50 |\n| Noise scale | 0.10 | 0.30 | 0.55 |\n| Missing rate | 2% | 8% | 18% |\n\n\\* `student_public` / `research_instructor`. Difficulty is modulated\nby the simulation engine — signal strength on latent-trait weights,\nGaussian noise on float features, MCAR missingness, outlier rate —\nnot post-hoc label flipping. 
The acceptance band is the recipe\ngate's tolerance window (`v1_acceptance_gates_bands.yaml` G7.\\*),\nnot the achievable range — observed five-seed spreads sit\ncomfortably inside the band.\n\n## The scenario\n\n**Veridian Technologies** is a fictional Series B startup (Austin, US)\nselling **Veridian Procure**, a procurement / AP automation SaaS, to\nmid-market firms (200–2,000 employees) in the US and UK. The funnel\nruns through inbound marketing (45%), SDR outbound (35%), and\npartner referrals (20%); four personas drive deals (VP Finance, AP\nManager, IT Director, Procurement Manager). **Task:** predict whether\na lead converts (`closed_won`) within 90 days. ACV bands are\n$18k–$120k. See\n[`docs/release/generation_method.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/generation_method.md)\nfor the full DGP, and the deeper \"what's modelled / approximate / not\nmodelled\" breakdown that this README only summarises.\n\n## Public vs instructor: what's redacted\n\nFiltering happens **during rendering**, not during simulation. 
The\nredaction contract is single-sourced in\n[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py);\nthe snapshot-safe writer and the validator import the same constants,\nso they cannot drift apart.\n\n| Source-of-truth constant | Public bundle treatment |\n|---|---|\n| `BANNED_LEAD_COLUMNS = (\"converted_within_90_days\", \"conversion_timestamp\")` | Dropped from `tables/leads.parquet` |\n| `BANNED_OPP_COLUMNS = (\"close_outcome\", \"closed_at\")` | Dropped from `tables/opportunities.parquet` |\n| `BANNED_TABLES = (\"customers\", \"subscriptions\")` | Omitted from public bundles |\n| `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` |\n| Snapshot redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` |\n| `total_touches_all` (deliberate trap) | **Retained in both modes**; flagged `leakage_risk=True` |\n\nEach bundle's `manifest.json` records `relational_snapshot_safe`,\n`redacted_columns`, and `snapshot_day`, so the bundle is\nself-describing.\n\n## Calibration\n\nEvery realism / calibration / difficulty claim in this README is\nbacked by\n[`validation/validation_report.md`](https://github.com/leadforge-dev/leadforge/blob/main/release/validation/validation_report.md),\nregenerated by\n[`scripts/validate_release_candidate.py`](https://github.com/leadforge-dev/leadforge/blob/main/scripts/validate_release_candidate.py)\nwith bands declared in\n[`docs/release/v1_acceptance_gates_bands.yaml`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v1_acceptance_gates_bands.yaml).\nHeadline cross-seed medians (seeds 42–46):\n\n| Tier | LR AUC | AP | P@100 | Brier | `calibration_max_bin_error` |\n|---|---|---|---|---|---|\n| intro | 0.879 | 0.761 | 0.80 | 0.130 | 0.25 |\n| intermediate | 0.886 | 0.575 | 0.59 | 0.110 | 0.25 |\n| advanced | 0.886 | 0.351 
| 0.34 | 0.061 | **0.52** |\n\n**Reading this table:** LR AUC is flat across tiers by design — the\ntiers are a prevalence / noise axis, not a rank-discrimination axis.\nBrier score *improves* as prevalence falls (a prevalence effect, not\nbetter calibration); use `calibration_max_bin_error` to assess\ncalibration quality. Advanced's 0.52 max-bin error means the model's\npredicted probabilities are materially mis-scaled against actual\nconversion rates — a realistic miscalibration exercise.\n\nAP, P@100, conversion-rate, and lift orderings hold across the\nintended prevalence axis (intro > intermediate > advanced).\n\n## Intended uses\n\n- Teaching baseline lead-scoring on a flat snapshot.\n- Teaching relational feature engineering against snapshot-safe tables.\n- Teaching leakage detection (the `total_touches_all` trap is\n  designed to be discoverable).\n- Teaching calibration, lift, P@K, value-aware ranking\n  (`expected_acv × P(convert)`), and cohort-shift evaluation.\n- Comparing model families under a controlled DGP.\n\n## Out-of-scope uses\n\n- **Production lead scoring.** The company, product, and customers are\n  fictional.\n- **Vendor benchmarking / paper baselines.** Difficulty tiers are\n  calibrated for pedagogy, not cross-paper comparability.\n- **Causal-inference research that requires recovery of the true DGP.**\n  The instructor companion exposes the hidden graph for teaching, not\n  designed counterfactuals.\n- **Demographic / fairness research.** v1 does not model protected\n  attributes.\n\n## Known limitations\n\n- **Tiers are a prevalence / noise axis, not a modelling-complexity\n  axis.** LR AUC is ~0.88 in every tier; the three tiers differ in\n  conversion rate (43% / 22% / 8%), noise scale, and missingness —\n  not in rank discrimination. 
Use AP, P@K, and calibration metrics\n  to see the difficulty gradient; AUC alone will not show it.\n- **93% account and contact overlap across train / test splits.** Random\n  splits are keyed on lead ID; most test accounts and contacts also\n  appear in train. Headline metrics over-state generalisation to unseen\n  accounts and contacts. Use `GroupKFold(account_id)` for a faithful\n  estimate.\n- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta\n  is slightly negative in every tier (intro −0.0045, intermediate\n  −0.0072, advanced −0.0133); v1's snapshot is dominated by linear\n  features. v2 will inject non-linear interactions in the simulator.\n- **Channel signal is weak.** Per\n  [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md),\n  out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across\n  all tiers and the per-channel rate spread is ≤0.05. The simulator\n  does not encode channel-conditional probabilities; channel-conditional\n  encoding is post-v1 work.\n- **Cohort-shift degradation is small.** v1 has no time-of-year drift\n  baked in; the cohort-shift gate (G6.4) is informational and will\n  bite in v2.\n- **Advanced-tier noise can produce artifact zeros in count and duration\n  columns.** Gaussian noise is applied before MCAR missingness; the\n  snapshot builder clamps results below zero to zero. What users observe\n  is therefore not negative values but zeros that may be noise artifacts\n  rather than true zero values — e.g. `days_since_last_touch = 0` might\n  mean \"noised below zero, clamped\" rather than \"touched today\". Treat\n  suspicious zero clusters in the Advanced tier as intentional\n  data-cleaning exercise material.\n\n## Composition\n\n- **Entities.** Accounts, contacts, leads, touches, sessions,\n  sales_activities, opportunities (public); plus customers and\n  subscriptions (instructor only). 
Per-row counts per bundle live in\n  `manifest.json`.\n- **Features.** 31 public columns grouped by analytical role in\n  [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md);\n  the per-bundle `feature_dictionary.csv` is the authoritative\n  machine-readable spec.\n- **Label.** `converted_within_90_days` (boolean), event-derived from\n  the simulator. Never sampled directly.\n- **Splits.** 70/15/15 train/valid/test, deterministic given seed;\n  recorded in `tasks/converted_within_90_days/task_manifest.json`.\n  Splits are keyed on `lead_id`; see the *Evaluation note* above for\n  the account-overlap caveat.\n- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package\n  version stamped in `manifest.json`.\n\n## Maintenance, adversarial framing, license\n\nWe *want* the dataset to be broken. The\n[break-me guide](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/break_me_guide.md) catalogues\nnine adversarial patterns to look for (leakage, split\ncontamination, ranking inversions, calibration drift) with\nworked-example pointers back into the notebooks. Issue\ntemplates ship under `.github/ISSUE_TEMPLATE/`: a\n[breakage report](https://github.com/leadforge-dev/leadforge/blob/main/.github/ISSUE_TEMPLATE/dataset_breakage_report.yml)\nform for findings on the bundle itself, and a\n[realism feedback](https://github.com/leadforge-dev/leadforge/blob/main/.github/ISSUE_TEMPLATE/realism_feedback.yml)\nform for distributional critiques. 
Accepted findings are\nlogged in\n[`docs/release/v2_decision_log.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v2_decision_log.md).\nFile issues at\n[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge);\nPRs welcome.\n\n| Field | Value |\n|---|---|\n| Generator | leadforge `1.0.0+` |\n| Recipe | `b2b_saas_procurement_v1` |\n| Canonical seed | 42 (cross-seed sweep: 42–46) |\n| Bundle schema version | 5 |\n| Format | Parquet (canonical) + CSV (convenience) |\n| License | MIT — see [LICENSE](LICENSE) |\n\nVerify integrity with `leadforge validate <bundle_dir>`; every file\nis hashed in `manifest.json`.\n",
   "expectedUpdateFrequency": "never",
   "id": "leadforge/leadforge-lead-scoring-v1",
diff --git a/tests/narrative/test_dataset_card.py b/tests/narrative/test_dataset_card.py
index 386dc10..bd7ab0d 100644
--- a/tests/narrative/test_dataset_card.py
+++ b/tests/narrative/test_dataset_card.py
@@ -72,8 +72,12 @@ def test_card_renders_custom_task_and_window() -> None:
 
 
 def test_card_contains_use_cases() -> None:
+    # Card must explain what the dataset is intended for (section heading may vary).
     card = render_dataset_card(_make_world_spec())
-    assert "use cases" in card.lower()
+    card_lower = card.lower()
+    has_use_cases = "use cases" in card_lower
+    has_intended = "intended" in card_lower
+    assert has_use_cases or has_intended
 
 
 def test_card_contains_caveats() -> None:
@@ -110,9 +114,10 @@ def test_card_with_narrative_contains_geographies() -> None:
 
 
 def test_card_with_narrative_contains_personas() -> None:
+    # Persona information should appear; new format uses human titles alongside role keys.
     gen = Generator.from_recipe("b2b_saas_procurement_v1")
     card = render_dataset_card(gen.world_spec)
-    assert "vp_finance" in card
+    assert "vp_finance" in card  # role key included as machine-readable anchor
 
 
 # ---------------------------------------------------------------------------
@@ -240,10 +245,11 @@ def test_card_feature_categories_rendered() -> None:
     """Feature categories are always rendered from LEAD_SNAPSHOT_FEATURES."""
     card = render_dataset_card(_make_world_spec())
     assert "| Category | Count | Examples |" in card
-    assert "account" in card
-    assert "engagement" in card
-    assert "sales" in card
-    assert "target" in card
+    card_lower = card.lower()
+    assert "account" in card_lower
+    assert "engagement" in card_lower
+    assert "sales" in card_lower
+    assert "target" in card_lower
 
 
 def test_card_leakage_flagged_columns() -> None:
@@ -251,4 +257,5 @@ def test_card_leakage_flagged_columns() -> None:
     card = render_dataset_card(_make_world_spec())
     assert "`total_touches_all`" in card
     assert "`current_stage`" in card
-    assert "Leakage-flagged" in card
+    # Heading may vary; invariant: leakage is mentioned (any case) and both columns are named.
+    assert "leakage" in card.lower() or "Leakage-flagged" in card