From b8889496b0635d35bde9d55cf7f1930ef0f582f9 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 11 Jun 2026 20:19:31 +0300 Subject: [PATCH 1/5] feat(lifecycle): customer population builder [LTV-Ph] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First implementation milestone of the pLTV workstream (LTV-M3). Adds the lifecycle customer population builder — the starting point for the post- conversion subscription simulation. leadforge/schemes/lifecycle/population.py: - build_customer_population(n_customers, seed, motif_family, *, n_accounts, observation_date, acquisition_window_weeks) → CustomerPopulationResult. Two named RNG substreams (lifecycle_population_{accounts,customers}) keep each generation aspect independently stable. - CustomerPopulationResult: accounts, customers, latent_state, observation_date. - CustomerLatentState: account_latents + customer_latents dicts. - LIFECYCLE_MOTIF_FAMILIES tuple: 5 retention motif families (product_led_retention, relationship_led_retention, expansion_led_growth, payment_fragile, churner_dominated) each with distinct latent-mean biases. - 5 customer latent traits: latent_product_fit, latent_adoption_velocity, latent_budget_stability, latent_champion_strength, latent_organizational_stability. - D3 seam: opportunity_id=None (independent generation); reserved for future chaining from a lead-scoring bundle's converted leads. - D4 staggered starts: customer_start_at sampled uniformly in [obs_date - acquisition_window_weeks, obs_date), varying tenure at snapshot. - Plan + MRR: employee-band-conditional plan tier (starter/growth/enterprise) with MRR ranges $1k-$3.5k / $3.5k-$9k / $9k-$25k per month. - Contract terms: 12mo (65%) / 24mo (35%). tests/schemes/lifecycle/test_population.py (28 tests): - Shape: counts, type assertions, observation_date format. - Determinism: same seed → identical output; different seeds → different. 
- FK integrity: every customer.account_id in the account set; latent state covers exactly the customer and account populations. - Staggered starts: all starts < observation_date; all within acquisition window; distribution spans both halves of the window. - D3 seam: opportunity_id is None for all customers. - Latent distributions: all values in [0,1]; exactly 5 traits per customer. - Motif families: all 5 registered; each produces valid output; product_led has higher mean latent_product_fit than churner_dominated. - Entity fields: plan, MRR, contract_term, CSM rep, ID prefixes all valid. Full suite 1565 passed / 51 skipped; ruff + mypy clean. Co-Authored-By: Claude Sonnet 4.6 --- .agent-plan.md | 5 +- docs/ltv/roadmap.md | 4 +- leadforge/schemes/lifecycle/population.py | 373 +++++++++++++++++++++ tests/schemes/lifecycle/test_population.py | 263 +++++++++++++++ 4 files changed, 641 insertions(+), 4 deletions(-) create mode 100644 leadforge/schemes/lifecycle/population.py create mode 100644 tests/schemes/lifecycle/test_population.py diff --git a/.agent-plan.md b/.agent-plan.md index fc4b7fb..0a3b2f3 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -52,8 +52,9 @@ merged (#111); Pg.2 (split lead-scoring schema: entity rows/ALL_ROW_TYPES/ ALL_CONSTRAINTS/LEAD_SNAPSHOT_FEATURES/CONVERTED_WITHIN_90_DAYS moved to `schemes/lead_scoring/`; shared primitives stay in `schema/`) opened as **#112**. All M2 moves byte-identical. Sibling `leadforge-datasets-private` consumes bundle files, not internals — no lockstep update needed (heads-up -issue #8). Next: `LTV-Pg.2`, then `LTV-Pc` (pLTV feature/task specs, authored in -`schemes/lifecycle/`), then `LTV-M3` (lifecycle population). +issue #8). Next: `LTV-Pg.2` merged (#112). **LTV-M3** started: `LTV-Ph` (lifecycle customer +population builder) opened as **#NNN**. Next: `LTV-Pi` (lifecycle motif +families + mechanism policies). 
--- diff --git a/docs/ltv/roadmap.md b/docs/ltv/roadmap.md index 231ec9c..2a93bb4 100644 --- a/docs/ltv/roadmap.md +++ b/docs/ltv/roadmap.md @@ -43,7 +43,7 @@ protocol + registry, with the package physically reorganized into | `LTV-M0` | Planning + design lock | `LTV-Pa` | #102, #103 (+ scheme reframe) | | `LTV-M1` | Lifecycle schema foundation | `LTV-Pb`, `LTV-Pc` | #104 (Pb) | | `LTV-M2` | Generation-scheme architecture + physical reorg | `LTV-Pd`, `LTV-Pe`, `LTV-Pf`, `LTV-Pg` | #107 (Pd), #108 (Pe), #109 (Pf.1), #110 (Pf.2), #111 (Pg.1), #112 (Pg.2) | -| `LTV-M3` | Customer population + lifecycle world | `LTV-Ph`, `LTV-Pi` | | +| `LTV-M3` | Customer population + lifecycle world | `LTV-Ph`, `LTV-Pi` | #NNN (Ph) | | `LTV-M4` | Lifecycle simulation engine | `LTV-Pj`, `LTV-Pk` | | | `LTV-M5` | Customer snapshots + pLTV targets (both regimes) | `LTV-Pl`, `LTV-Pm` | | | `LTV-M6` | Register LifecycleScheme + recipe + manifest/version | `LTV-Pn`, `LTV-Po` | | @@ -157,7 +157,7 @@ Total: ~19 PRs across 9 milestones. > Built directly under `schemes/lifecycle/`. -- [ ] **`LTV-Ph`** — `feat(lifecycle): customer population builder`. Customer +- [ ] **`LTV-Ph`** — `feat(lifecycle): customer population builder` (**PR #NNN**). Customer entities, 5 new latent traits, **staggered start dates** ending at the absolute `observation_date` (D4); seam for future chained generation (D3). - Tests: determinism, latent distributions, staggered-start spread, FK diff --git a/leadforge/schemes/lifecycle/population.py b/leadforge/schemes/lifecycle/population.py new file mode 100644 index 0000000..d8671a6 --- /dev/null +++ b/leadforge/schemes/lifecycle/population.py @@ -0,0 +1,373 @@ +"""Lifecycle customer population generation. + +:func:`build_customer_population` is the single public entry point. It +produces a cohort of customers (accounts + lifecycle customer entities + +latent state) for the pLTV simulation. 
+ +Design decisions reflected here +--------------------------------- +**D3 (independent generation)**: customers are generated self-contained — not +derived from a lead-scoring bundle's converted leads. A seam for future +chained generation exists via the nullable ``opportunity_id`` field on +:class:`~leadforge.schemes.lifecycle.entities.CustomerLifecycleRow`. + +**D4 (staggered start dates + fixed observation date)**: customers are acquired +across an ``acquisition_window_weeks`` window ending at the absolute +``observation_date``. Each customer receives a uniformly-sampled start date +within that window, so tenure-at-observation naturally varies from near-zero +(cold-start) to the full window length. + +RNG substreams +-------------- +All randomness derives from two named :class:`~leadforge.core.rng.RNGRoot` +substreams so population and latent draws are independently stable: + +- ``lifecycle_population_accounts`` — account entity generation. +- ``lifecycle_population_customers`` — customer entity + latent generation. +""" + +from __future__ import annotations + +import random +from dataclasses import dataclass, field +from datetime import date, timedelta + +from leadforge.core.ids import ID_PREFIXES, make_id +from leadforge.core.rng import RNGRoot +from leadforge.schema.entities import AccountRow +from leadforge.schemes.lifecycle.entities import CustomerLifecycleRow + +# --------------------------------------------------------------------------- +# Output types +# --------------------------------------------------------------------------- + + +@dataclass +class CustomerLatentState: + """Hidden ground-truth latent variables for the lifecycle population. + + Each mapping is ``entity_id → {trait_name: float_in_[0,1]}``. 
+ """ + + account_latents: dict[str, dict[str, float]] = field(default_factory=dict) + customer_latents: dict[str, dict[str, float]] = field(default_factory=dict) + + +@dataclass +class CustomerPopulationResult: + """Output of one :func:`build_customer_population` call.""" + + accounts: list[AccountRow] + customers: list[CustomerLifecycleRow] + latent_state: CustomerLatentState + # ISO-8601 date at which snapshots and labels are anchored. + observation_date: str = "" + + +# --------------------------------------------------------------------------- +# Internal constants +# --------------------------------------------------------------------------- + +# Shared with the lead-scoring account generator so account firmographics are +# drawn from the same distribution (accounts are the same entity in both worlds). +_EMPLOYEE_BANDS = ("200-499", "500-999", "1000-1999", "2000+") +_EMPLOYEE_BAND_WEIGHTS = (0.35, 0.35, 0.20, 0.10) + +_REVENUE_BANDS = ("$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+") +_REVENUE_BAND_WEIGHTS = (0.25, 0.40, 0.25, 0.10) + +_PROCESS_MATURITY_BANDS = ("low", "medium", "high") +_PROCESS_MATURITY_BAND_WEIGHTS = (0.30, 0.45, 0.25) +_PROCESS_MATURITY_MEANS = {"low": 0.25, "medium": 0.50, "high": 0.75} + +# Industries drawn from the ICP defined in the procurement narrative. +_ICP_INDUSTRIES = ( + "manufacturing", + "logistics", + "professional_services", + "healthcare_non_clinical", +) + +_GEOGRAPHIES = ("US", "UK") + +# Subscription plans with MRR ranges (USD/month) indexed by employee band. +# Larger accounts tend to land on higher-ACV plans. 
+_PLAN_BY_EMPLOYEE_BAND: dict[str, tuple[str, ...]] = { + "200-499": ("starter", "starter", "growth"), + "500-999": ("starter", "growth", "growth"), + "1000-1999": ("growth", "growth", "enterprise"), + "2000+": ("growth", "enterprise", "enterprise"), +} + +_MRR_RANGE_BY_PLAN: dict[str, tuple[int, int]] = { + "starter": (1_000, 3_500), + "growth": (3_500, 9_000), + "enterprise": (9_000, 25_000), +} + +_CONTRACT_TERMS_MONTHS = (12, 24) +_CONTRACT_TERM_WEIGHTS = (0.65, 0.35) + +# Number of CSM reps assigned to customers. +_N_CSMS = 8 + +# Calendar base: observation date is derived relative to this anchor. +# Matches the lead-scoring world base date so any future cohort-linking +# remains temporally coherent. +_WORLD_BASE_DATE = date(2024, 1, 1) + +# Default acquisition window in weeks before the observation date. +_DEFAULT_ACQUISITION_WINDOW_WEEKS = 52 + +# Motif-family-specific additive bias on the 0.50 latent mean. +# Five retention motif families (see docs/ltv/design.md §6.1). +_MOTIF_LATENT_BIAS: dict[str, dict[str, float]] = { + "product_led_retention": { + "latent_product_fit": 0.12, + "latent_adoption_velocity": 0.06, + }, + "relationship_led_retention": { + "latent_champion_strength": 0.14, + "latent_organizational_stability": 0.06, + }, + "expansion_led_growth": { + "latent_adoption_velocity": 0.16, + "latent_product_fit": 0.06, + }, + "payment_fragile": { + "latent_budget_stability": -0.18, + "latent_organizational_stability": -0.06, + }, + "churner_dominated": { + "latent_product_fit": -0.14, + "latent_champion_strength": -0.10, + }, +} + +LIFECYCLE_MOTIF_FAMILIES: tuple[str, ...] 
= tuple(_MOTIF_LATENT_BIAS.keys()) + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- + + +def build_customer_population( + n_customers: int, + seed: int, + motif_family: str = "product_led_retention", + *, + n_accounts: int | None = None, + observation_date: str | None = None, + acquisition_window_weeks: int = _DEFAULT_ACQUISITION_WINDOW_WEEKS, +) -> CustomerPopulationResult: + """Generate accounts and lifecycle customers with their latent states. + + All randomness is derived from named :class:`~leadforge.core.rng.RNGRoot` + substreams, making the result fully deterministic for a given + ``(n_customers, seed, motif_family, n_accounts, observation_date, + acquisition_window_weeks)``. + + Args: + n_customers: Number of customer entities to generate. + seed: Master RNG seed. + motif_family: One of the five lifecycle retention motif families + (see :data:`LIFECYCLE_MOTIF_FAMILIES`). Controls the mean of + latent traits, making the simulated world structurally coherent. + n_accounts: Number of account entities to generate. Defaults to + ``max(n_customers // 3, 1)`` — on average ~3 customers per + account, reflecting enterprise B2B upsell / multi-product. + observation_date: ISO-8601 date at which the snapshot and labels are + anchored (the "as-of" date for the pLTV model). Defaults to + ``_WORLD_BASE_DATE + 1 year``. + acquisition_window_weeks: Width of the customer acquisition window + (weeks before ``observation_date``). Customer start dates are + sampled uniformly within this window, producing the tenure + variation needed for a realistic cold-start subpopulation. + + Returns: + A :class:`CustomerPopulationResult` containing the account list, + customer list, and latent state. + + Raises: + ValueError: if ``motif_family`` is not one of the registered families. 
+ """ + if motif_family not in _MOTIF_LATENT_BIAS: + raise ValueError( + f"Unknown lifecycle motif family {motif_family!r}. " + f"Valid families: {sorted(_MOTIF_LATENT_BIAS)}" + ) + + if n_accounts is None: + n_accounts = max(n_customers // 3, 1) + + obs_date: date + if observation_date is None: + obs_date = _WORLD_BASE_DATE + timedelta(weeks=acquisition_window_weeks + 4) + else: + obs_date = date.fromisoformat(observation_date) + + acq_start: date = obs_date - timedelta(weeks=acquisition_window_weeks) + + root = RNGRoot(seed) + bias = _MOTIF_LATENT_BIAS.get(motif_family, {}) + + accounts, acct_latents = _generate_accounts( + n=n_accounts, + bias=bias, + rng=root.child("lifecycle_population_accounts"), + ) + + customers, cust_latents = _generate_customers( + n=n_customers, + accounts=accounts, + bias=bias, + acq_start=acq_start, + obs_date=obs_date, + rng=root.child("lifecycle_population_customers"), + ) + + return CustomerPopulationResult( + accounts=accounts, + customers=customers, + latent_state=CustomerLatentState( + account_latents=acct_latents, + customer_latents=cust_latents, + ), + observation_date=obs_date.isoformat(), + ) + + +# --------------------------------------------------------------------------- +# Account generation +# --------------------------------------------------------------------------- + + +def _generate_accounts( + n: int, + bias: dict[str, float], + rng: random.Random, +) -> tuple[list[AccountRow], dict[str, dict[str, float]]]: + """Generate *n* account entities with latent traits.""" + rows: list[AccountRow] = [] + latents: dict[str, dict[str, float]] = {} + + for i in range(1, n + 1): + acct_id = make_id(ID_PREFIXES["account"], i) + industry = rng.choice(_ICP_INDUSTRIES) + region = rng.choice(_GEOGRAPHIES) + employee_band = rng.choices(_EMPLOYEE_BANDS, weights=_EMPLOYEE_BAND_WEIGHTS, k=1)[0] + revenue_band = rng.choices(_REVENUE_BANDS, weights=_REVENUE_BAND_WEIGHTS, k=1)[0] + maturity_band = rng.choices( + _PROCESS_MATURITY_BANDS, 
weights=_PROCESS_MATURITY_BAND_WEIGHTS, k=1 + )[0] + days_before = rng.randint(30, 730) + created_at = (_WORLD_BASE_DATE - timedelta(days=days_before)).isoformat() + + rows.append( + AccountRow( + account_id=acct_id, + company_name=f"Company {acct_id}", + industry=industry, + region=region, + employee_band=employee_band, + estimated_revenue_band=revenue_band, + process_maturity_band=maturity_band, + created_at=created_at, + ) + ) + latents[acct_id] = { + "latent_account_fit": _sample_latent(rng, 0.50 + bias.get("latent_account_fit", 0.0)), + "latent_budget_readiness": _sample_latent( + rng, 0.50 + bias.get("latent_budget_readiness", 0.0) + ), + "latent_process_maturity": _sample_latent( + rng, _PROCESS_MATURITY_MEANS[maturity_band], std=0.15 + ), + } + + return rows, latents + + +# --------------------------------------------------------------------------- +# Customer generation +# --------------------------------------------------------------------------- + + +def _generate_customers( + n: int, + accounts: list[AccountRow], + bias: dict[str, float], + acq_start: date, + obs_date: date, + rng: random.Random, +) -> tuple[list[CustomerLifecycleRow], dict[str, dict[str, float]]]: + """Generate *n* lifecycle customer entities with latent traits. + + Customer start dates are sampled uniformly in ``[acq_start, obs_date)``, + realising the staggered-start design (D4). The nullable ``opportunity_id`` + is left ``None`` (independent generation, D3); it exists as the seam for + future chained generation from a lead-scoring bundle. + """ + acq_span_days = (obs_date - acq_start).days + csm_ids = [make_id("rep", i) for i in range(1, _N_CSMS + 1)] + + rows: list[CustomerLifecycleRow] = [] + latents: dict[str, dict[str, float]] = {} + + for i in range(1, n + 1): + cust_id = make_id(ID_PREFIXES["customer"], i) + account = rng.choice(accounts) + + # Staggered start date: uniform within the acquisition window. 
+ days_offset = rng.randint(0, max(acq_span_days - 1, 0)) + start = acq_start + timedelta(days=days_offset) + + plan = rng.choice(_PLAN_BY_EMPLOYEE_BAND.get(account.employee_band, ("growth",))) + mrr_lo, mrr_hi = _MRR_RANGE_BY_PLAN[plan] + initial_mrr = rng.randint(mrr_lo, mrr_hi) + contract_months = rng.choices(_CONTRACT_TERMS_MONTHS, weights=_CONTRACT_TERM_WEIGHTS, k=1)[ + 0 + ] + csm_rep = rng.choice(csm_ids) + + rows.append( + CustomerLifecycleRow( + customer_id=cust_id, + account_id=account.account_id, + customer_start_at=start.isoformat(), + initial_plan=plan, + initial_mrr=initial_mrr, + contract_term_months=contract_months, + csm_rep_id=csm_rep, + opportunity_id=None, # seam for future chaining (D3) + ) + ) + latents[cust_id] = { + "latent_product_fit": _sample_latent(rng, 0.50 + bias.get("latent_product_fit", 0.0)), + "latent_adoption_velocity": _sample_latent( + rng, 0.50 + bias.get("latent_adoption_velocity", 0.0) + ), + "latent_budget_stability": _sample_latent( + rng, 0.50 + bias.get("latent_budget_stability", 0.0) + ), + "latent_champion_strength": _sample_latent( + rng, 0.50 + bias.get("latent_champion_strength", 0.0) + ), + "latent_organizational_stability": _sample_latent( + rng, 0.50 + bias.get("latent_organizational_stability", 0.0) + ), + } + + return rows, latents + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +def _sample_latent(rng: random.Random, mean: float = 0.50, std: float = 0.20) -> float: + """Draw a latent trait value in [0, 1] from a clipped Gaussian.""" + mean = max(0.10, min(0.90, mean)) + return max(0.0, min(1.0, rng.gauss(mean, std))) diff --git a/tests/schemes/lifecycle/test_population.py b/tests/schemes/lifecycle/test_population.py new file mode 100644 index 0000000..88e6bb6 --- /dev/null +++ b/tests/schemes/lifecycle/test_population.py @@ -0,0 +1,263 @@ +"""Tests for the lifecycle customer 
population builder (LTV-Ph).""" + +from datetime import date, timedelta + +import pytest + +from leadforge.schemes.lifecycle.population import ( + LIFECYCLE_MOTIF_FAMILIES, + CustomerLatentState, + CustomerPopulationResult, + build_customer_population, +) + +_N_CUSTOMERS = 120 +_N_ACCOUNTS = 40 +_SEED = 7 + + +# --------------------------------------------------------------------------- +# Basic shape +# --------------------------------------------------------------------------- + + +def test_returns_expected_types() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED) + assert isinstance(result, CustomerPopulationResult) + assert isinstance(result.latent_state, CustomerLatentState) + + +def test_account_and_customer_counts() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED, n_accounts=_N_ACCOUNTS) + assert len(result.accounts) == _N_ACCOUNTS + assert len(result.customers) == _N_CUSTOMERS + + +def test_default_n_accounts_is_n_customers_over_three() -> None: + result = build_customer_population(90, _SEED) + assert len(result.accounts) == 30 + + +def test_default_n_accounts_minimum_one() -> None: + result = build_customer_population(1, _SEED) + assert len(result.accounts) == 1 + + +def test_observation_date_recorded() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED, observation_date="2025-06-01") + assert result.observation_date == "2025-06-01" + + +def test_observation_date_default_format() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED) + # Must be a valid ISO-8601 date string. 
+ parsed = date.fromisoformat(result.observation_date) + assert isinstance(parsed, date) + + +# --------------------------------------------------------------------------- +# Determinism +# --------------------------------------------------------------------------- + + +def test_deterministic_under_same_seed() -> None: + a = build_customer_population(_N_CUSTOMERS, _SEED, n_accounts=_N_ACCOUNTS) + b = build_customer_population(_N_CUSTOMERS, _SEED, n_accounts=_N_ACCOUNTS) + assert [c.customer_id for c in a.customers] == [c.customer_id for c in b.customers] + assert [c.customer_start_at for c in a.customers] == [c.customer_start_at for c in b.customers] + assert a.latent_state.customer_latents == b.latent_state.customer_latents + + +def test_different_seeds_produce_different_results() -> None: + a = build_customer_population(_N_CUSTOMERS, seed=1) + b = build_customer_population(_N_CUSTOMERS, seed=2) + assert [c.customer_start_at for c in a.customers] != [c.customer_start_at for c in b.customers] + + +# --------------------------------------------------------------------------- +# FK integrity +# --------------------------------------------------------------------------- + + +def test_customer_account_fk() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED, n_accounts=_N_ACCOUNTS) + account_ids = {a.account_id for a in result.accounts} + for cust in result.customers: + assert cust.account_id in account_ids, ( + f"customer {cust.customer_id} references unknown account {cust.account_id}" + ) + + +def test_latent_state_covers_all_customers() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED, n_accounts=_N_ACCOUNTS) + cust_ids = {c.customer_id for c in result.customers} + assert set(result.latent_state.customer_latents.keys()) == cust_ids + + +def test_latent_state_covers_all_accounts() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED, n_accounts=_N_ACCOUNTS) + acct_ids = {a.account_id for a in result.accounts} + assert 
set(result.latent_state.account_latents.keys()) == acct_ids + + +# --------------------------------------------------------------------------- +# Staggered start dates + acquisition-window boundary +# --------------------------------------------------------------------------- + + +def test_all_starts_before_observation_date() -> None: + obs = "2025-06-01" + obs_date = date.fromisoformat(obs) + result = build_customer_population(_N_CUSTOMERS, _SEED, observation_date=obs) + for cust in result.customers: + start = date.fromisoformat(cust.customer_start_at) + assert start < obs_date, ( + f"customer {cust.customer_id} starts on or after observation date: {start}" + ) + + +def test_all_starts_within_acquisition_window() -> None: + obs = "2025-06-01" + obs_date = date.fromisoformat(obs) + acq_weeks = 26 + acq_start = obs_date - timedelta(weeks=acq_weeks) + result = build_customer_population( + _N_CUSTOMERS, _SEED, observation_date=obs, acquisition_window_weeks=acq_weeks + ) + for cust in result.customers: + start = date.fromisoformat(cust.customer_start_at) + assert start >= acq_start, ( + f"customer {cust.customer_id} starts before acquisition window: {start}" + ) + + +def test_start_dates_span_the_window() -> None: + """With enough customers, start dates should cover both early and late in window.""" + obs = "2025-06-01" + obs_date = date.fromisoformat(obs) + acq_weeks = 52 + acq_start = obs_date - timedelta(weeks=acq_weeks) + mid_point = acq_start + timedelta(weeks=acq_weeks // 2) + + result = build_customer_population( + 200, _SEED, observation_date=obs, acquisition_window_weeks=acq_weeks + ) + starts = [date.fromisoformat(c.customer_start_at) for c in result.customers] + early = sum(1 for s in starts if s < mid_point) + late = sum(1 for s in starts if s >= mid_point) + # With 200 customers and a uniform distribution, both halves should have + # at least 25% of customers (expected ~50% each, allow wide tolerance). 
+ assert early >= 25, f"too few early-cohort customers: {early}" + assert late >= 25, f"too few late-cohort customers: {late}" + + +def test_opportunity_id_is_none_independent_generation() -> None: + """D3: independent generation leaves the chaining seam empty.""" + result = build_customer_population(_N_CUSTOMERS, _SEED) + assert all(c.opportunity_id is None for c in result.customers) + + +# --------------------------------------------------------------------------- +# Latent distributions +# --------------------------------------------------------------------------- + + +def test_customer_latents_in_unit_interval() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED) + for cust_id, traits in result.latent_state.customer_latents.items(): + for trait, val in traits.items(): + assert 0.0 <= val <= 1.0, f"customer {cust_id} trait {trait}={val} outside [0,1]" + + +def test_customer_latents_has_five_traits() -> None: + expected = { + "latent_product_fit", + "latent_adoption_velocity", + "latent_budget_stability", + "latent_champion_strength", + "latent_organizational_stability", + } + result = build_customer_population(_N_CUSTOMERS, _SEED) + for cust_id, traits in result.latent_state.customer_latents.items(): + assert set(traits.keys()) == expected, ( + f"customer {cust_id} has unexpected traits: {set(traits.keys())}" + ) + + +def test_account_latents_in_unit_interval() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED) + for acct_id, traits in result.latent_state.account_latents.items(): + for trait, val in traits.items(): + assert 0.0 <= val <= 1.0, f"account {acct_id} trait {trait}={val} outside [0,1]" + + +# --------------------------------------------------------------------------- +# Motif families +# --------------------------------------------------------------------------- + + +def test_all_motif_families_registered() -> None: + expected = { + "product_led_retention", + "relationship_led_retention", + "expansion_led_growth", + 
"payment_fragile", + "churner_dominated", + } + assert set(LIFECYCLE_MOTIF_FAMILIES) == expected + + +@pytest.mark.parametrize("motif", LIFECYCLE_MOTIF_FAMILIES) +def test_each_motif_family_runs(motif: str) -> None: + result = build_customer_population(30, _SEED, motif_family=motif) + assert len(result.customers) == 30 + + +def test_motif_bias_shifts_latent_means() -> None: + """product_led_retention biases latent_product_fit upward; churner_dominated downward.""" + n = 400 + plt_result = build_customer_population(n, _SEED, motif_family="product_led_retention") + churn_result = build_customer_population(n, _SEED, motif_family="churner_dominated") + + def mean_fit(r: CustomerPopulationResult) -> float: + vals = [traits["latent_product_fit"] for traits in r.latent_state.customer_latents.values()] + return sum(vals) / len(vals) + + assert mean_fit(plt_result) > mean_fit(churn_result), ( + "product_led_retention should have higher mean latent_product_fit than churner_dominated" + ) + + +def test_unknown_motif_family_raises() -> None: + with pytest.raises(ValueError, match="Unknown lifecycle motif family"): + build_customer_population(10, _SEED, motif_family="does_not_exist") + + +# --------------------------------------------------------------------------- +# Entity field values +# --------------------------------------------------------------------------- + + +def test_customer_fields_populated() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED) + for cust in result.customers: + assert cust.customer_id.startswith("cust_") + assert cust.account_id.startswith("acct_") + assert cust.initial_plan in ("starter", "growth", "enterprise") + assert cust.initial_mrr > 0 + assert cust.contract_term_months in (12, 24) + assert cust.csm_rep_id.startswith("rep_") + + +def test_account_fields_populated() -> None: + result = build_customer_population(_N_CUSTOMERS, _SEED) + for acct in result.accounts: + assert acct.account_id.startswith("acct_") + assert 
acct.industry in ( + "manufacturing", + "logistics", + "professional_services", + "healthcare_non_clinical", + ) + assert acct.region in ("US", "UK") + assert acct.employee_band in ("200-499", "500-999", "1000-1999", "2000+") From 0ab0e2971fe0988ab83f0b30cfd9a50c9a5750a6 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 11 Jun 2026 20:20:04 +0300 Subject: [PATCH 2/5] docs(ltv): record LTV-Ph (#113) in roadmap + agent-plan [LTV-Ph] Co-Authored-By: Claude Sonnet 4.6 --- .agent-plan.md | 2 +- docs/ltv/roadmap.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index 0a3b2f3..a9ac0cb 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -53,7 +53,7 @@ ALL_CONSTRAINTS/LEAD_SNAPSHOT_FEATURES/CONVERTED_WITHIN_90_DAYS moved to `schemes/lead_scoring/`; shared primitives stay in `schema/`) opened as **#112**. All M2 moves byte-identical. Sibling `leadforge-datasets-private` consumes bundle files, not internals — no lockstep update needed (heads-up issue #8). Next: `LTV-Pg.2` merged (#112). **LTV-M3** started: `LTV-Ph` (lifecycle customer -population builder) opened as **#NNN**. Next: `LTV-Pi` (lifecycle motif +population builder) opened as **#113**. Next: `LTV-Pi` (lifecycle motif families + mechanism policies). 
--- diff --git a/docs/ltv/roadmap.md b/docs/ltv/roadmap.md index 2a93bb4..a9eb887 100644 --- a/docs/ltv/roadmap.md +++ b/docs/ltv/roadmap.md @@ -43,7 +43,7 @@ protocol + registry, with the package physically reorganized into | `LTV-M0` | Planning + design lock | `LTV-Pa` | #102, #103 (+ scheme reframe) | | `LTV-M1` | Lifecycle schema foundation | `LTV-Pb`, `LTV-Pc` | #104 (Pb) | | `LTV-M2` | Generation-scheme architecture + physical reorg | `LTV-Pd`, `LTV-Pe`, `LTV-Pf`, `LTV-Pg` | #107 (Pd), #108 (Pe), #109 (Pf.1), #110 (Pf.2), #111 (Pg.1), #112 (Pg.2) | -| `LTV-M3` | Customer population + lifecycle world | `LTV-Ph`, `LTV-Pi` | #NNN (Ph) | +| `LTV-M3` | Customer population + lifecycle world | `LTV-Ph`, `LTV-Pi` | #113 (Ph) | | `LTV-M4` | Lifecycle simulation engine | `LTV-Pj`, `LTV-Pk` | | | `LTV-M5` | Customer snapshots + pLTV targets (both regimes) | `LTV-Pl`, `LTV-Pm` | | | `LTV-M6` | Register LifecycleScheme + recipe + manifest/version | `LTV-Pn`, `LTV-Po` | | @@ -157,7 +157,7 @@ Total: ~19 PRs across 9 milestones. > Built directly under `schemes/lifecycle/`. -- [ ] **`LTV-Ph`** — `feat(lifecycle): customer population builder` (**PR #NNN**). Customer +- [ ] **`LTV-Ph`** — `feat(lifecycle): customer population builder` (**PR #113**). Customer entities, 5 new latent traits, **staggered start dates** ending at the absolute `observation_date` (D4); seam for future chained generation (D3). - Tests: determinism, latent distributions, staggered-start spread, FK From 67f22df198740fa4ef01684e80d491d4846becb7 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 11 Jun 2026 20:28:35 +0300 Subject: [PATCH 3/5] feat(lifecycle): fix population builder (self-review) [LTV-Ph] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four issues found by hostile self-review of the initial commit: 1. 
Account latents used lead-scoring trait names (latent_account_fit, latent_budget_readiness, latent_process_maturity) that the lifecycle simulation engine will never query. The engine queries latent_budget_stability and latent_organizational_stability at the account level. All three lead-scoring keys replaced; motif-family bias now wires correctly through the account latents (previously bias.get calls on account latents always returned 0.0 because no lifecycle motif key matched any account latent key — the motif had zero effect on account generation). Regression test added. 2. CSM rep IDs used make_id("rep", ...) bypassing ID_PREFIXES["rep"]. The rule in core/ids.py is to always go through the registry. Fixed. 3. motif_family was a positional parameter — silently passing a wrong third argument (e.g. n_accounts as an int) produces a confusing ValueError at runtime rather than a TypeError at the call site. Made keyword-only. Regression test added (inspect.signature check). 4. The default observation_date formula used a bare `+ 4` weeks buffer with no explanation. Extracted to _OBS_DATE_BUFFER_WEEKS constant with a comment explaining its purpose (gives earliest-acquired customers a small subscription history before the snapshot). Comment accuracy fix: "shared with the lead-scoring account generator" → "mirrors the distribution of" (the code is parallel, not shared; cross-scheme import would create an awkward dependency). Full suite 1567 passed / 51 skipped; ruff + mypy clean. 
Co-Authored-By: Claude Sonnet 4.6 --- leadforge/schemes/lifecycle/population.py | 47 +++++++++++++++++----- tests/schemes/lifecycle/test_population.py | 31 ++++++++++++++ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/leadforge/schemes/lifecycle/population.py b/leadforge/schemes/lifecycle/population.py index d8671a6..6dfd9f1 100644 --- a/leadforge/schemes/lifecycle/population.py +++ b/leadforge/schemes/lifecycle/population.py @@ -68,8 +68,11 @@ class CustomerPopulationResult: # Internal constants # --------------------------------------------------------------------------- -# Shared with the lead-scoring account generator so account firmographics are -# drawn from the same distribution (accounts are the same entity in both worlds). +# Mirrors the lead-scoring account firmographic distribution (accounts are the +# same entity in both worlds). The code is intentionally parallel rather than +# shared — the lifecycle simulation path is distinct and a cross-scheme import +# would create an awkward dependency. Latent trait *names* differ: the lifecycle +# account generator emits lifecycle-specific keys, not lead-scoring keys. _EMPLOYEE_BANDS = ("200-499", "500-999", "1000-1999", "2000+") _EMPLOYEE_BAND_WEIGHTS = (0.35, 0.35, 0.20, 0.10) @@ -119,6 +122,11 @@ class CustomerPopulationResult: # Default acquisition window in weeks before the observation date. _DEFAULT_ACQUISITION_WINDOW_WEEKS = 52 +# Extra buffer weeks between the end of the acquisition window and the +# observation date. Gives the earliest-acquired customers a small amount of +# subscription history before the snapshot, avoiding a hard edge at day 0. +_OBS_DATE_BUFFER_WEEKS = 4 + # Motif-family-specific additive bias on the 0.50 latent mean. # Five retention motif families (see docs/ltv/design.md §6.1). 
_MOTIF_LATENT_BIAS: dict[str, dict[str, float]] = { @@ -155,8 +163,8 @@ class CustomerPopulationResult: def build_customer_population( n_customers: int, seed: int, - motif_family: str = "product_led_retention", *, + motif_family: str = "product_led_retention", n_accounts: int | None = None, observation_date: str | None = None, acquisition_window_weeks: int = _DEFAULT_ACQUISITION_WINDOW_WEEKS, @@ -203,7 +211,9 @@ def build_customer_population( obs_date: date if observation_date is None: - obs_date = _WORLD_BASE_DATE + timedelta(weeks=acquisition_window_weeks + 4) + obs_date = _WORLD_BASE_DATE + timedelta( + weeks=acquisition_window_weeks + _OBS_DATE_BUFFER_WEEKS + ) else: obs_date = date.fromisoformat(observation_date) @@ -248,7 +258,15 @@ def _generate_accounts( bias: dict[str, float], rng: random.Random, ) -> tuple[list[AccountRow], dict[str, dict[str, float]]]: - """Generate *n* account entities with latent traits.""" + """Generate *n* account entities with lifecycle-relevant latent traits. + + Account firmographics mirror the lead-scoring distribution (same ICP + industries, employee bands, etc.) for future cohort-linking coherence. + The latent trait names are lifecycle-specific — the simulation engine + queries ``latent_budget_stability`` and ``latent_organizational_stability`` + at the account level; lead-scoring names (``latent_account_fit``, + ``latent_budget_readiness``) are not emitted here. + """ rows: list[AccountRow] = [] latents: dict[str, dict[str, float]] = {} @@ -276,13 +294,20 @@ def _generate_accounts( created_at=created_at, ) ) + # Account-level lifecycle latents. latent_budget_stability is drawn around + # the 0.50 base mean plus the motif-family bias, which shifts the + # distribution for the whole world (no revenue-band term is applied). + # latent_organizational_stability is seeded by the process-maturity band + # mean plus its motif bias — higher maturity → more stable accounts.
latents[acct_id] = { - "latent_account_fit": _sample_latent(rng, 0.50 + bias.get("latent_account_fit", 0.0)), - "latent_budget_readiness": _sample_latent( - rng, 0.50 + bias.get("latent_budget_readiness", 0.0) + "latent_budget_stability": _sample_latent( + rng, 0.50 + bias.get("latent_budget_stability", 0.0) ), - "latent_process_maturity": _sample_latent( - rng, _PROCESS_MATURITY_MEANS[maturity_band], std=0.15 + "latent_organizational_stability": _sample_latent( + rng, + _PROCESS_MATURITY_MEANS[maturity_band] + + bias.get("latent_organizational_stability", 0.0), + std=0.15, ), } @@ -310,7 +335,7 @@ def _generate_customers( future chained generation from a lead-scoring bundle. """ acq_span_days = (obs_date - acq_start).days - csm_ids = [make_id("rep", i) for i in range(1, _N_CSMS + 1)] + csm_ids = [make_id(ID_PREFIXES["rep"], i) for i in range(1, _N_CSMS + 1)] rows: list[CustomerLifecycleRow] = [] latents: dict[str, dict[str, float]] = {} diff --git a/tests/schemes/lifecycle/test_population.py b/tests/schemes/lifecycle/test_population.py index 88e6bb6..c6cdf14 100644 --- a/tests/schemes/lifecycle/test_population.py +++ b/tests/schemes/lifecycle/test_population.py @@ -191,6 +191,37 @@ def test_account_latents_in_unit_interval() -> None: assert 0.0 <= val <= 1.0, f"account {acct_id} trait {trait}={val} outside [0,1]" +def test_account_latents_use_lifecycle_not_lead_scoring_keys() -> None: + # Regression: account latents must use lifecycle trait names (queried by the + # lifecycle simulation engine), NOT lead-scoring names (latent_account_fit, + # latent_budget_readiness, latent_process_maturity) which the engine never reads. 
+ expected = {"latent_budget_stability", "latent_organizational_stability"} + lead_scoring_names = { + "latent_account_fit", + "latent_budget_readiness", + "latent_process_maturity", + } + result = build_customer_population(_N_CUSTOMERS, _SEED) + for acct_id, traits in result.latent_state.account_latents.items(): + assert set(traits.keys()) == expected, ( + f"account {acct_id} has wrong latent keys: {set(traits.keys())}" + ) + assert not (set(traits.keys()) & lead_scoring_names), ( + f"account {acct_id} contains lead-scoring trait names: " + f"{set(traits.keys()) & lead_scoring_names}" + ) + + +def test_motif_family_must_be_keyword_only() -> None: + import inspect + + sig = inspect.signature(build_customer_population) + p = sig.parameters["motif_family"] + assert p.kind == inspect.Parameter.KEYWORD_ONLY, ( + "motif_family must be keyword-only to prevent silent positional misuse" + ) + + # --------------------------------------------------------------------------- # Motif families # --------------------------------------------------------------------------- From 262eb4734431375161dd104423c642159ee05795 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 11 Jun 2026 21:22:09 +0300 Subject: [PATCH 4/5] style: reformat test_module_layout.py (CI ruff format) --- tests/schemes/test_module_layout.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/schemes/test_module_layout.py b/tests/schemes/test_module_layout.py index aee1c44..2f08e31 100644 --- a/tests/schemes/test_module_layout.py +++ b/tests/schemes/test_module_layout.py @@ -27,7 +27,6 @@ "leadforge.schemes.lead_scoring.render.relational_snapshot_safe", ), ("leadforge.render.tasks", "leadforge.schemes.lead_scoring.render.tasks"), - ] From 86548f1f4bb3330155d1f2f72177837791819a94 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 11 Jun 2026 21:29:52 +0300 Subject: [PATCH 5/5] fix(lifecycle): address copilot review comments on population builder [LTV-Ph] MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Implements three accepted recommendations from the copilot PR review: COPILOT-1 (module docstring misleading about RNG independence): Reworded to clarify that each substream handles both entity creation AND latent draws for its entity type (accounts-substream = account rows + account latents; customers-substream = customer rows + customer latents). The independence is between account generation and customer generation, not between "population" and "latent" draws. COPILOT-2 (docstring says "1 year" but default is 56 weeks): Updated observation_date docstring to show the actual formula (_WORLD_BASE_DATE + (acquisition_window_weeks + _OBS_DATE_BUFFER_WEEKS) weeks) and note the concrete value with built-in defaults (56 weeks ≈ 13 months). COPILOT-3 (missing input validation): Added ValueError guards for n_customers < 1, n_accounts < 1 (when explicit), and acquisition_window_weeks < 1 (0 would make every start == obs_date, violating the < obs_date boundary invariant). Pattern mirrors core/models.py::_require_positive_int. Four new validation tests added. COPILOT-4 (ID_PREFIXES["rep"] vs hardcoded string): Already fixed in the self-review commit (67f22df). Resolved as already treated. Co-Authored-By: Claude Sonnet 4.6 --- leadforge/schemes/lifecycle/population.py | 34 +++++++++++++++++----- tests/schemes/lifecycle/test_population.py | 27 +++++++++++++++++ 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/leadforge/schemes/lifecycle/population.py b/leadforge/schemes/lifecycle/population.py index 6dfd9f1..45e8432 100644 --- a/leadforge/schemes/lifecycle/population.py +++ b/leadforge/schemes/lifecycle/population.py @@ -20,10 +20,12 @@ RNG substreams -------------- All randomness derives from two named :class:`~leadforge.core.rng.RNGRoot` -substreams so population and latent draws are independently stable: +substreams. 
Each substream handles both entity creation and latent draws for +its entity type, so the two generation steps are independently stable — changes +to account generation do not affect customer IDs or latents, and vice versa: -- ``lifecycle_population_accounts`` — account entity generation. -- ``lifecycle_population_customers`` — customer entity + latent generation. +- ``lifecycle_population_accounts`` — account entity rows **and** account latents. +- ``lifecycle_population_customers`` — customer entity rows **and** customer latents. """ from __future__ import annotations @@ -187,10 +189,12 @@ def build_customer_population( account, reflecting enterprise B2B upsell / multi-product. observation_date: ISO-8601 date at which the snapshot and labels are anchored (the "as-of" date for the pLTV model). Defaults to - ``_WORLD_BASE_DATE + 1 year``. + ``_WORLD_BASE_DATE + (acquisition_window_weeks + + _OBS_DATE_BUFFER_WEEKS) weeks`` — with the built-in defaults that + is 56 weeks (≈ 13 months) after the world base date. acquisition_window_weeks: Width of the customer acquisition window - (weeks before ``observation_date``). Customer start dates are - sampled uniformly within this window, producing the tenure + (weeks before ``observation_date``). Must be ≥ 1. Customer start + dates are sampled uniformly within this window, producing the tenure variation needed for a realistic cold-start subpopulation. Returns: @@ -198,8 +202,24 @@ def build_customer_population( customer list, and latent state. Raises: - ValueError: if ``motif_family`` is not one of the registered families. + ValueError: if ``motif_family`` is not one of the registered families, + or if ``n_customers``, ``n_accounts`` (when provided), or + ``acquisition_window_weeks`` are not positive integers. 
""" + if not isinstance(n_customers, int) or isinstance(n_customers, bool) or n_customers < 1: + raise ValueError(f"n_customers must be a positive int, got {n_customers!r}") + if n_accounts is not None and ( + not isinstance(n_accounts, int) or isinstance(n_accounts, bool) or n_accounts < 1 + ): + raise ValueError(f"n_accounts must be a positive int or None, got {n_accounts!r}") + if ( + not isinstance(acquisition_window_weeks, int) + or isinstance(acquisition_window_weeks, bool) + or acquisition_window_weeks < 1 + ): + raise ValueError( + f"acquisition_window_weeks must be a positive int, got {acquisition_window_weeks!r}" + ) if motif_family not in _MOTIF_LATENT_BIAS: raise ValueError( f"Unknown lifecycle motif family {motif_family!r}. " diff --git a/tests/schemes/lifecycle/test_population.py b/tests/schemes/lifecycle/test_population.py index c6cdf14..e00ad6b 100644 --- a/tests/schemes/lifecycle/test_population.py +++ b/tests/schemes/lifecycle/test_population.py @@ -264,6 +264,33 @@ def test_unknown_motif_family_raises() -> None: build_customer_population(10, _SEED, motif_family="does_not_exist") +# --------------------------------------------------------------------------- +# Input validation (COPILOT-3) +# --------------------------------------------------------------------------- + + +def test_zero_n_customers_raises() -> None: + with pytest.raises(ValueError, match="n_customers"): + build_customer_population(0, _SEED) + + +def test_negative_n_customers_raises() -> None: + with pytest.raises(ValueError, match="n_customers"): + build_customer_population(-1, _SEED) + + +def test_zero_n_accounts_explicit_raises() -> None: + with pytest.raises(ValueError, match="n_accounts"): + build_customer_population(10, _SEED, n_accounts=0) + + +def test_zero_acquisition_window_raises() -> None: + # acquisition_window_weeks=0 would make every start == obs_date, + # violating the < obs_date boundary invariant. 
+ with pytest.raises(ValueError, match="acquisition_window_weeks"): + build_customer_population(10, _SEED, acquisition_window_weeks=0) + + # --------------------------------------------------------------------------- # Entity field values # ---------------------------------------------------------------------------