Merge branch 'output-formatting' into om.Constraints

hmgaudecker · hmgaudecker · commit c1256f9a8c76 · 2026-03-16T17:34:54.000+01:00
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -174,8 +174,7 @@ The codebase uses:
 **Simulation:**
 
 - `simulate_dataset(model_spec, params, n_obs=None, data=None, policies=None, seed=None)`
-  — returns dict with `"unanchored_states"`, `"anchored_states"`,
-  `"aug_unanchored_states"`, `"aug_measurements"`
+  — returns dict with `"unanchored_states"`, `"anchored_states"`
 - `simulate_policy_effect(model_spec, params, data, policies, seed=None)` — returns
   DataFrame of factor mean differences between policy and baseline
 
@@ -184,7 +183,7 @@ The codebase uses:
 - `plot_likelihood_contributions(model_spec, data, params, period=None)`
 - `plot_residual_boxplots(model_spec, data, params, period=None)`
 - `decompose_measurement_variance(model_spec, params, data)` — returns DataFrame indexed
-  by `(aug_period, measurement, factor)` with signal/noise columns
+  by `(period, measurement, factor)` with signal/noise columns
 - `summarize_measurement_reliability(variance_decomposition)`
 - `create_state_ranges(filtered_states, factors, quantile_cutoff=None)`
 
@@ -237,6 +236,27 @@ These are not in `__all__` but are imported directly by application projects:
 - `ensure_containers_are_immutable()` recursively converts dict→MappingProxyType,
   list→tuple, set→frozenset
 
+### Period vs Aug_period
+
+Models with endogenous factors split each calendar period into multiple **augmented
+periods** (`aug_period`). The public API uses `period` (user-facing); `aug_period` is
+strictly internal. All public functions now return `period`:
+
+- `ModelSpec` — clean, no `aug_period` exposure.
+- `get_transition_plots()` — clean, accepts `period`/`periods`.
+- `get_filtered_states()` — clean, returns `period` column.
+- `simulate_dataset()` — clean, returns `period` in states DataFrames.
+- `plot_residual_boxplots()` / `plot_likelihood_contributions()` — clean, accept and
+  return `period`.
+- `decompose_measurement_variance()` — clean, indexed by
+  `(period, measurement, factor)`.
+- `simulate_policy_effect()` / `simulate_dataset()` policies — accept `"period"` key.
+- `ProcessedModel.labels` — exposes `aug_periods_to_periods` mapping (acceptable for
+  internal/advanced use).
+
+When writing new public-facing code, always accept and return `period`. Convert to
+`aug_period` internally by inverting `ProcessedModel.labels.aug_periods_to_periods`.
+
 ## Testing
 
 - pytest with markers: `wip`, `unit`, `integration`, `end_to_end`
diff --git a/src/skillmodels/diagnostic_plots.py b/src/skillmodels/diagnostic_plots.py
@@ -61,19 +61,25 @@ def plot_residual_boxplots(
         how="left",
     )
 
+    # Map aug_period → period for the public API
+    ap_to_p = processed_model.labels.aug_periods_to_periods
+
     available_periods = sorted(residuals_df[period_col].unique())
 
     if period is not None:
+        # Find aug_period(s) matching the requested period
+        aug_periods_for_period = [ap for ap, p in ap_to_p.items() if p == period]
+        aug_period = aug_periods_for_period[0] if aug_periods_for_period else period
         return _create_residual_boxplot_for_period(
             residuals_df=residuals_df,
-            period=period,
+            period=aug_period,
             period_col=period_col,
             show_reference_line=show_reference_line,
             layout_kwargs=layout_kwargs,
         )
 
     return {
-        p: _create_residual_boxplot_for_period(
+        ap_to_p.get(p, p): _create_residual_boxplot_for_period(
             residuals_df=residuals_df,
             period=p,
             period_col=period_col,
@@ -173,18 +179,24 @@ def plot_likelihood_contributions(
         how="left",
     )
 
+    # Map aug_period → period for the public API
+    ap_to_p = processed_model.labels.aug_periods_to_periods
+
     available_periods = sorted(contributions_df[period_col].unique())
 
     if period is not None:
+        # Find aug_period(s) matching the requested period
+        aug_periods_for_period = [ap for ap, p in ap_to_p.items() if p == period]
+        aug_period = aug_periods_for_period[0] if aug_periods_for_period else period
         return _create_likelihood_boxplot_for_period(
             contributions_df=contributions_df,
-            period=period,
+            period=aug_period,
             period_col=period_col,
             layout_kwargs=layout_kwargs,
         )
 
     return {
-        p: _create_likelihood_boxplot_for_period(
+        ap_to_p.get(p, p): _create_likelihood_boxplot_for_period(
             contributions_df=contributions_df,
             period=p,
             period_col=period_col,
diff --git a/src/skillmodels/filtered_states.py b/src/skillmodels/filtered_states.py
@@ -35,10 +35,20 @@ def get_filtered_states(
         use_aug_period=True,
     )
 
+    # Map aug_period → period for the public API
+    ap_to_p = processed_model.labels.aug_periods_to_periods
+    for df in (anchored_states_df, unanchored_states_df):
+        df["period"] = df["aug_period"].map(ap_to_p)
+        df.drop(columns="aug_period", inplace=True)  # noqa: PD002
+
     anchored_ranges = create_state_ranges(
         filtered_states=anchored_states_df,
         factors=processed_model.labels.latent_factors,
     )
+    unanchored_ranges = create_state_ranges(
+        filtered_states=unanchored_states_df,
+        factors=processed_model.labels.latent_factors,
+    )
 
     return {
         "anchored_states": {
diff --git a/src/skillmodels/process_debug_data.py b/src/skillmodels/process_debug_data.py
@@ -182,8 +182,8 @@ def create_state_ranges(
     """Compute minimum and maximum state values for each factor by period.
 
     Args:
-        filtered_states: DataFrame with filtered states. Must have a "period" or
-            "aug_period" column.
+        filtered_states: DataFrame with filtered states. Must have a "period"
+            column.
         factors: List of factor names to compute ranges for.
         quantile_cutoff: If provided, use quantiles instead of min/max. The cutoff
             is applied symmetrically: the minimum is the `quantile_cutoff` quantile
diff --git a/src/skillmodels/simulate_data.py b/src/skillmodels/simulate_data.py
@@ -44,8 +44,8 @@ def simulate_dataset(
         data: Dataset in the same format as for estimation, containing
             information about observed factors and control variables.
         policies: Each dictionary specifies a stochastic shock to a latent factor
-            AT THE END of "period" for "factor" with mean "effect_size" and
-            "standard deviation".
+            AT THE END of ``"period"`` for ``"factor"`` with mean
+            ``"effect_size"`` and ``"standard_deviation"``.
         seed: Random seed for reproducibility. If None, uses numpy's default random
             state.
 
@@ -125,7 +125,14 @@ def simulate_dataset(
         n_obs=n_obs,
     )
 
-    aug_measurements, aug_latent_data = _simulate_dataset(
+    # Convert "period" keys in policies to "aug_period" for internal use
+    if policies is not None:
+        policies = _convert_policy_periods(
+            policies=policies,
+            endogenous_factors_info=processed_model.endogenous_factors_info,
+        )
+
+    _aug_measurements, aug_latent_data = _simulate_dataset(
         latent_states=states,
         covs=covs,
         log_weights=log_weights,
@@ -173,14 +180,6 @@ def simulate_dataset(
                 factors=processed_model.labels.latent_factors,
             ),
         },
-        "aug_unanchored_states": {
-            "states": aug_latent_data,
-            "state_ranges": create_state_ranges(
-                filtered_states=aug_latent_data,
-                factors=processed_model.labels.latent_factors,
-            ),
-        },
-        "aug_measurements": aug_measurements,
     }
 
 
@@ -396,6 +395,31 @@ def _collapse_aug_periods_to_periods(
     )
 
 
+def _convert_policy_periods(
+    policies: list[dict],
+    endogenous_factors_info: EndogenousFactorsInfo,
+) -> list[dict]:
+    """Convert ``"period"`` keys in policy dicts to ``"aug_period"``.
+
+    ``"aug_period"`` (legacy) dicts pass through; ``"period"`` (public API) dicts
+    are copied with the period's first aug_period. Neither key: ``ValueError``.
+    """
+    converted = []
+    for policy in policies:
+        if "aug_period" in policy:
+            converted.append(policy)
+        elif "period" in policy:
+            p = dict(policy)
+            period = p.pop("period")
+            aug_periods = endogenous_factors_info.aug_periods_from_period(period)
+            # Use the first aug_period for the given period
+            p["aug_period"] = aug_periods[0]
+            converted.append(p)
+        else:
+            raise ValueError("Each policy dict must contain a 'period' key.")
+    return converted
+
+
 def _get_shock(
     rng: np.random.Generator,
     mean: float,
@@ -514,7 +538,7 @@ def simulate_policy_effect(
         data: Dataset with observed factors and control variables.
         policies: List of policy dictionaries. Each dictionary specifies a
             stochastic shock to a latent factor with keys:
-            - "period" or "aug_period": When to apply the shock
+            - "period": When to apply the shock
             - "factor": Which factor to shock
             - "effect_size": Mean of the shock
             - "standard_deviation": Standard deviation of the shock (use 0 for
@@ -564,8 +588,6 @@ def simulate_policy_effect(
     policy_means = policy_states.groupby("period").mean()
 
     # Drop non-factor columns
-    factor_cols = [
-        c for c in baseline_means.columns if c not in ("id", "aug_period", "period")
-    ]
+    factor_cols = [c for c in baseline_means.columns if c not in ("id", "period")]
 
     return policy_means[factor_cols] - baseline_means[factor_cols]
diff --git a/src/skillmodels/variance_decomposition.py b/src/skillmodels/variance_decomposition.py
@@ -5,10 +5,13 @@
 Section 4.2.2.
 """
 
+from collections.abc import Mapping
+
 import pandas as pd
 
 from skillmodels.filtered_states import get_filtered_states
 from skillmodels.model_spec import ModelSpec
+from skillmodels.process_model import process_model
 
 
 def decompose_measurement_variance(
@@ -34,7 +37,7 @@ def decompose_measurement_variance(
         data: Empirical dataset used to estimate the model.
 
     Returns:
-        DataFrame indexed by (aug_period, measurement, factor) with columns:
+        DataFrame indexed by (period, measurement, factor) with columns:
         - loading: The factor loading (L)
         - factor_variance: Var(F) for that period
         - meas_sd: The measurement error standard deviation
@@ -54,32 +57,50 @@ def decompose_measurement_variance(
     )
     filtered_states = filtered_result["anchored_states"]["states"]
 
+    processed_model = process_model(model_spec)
     return _compute_variance_decomposition(
         filtered_states=filtered_states,
         params=params,
+        aug_periods_to_periods=processed_model.labels.aug_periods_to_periods,
     )
 
 
 def _compute_variance_decomposition(
     filtered_states: pd.DataFrame,
     params: pd.DataFrame,
+    aug_periods_to_periods: Mapping[int, int],
 ) -> pd.DataFrame:
     """Compute variance decomposition from filtered states and parameters.
 
     Args:
         filtered_states: DataFrame with filtered states, must have columns for
-            each factor plus "aug_period" and "id".
+            each factor plus "period" and "id".
         params: DataFrame with model parameters indexed by
             (category, aug_period, name1, name2).
+        aug_periods_to_periods: Mapping from aug_period to period.
 
     Returns:
         DataFrame with variance decomposition results.
 
     """
+    # Build reverse mapping: period → aug_period (pick first aug_period per period)
+    periods_to_aug_periods = {}
+    for ap, p in aug_periods_to_periods.items():
+        if p not in periods_to_aug_periods:
+            periods_to_aug_periods[p] = ap
+
+    # Add aug_period column for internal merges with params
+    filtered_states = filtered_states.copy()
+    filtered_states["aug_period"] = filtered_states["period"].map(
+        periods_to_aug_periods
+    )
+
     # Compute factor variances by period
     periods = filtered_states["aug_period"].unique()
     factor_cols = [
-        c for c in filtered_states.columns if c not in ("aug_period", "id", "mixture")
+        c
+        for c in filtered_states.columns
+        if c not in ("aug_period", "period", "id", "mixture")
     ]
 
     factor_variances = {}
@@ -133,8 +154,11 @@ def _compute_variance_decomposition(
     merged["fraction_noise"] = noise_var / total_var
     merged["signal_to_noise_ratio"] = signal_var / noise_var
 
+    # Map aug_period → period for the public API
+    merged["period"] = merged["aug_period"].map(aug_periods_to_periods)
+
     # Set index and select columns
-    return merged.set_index(["aug_period", "measurement", "factor"])[
+    return merged.set_index(["period", "measurement", "factor"])[
         [
             "loading",
             "factor_variance",
diff --git a/src/skillmodels/visualize_transition_equations.py b/src/skillmodels/visualize_transition_equations.py
@@ -196,7 +196,7 @@ def get_transition_plots(  # noqa: C901, PLR0912
         layout_kwargs: Dictionary of key word arguments used to
             update layout of plotly image object. If None, the default kwargs
             defined in the function will be used.
-        states: Pre-computed filtered states DataFrame (with an `aug_period`
+        states: Pre-computed filtered states DataFrame (with a `period`
             column). If provided, skip the internal `get_filtered_states` call.
         include_correction_factors: Whether to include correction factors in the
             plots. Default False.
@@ -258,6 +258,9 @@ def get_transition_plots(  # noqa: C901, PLR0912
         states = get_filtered_states(model_spec=model_spec, data=data, params=params)[
             "anchored_states"
         ]["states"]
+
+    states = _normalize_states_columns(states)
+
     return _get_dictionary_with_plots(
         model=processed_model,
         data=data,
@@ -371,8 +374,6 @@ def _get_dictionary_with_plots(
     else:
         colors = colorscale
 
-    period_col = "aug_period" if "aug_period" in states_data.columns else "period"
-
     plots_dict = {}
     for output_factor, input_factor in itertools.product(latent_factors, all_factors):
         combined_data = _prepare_plot_data_for_factor_pair(
@@ -381,7 +382,6 @@ def _get_dictionary_with_plots(
             state_ranges=state_ranges,
             parsed_params=parsed_params,
             periods=periods,
-            period_col=period_col,
             input_factor=input_factor,
             output_factor=output_factor,
             all_factors=all_factors,
@@ -414,7 +414,6 @@ def _prepare_plot_data_for_factor_pair(
     state_ranges: dict[str, pd.DataFrame],
     parsed_params: ParsedParams,
     periods: list[int],
-    period_col: str,
     input_factor: str,
     output_factor: str,
     all_factors: tuple[str, ...],
@@ -438,7 +437,7 @@ def _prepare_plot_data_for_factor_pair(
         transition_params = {
             output_factor: parsed_params.transition[output_factor][aug_period]
         }
-        period_states = states_data[states_data[period_col] == aug_period]
+        period_states = states_data[states_data["aug_period"] == aug_period]
 
         plot_data = _prepare_single_period_plot_data(
             states_data=period_states,
@@ -645,6 +644,29 @@ def _get_states_data(
     return states_data
 
 
+def _normalize_states_columns(states: pd.DataFrame) -> pd.DataFrame:
+    """Ensure `aug_period` and `id` are columns, not index levels.
+
+    Downstream code expects `aug_period` as a column, but pre-computed states
+    may carry `period` (index level or column) instead. Index levels named
+    `period`, `aug_period` or `id` become columns; `period` is then renamed to
+    `aug_period` if that is absent. The values themselves are not remapped.
+    NOTE(review): presumably relies on period == aug_period — TODO confirm.
+    """
+    # Promote relevant index levels to columns.
+    names_to_reset = [
+        n for n in states.index.names if n in ("period", "aug_period", "id")
+    ]
+    if names_to_reset:
+        states = states.reset_index(level=names_to_reset)
+
+    # Rename period → aug_period when aug_period is missing.
+    if "aug_period" not in states.columns and "period" in states.columns:
+        states = states.rename(columns={"period": "aug_period"})
+
+    return states
+
+
 def _prepare_data_for_one_plot_fixed_quantile_2d(
     states_data: pd.DataFrame,
     state_ranges: dict[str, pd.DataFrame],
diff --git a/tests/test_variance_decomposition.py b/tests/test_variance_decomposition.py
diff --git a/tests/test_visualize_factor_distributions.py b/tests/test_visualize_factor_distributions.py