Validate replicate scale/rscales, combined_weights contract, fix df metadata

igerber · claude · igerber · commit 6c6494f908b8 · 2026-03-28T13:36:02.000-04:00
- Validate replicate_scale > 0 and replicate_rscales finite non-negative
  in SurveyDesign.__post_init__
- Validate combined_weights=True contract in resolve(): reject w_r > 0
  where w_full == 0 (malformed design)
- Fix CS IPW/DR path: pass survey df to safe_inference_batch
- Fix ContinuousDiD/EfficientDiD: don't propagate df=0 sentinel to
  survey_metadata (keep as None for display)
- Add TWFE, StackedDiD rejection tests + scale/rscales validation tests
- Update survey-roadmap.md: CS now has full survey support, accurate
  replicate limitation descriptions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/continuous_did.py b/diff_diff/continuous_did.py
@@ -531,8 +531,10 @@ def fit(
                     raw_w_unit = _unit_resolved.weights
                     survey_metadata = compute_survey_metadata(_unit_resolved, raw_w_unit)
 
-                # Propagate replicate df override to survey_metadata for display consistency
-                if _survey_df is not None and survey_metadata is not None:
+                # Propagate replicate df override to survey_metadata for display
+                # (but not the df=0 sentinel — keep metadata as None for undefined df)
+                if (_survey_df is not None and _survey_df != 0
+                        and survey_metadata is not None):
                     if survey_metadata.df_survey != _survey_df:
                         survey_metadata.df_survey = _survey_df
 
diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py
@@ -1081,7 +1081,9 @@ def _recompute_unit_survey_metadata(self, panel_metadata):
                 self._unit_resolved_survey.weights,
             )
             # Propagate effective replicate df if available
-            if self._survey_df is not None and meta.df_survey != self._survey_df:
+            # (but not the df=0 sentinel — keep metadata as None for undefined df)
+            if (self._survey_df is not None and self._survey_df != 0
+                    and meta.df_survey != self._survey_df):
                 meta.df_survey = self._survey_df
             return meta
         return panel_metadata
diff --git a/diff_diff/staggered.py b/diff_diff/staggered.py
@@ -1214,8 +1214,16 @@ def _compute_all_att_gt_covariate_reg(
 
         # Batch inference
         if task_keys:
+            # Use survey df for replicate designs (propagated from precomputed)
+            _ipw_dr_df = precomputed.get("df_survey") if precomputed is not None else None
+            # Guard: replicate design with undefined df → NaN inference
+            if (_ipw_dr_df is None and precomputed is not None
+                    and precomputed.get("resolved_survey_unit") is not None
+                    and hasattr(precomputed["resolved_survey_unit"], 'uses_replicate_variance')
+                    and precomputed["resolved_survey_unit"].uses_replicate_variance):
+                _ipw_dr_df = 0
             t_stats, p_values, ci_lowers, ci_uppers = safe_inference_batch(
-                np.array(atts), np.array(ses), alpha=self.alpha
+                np.array(atts), np.array(ses), alpha=self.alpha, df=_ipw_dr_df
             )
             for idx, key in enumerate(task_keys):
                 group_time_effects[key]["t_stat"] = float(t_stats[idx])
diff --git a/diff_diff/survey.py b/diff_diff/survey.py
@@ -131,13 +131,24 @@ def __post_init__(self):
                     f"replicate_strata length ({len(self.replicate_strata)}) must "
                     f"match replicate_weights length ({len(self.replicate_weights)})"
                 )
-        # Validate rscales length
+        # Validate scale/rscales values and length
+        if self.replicate_scale is not None:
+            if not (np.isfinite(self.replicate_scale) and self.replicate_scale > 0):
+                raise ValueError(
+                    f"replicate_scale must be a positive finite number, "
+                    f"got {self.replicate_scale}"
+                )
         if self.replicate_rscales is not None and self.replicate_weights is not None:
             if len(self.replicate_rscales) != len(self.replicate_weights):
                 raise ValueError(
                     f"replicate_rscales length ({len(self.replicate_rscales)}) must "
                     f"match replicate_weights length ({len(self.replicate_weights)})"
                 )
+            rscales_arr = np.asarray(self.replicate_rscales, dtype=float)
+            if not np.all(np.isfinite(rscales_arr)):
+                raise ValueError("replicate_rscales must be finite")
+            if np.any(rscales_arr < 0):
+                raise ValueError("replicate_rscales must be non-negative")
 
     def resolve(self, data: pd.DataFrame) -> "ResolvedSurveyDesign":
         """
@@ -214,6 +225,26 @@ def resolve(self, data: pd.DataFrame) -> "ResolvedSurveyDesign":
                 raise ValueError("Replicate weights contain Inf values")
             if np.any(rep_arr < 0):
                 raise ValueError("Replicate weights must be non-negative")
+            # Validate combined_weights contract: when True, replicate columns
+            # include the full-sample weight, so w_r > 0 with w_full == 0 is
+            # malformed (observation excluded from full sample but included in
+            # a replicate).
+            combined = (
+                self.combined_weights
+                if self.combined_weights is not None
+                else True
+            )
+            if combined:
+                zero_full = weights == 0
+                if np.any(zero_full):
+                    rep_positive_on_zero = np.any(rep_arr[zero_full] > 0, axis=1)
+                    if np.any(rep_positive_on_zero):
+                        raise ValueError(
+                            "Malformed combined_weights=True design: some "
+                            "replicate columns have positive weight where "
+                            "full-sample weight is zero. Either fix the "
+                            "replicate columns or use combined_weights=False."
+                        )
             # Do NOT normalize replicate columns — the IF path uses w_r/w_full
             # ratios that must reflect the true replicate design, not rescaled sums
             n_rep = rep_arr.shape[1]
diff --git a/docs/survey-roadmap.md b/docs/survey-roadmap.md
@@ -46,7 +46,7 @@ message pointing to the planned phase or describing the limitation.
 |-----------|------|----------------|-------|
 | ImputationDiD | `imputation.py` | Analytical | Weighted iterative FE, weighted ATT aggregation, weighted conservative variance (Theorem 3); bootstrap+survey deferred |
 | TwoStageDiD | `two_stage.py` | Analytical | Weighted iterative FE, weighted Stage 2 OLS, weighted GMM sandwich variance; bootstrap+survey deferred |
-| CallawaySantAnna | `staggered.py` | Weights-only | Weights-only SurveyDesign (strata/PSU/FPC rejected); reg supports covariates, IPW/DR no-covariate only; survey-weighted WIF in aggregation; full design SEs, covariates+IPW/DR, and bootstrap+survey deferred |
+| CallawaySantAnna | `staggered.py` | Full | Full SurveyDesign (strata/PSU/FPC/replicate weights); reg supports covariates, IPW/DR no-covariate only; survey-weighted WIF in aggregation; replicate IF variance for analytical SEs |
 
 **Infrastructure**: Weighted `solve_logit()` added to `linalg.py` — survey weights
 enter the IRLS working weights as `w_survey * mu * (1 - mu)`. This also unblocked
@@ -100,10 +100,11 @@ JKn requires explicit `replicate_strata` (per-replicate stratum assignment).
 - Dispatch in `LinearRegression.fit()` and `staggered_aggregation.py`
 - Replicate weights mutually exclusive with strata/PSU/FPC
 - Survey df = rank(replicate_weights) - 1, matching R's `survey::degf()`
-- **Limitations**: SunAbraham rejects replicate-weight designs (weighted
-  within-transformation must be recomputed per replicate — not yet implemented).
-  ContinuousDiD and EfficientDiD reject replicate weights + `n_bootstrap > 0`
-  (replicate variance is analytical, not bootstrap-compatible).
+- **Limitations**: Supported in CallawaySantAnna, ContinuousDiD, EfficientDiD,
+  TripleDifference (analytical only, no bootstrap). Rejected with
+  `NotImplementedError` in DifferenceInDifferences, TwoWayFixedEffects,
+  MultiPeriodDiD, StackedDiD, SunAbraham, ImputationDiD, TwoStageDiD,
+  SyntheticDiD, TROP.
 
 ### DEFF Diagnostics ✅ (2026-03-26)
 Per-coefficient design effects comparing survey vcov to SRS (HC1) vcov.
diff --git a/tests/test_survey_phase6.py b/tests/test_survey_phase6.py
@@ -1357,6 +1357,57 @@ def test_callaway_santanna_replicate_bootstrap_rejected(self):
                 survey_design=sd,
             )
 
+    def test_twfe_replicate_rejected(self):
+        """TwoWayFixedEffects should reject replicate-weight designs."""
+        from diff_diff.twfe import TwoWayFixedEffects
+
+        data, rep_cols = TestEstimatorReplicateWeights._make_staggered_replicate_data()
+        sd = SurveyDesign(
+            weights="weight", replicate_weights=rep_cols,
+            replicate_method="JK1",
+        )
+        with pytest.raises(NotImplementedError, match="TwoWayFixedEffects"):
+            TwoWayFixedEffects().fit(
+                data, outcome="outcome", treatment="first_treat",
+                unit="unit", time="time", survey_design=sd,
+            )
+
+    def test_stacked_did_replicate_rejected(self):
+        """StackedDiD should reject replicate-weight designs."""
+        from diff_diff import StackedDiD
+
+        data, rep_cols = TestEstimatorReplicateWeights._make_staggered_replicate_data()
+        sd = SurveyDesign(
+            weights="weight", replicate_weights=rep_cols,
+            replicate_method="JK1",
+        )
+        with pytest.raises(NotImplementedError, match="StackedDiD"):
+            StackedDiD().fit(
+                data, outcome="outcome", unit="unit", time="time",
+                first_treat="first_treat", survey_design=sd,
+            )
+
+    def test_invalid_replicate_scale_rejected(self):
+        """Negative or zero replicate_scale should be rejected."""
+        with pytest.raises(ValueError, match="positive finite"):
+            SurveyDesign(
+                weights="w", replicate_weights=["r1", "r2"],
+                replicate_method="JK1", replicate_scale=-1.0,
+            )
+        with pytest.raises(ValueError, match="positive finite"):
+            SurveyDesign(
+                weights="w", replicate_weights=["r1", "r2"],
+                replicate_method="JK1", replicate_scale=0.0,
+            )
+
+    def test_invalid_replicate_rscales_rejected(self):
+        """Negative replicate_rscales should be rejected."""
+        with pytest.raises(ValueError, match="non-negative"):
+            SurveyDesign(
+                weights="w", replicate_weights=["r1", "r2"],
+                replicate_method="JK1", replicate_rscales=[-1.0, 1.0],
+            )
+
 
 # =============================================================================
 # Effective-sample and d.f. consistency tests