Skip to content

Commit c6c2c38

Browse files
igerber and claude committed
Use positive-weight count for HC1/classical df with zero-weight rows
When pweight/aweight fits contain zero-weight rows (from subpopulation), use np.count_nonzero(weights > 0) instead of total n for the HC1/classical df adjustments: zero-weight rows contribute nothing to the sandwich and should not inflate df. Also exclude zero-total-weight clusters from the G/(G-1) small-sample correction for clustered SEs, and use the positive-weight count for the DEFF effective_n. The survey design df (n_PSU - n_strata) is unchanged, which preserves the design structure and matches R's survey::degf() convention.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent a3d2343 commit c6c2c38

3 files changed

Lines changed: 31 additions & 12 deletions

File tree

diff_diff/linalg.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,10 +1046,16 @@ def _compute_robust_vcov_numpy(
10461046
else:
10471047
bread_matrix = X.T @ X
10481048

1049-
# Effective n for df computation (fweights use sum(w))
1049+
# Effective n for df computation
1050+
# fweights: sum(w) (frequency expansion)
1051+
# pweight/aweight with zeros: positive-weight count (zero-weight rows
1052+
# contribute nothing to the sandwich and should not inflate df)
10501053
n_eff = n
1051-
if weights is not None and weight_type == "fweight":
1052-
n_eff = int(round(np.sum(weights)))
1054+
if weights is not None:
1055+
if weight_type == "fweight":
1056+
n_eff = int(round(np.sum(weights)))
1057+
elif np.any(weights == 0):
1058+
n_eff = int(np.count_nonzero(weights > 0))
10531059

10541060
# Compute weighted scores for cluster-robust meat (outer product of sums).
10551061
# pweight/fweight multiply by w; aweight and unweighted use raw residuals.
@@ -1075,6 +1081,11 @@ def _compute_robust_vcov_numpy(
10751081
unique_clusters = np.unique(cluster_ids)
10761082
n_clusters = len(unique_clusters)
10771083

1084+
# Exclude clusters with zero total weight (subpopulation-zeroed)
1085+
if weights is not None and weight_type != "fweight" and np.any(weights == 0):
1086+
cluster_weights = pd.Series(weights).groupby(cluster_ids).sum()
1087+
n_clusters = int((cluster_weights > 0).sum())
1088+
10781089
if n_clusters < 2:
10791090
raise ValueError(f"Need at least 2 clusters for cluster-robust SEs, got {n_clusters}")
10801091

@@ -1741,10 +1752,14 @@ def fit(
17411752
nan_mask = np.isnan(coefficients)
17421753
k_effective = k - np.sum(nan_mask) # Number of identified coefficients
17431754

1744-
# For fweights, df uses sum(w) - k (effective sample size)
1755+
# Effective n for df: fweights use sum(w), pweight/aweight with
1756+
# zeros use positive-weight count (zero-weight rows don't contribute)
17451757
n_eff_df = n
1746-
if self.weights is not None and self.weight_type == "fweight":
1747-
n_eff_df = int(round(np.sum(self.weights)))
1758+
if self.weights is not None:
1759+
if self.weight_type == "fweight":
1760+
n_eff_df = int(round(np.sum(self.weights)))
1761+
elif np.any(self.weights == 0):
1762+
n_eff_df = int(np.count_nonzero(self.weights > 0))
17481763

17491764
if k_effective == 0:
17501765
# All coefficients dropped - no valid inference

diff_diff/survey.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,9 @@ def compute_deff_diagnostics(
849849
from diff_diff.linalg import compute_robust_vcov
850850

851851
n = X.shape[0]
852+
# Use positive-weight count for effective n (zero-weight rows from
853+
# subpopulation don't contribute to the effective sample)
854+
n_eff = int(np.count_nonzero(weights > 0)) if np.any(weights == 0) else n
852855

853856
# SRS baseline: HC1 weighted sandwich ignoring design structure
854857
srs_vcov = compute_robust_vcov(
@@ -861,7 +864,7 @@ def compute_deff_diagnostics(
861864
# DEFF = survey_var / srs_var
862865
with np.errstate(divide="ignore", invalid="ignore"):
863866
deff = np.where(srs_var > 0, survey_var / srs_var, np.nan)
864-
eff_n = np.where(deff > 0, n / deff, np.nan)
867+
eff_n = np.where(deff > 0, n_eff / deff, np.nan)
865868

866869
survey_se = np.sqrt(np.maximum(survey_var, 0.0))
867870
srs_se = np.sqrt(np.maximum(srs_var, 0.0))

docs/methodology/REGISTRY.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2093,11 +2093,12 @@ Domain estimation preserving full design structure.
20932093
- **Note:** Weight validation relaxed from "strictly positive" to
20942094
"non-negative" to support zero-weight observations. Negative weights
20952095
still rejected. All-zero weight vectors rejected at solver level.
2096-
- **Note:** Survey df and variance adjustment factors use total n
2097-
(including zero-weight rows), matching R's `survey::degf()` convention
2098-
after `subset()`. This preserves the design structure for correct
2099-
variance estimation. Zero-weight rows contribute zero-valued scores
2100-
to the sandwich meat but are counted in df = n_PSU - n_strata.
2096+
- **Note:** Survey design df (`n_PSU - n_strata`) is computed from the full
2097+
design structure (including zero-weight rows), matching R's `survey::degf()`
2098+
convention after `subset()`. For pweight/aweight fits, the generic HC1/classical
2099+
inference paths use the positive-weight count for df adjustments (fweights still
2100+
use `sum(w)`), so zero-weight padding does not change inference outside the
2101+
survey vcov path. DEFF effective-n also uses the positive-weight count.
21012102
- **Note:** For replicate-weight designs, `subpopulation()` zeros out both
21022103
full-sample and replicate weight columns for excluded observations,
21032104
preserving all replicate metadata.

0 commit comments

Comments
 (0)