Skip to content

Commit d368795

Browse files
committed
Implement pooled_cov and scaling as per matchit R package
1 parent 53d8ed3 commit d368795

1 file changed

Lines changed: 39 additions & 10 deletions

File tree

methods/matching/find_pairs.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,15 @@
1515

1616
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
1717

18+
# Implement similar logic to the R matchit package
# https://github.com/kosukeimai/MatchIt/blob/7b7b48af995ad1d2ab83a72bad9a689bbb9e8151/R/aux_functions.R#L550
def pooled_scaled_cov(treatment, control):
    """Pooled within-group covariance of two samples, as in R's MatchIt.

    Each group is centred on its own mean before the groups are stacked,
    so the group-mean difference does not inflate the estimate. np.cov
    divides by (size - 1); rescaling by (size - 1) / (size - 2) yields the
    pooled estimator with n1 + n2 - 2 degrees of freedom, since two group
    means are estimated.

    Args:
        treatment: (n1, k) array-like of treatment-group covariates.
        control: (n2, k) array-like of control-group covariates.

    Returns:
        (k, k) pooled covariance matrix as a numpy array.

    Raises:
        ValueError: if the two groups together have fewer than three rows,
            where the (size - 2) denominator would be zero or negative.
    """
    # Bug fix: the original used in-place `-=`, silently mutating the
    # caller's arrays (which the caller also writes back into its
    # dataframes). Work on centred copies instead.
    treatment = np.asarray(treatment, dtype=np.float64)
    control = np.asarray(control, dtype=np.float64)
    treatment = treatment - np.mean(treatment, axis=0)
    control = control - np.mean(control, axis=0)

    combined_scaled = np.concatenate([treatment, control])
    size = len(combined_scaled)
    if size <= 2:
        # Original formula divides by (size - 2), yielding inf/garbage here.
        raise ValueError("pooled_scaled_cov needs more than two rows in total")
    return np.cov(combined_scaled, rowvar=False) * (size - 1) / (size - 2)
26+
1827
def find_match_iteration(
1928
k_parquet_filename: str,
2029
s_parquet_filename: str,
@@ -52,10 +61,36 @@ def find_match_iteration(
5261
# As well as all the LUC columns for later use
5362
luc_columns = [x for x in s_set.columns if x.startswith('luc')]
5463

55-
s_subset_for_cov = s_subset[['elevation', 'slope', 'access', \
56-
'cpc0_u', 'cpc0_d', 'cpc5_u', 'cpc5_d', 'cpc10_u', 'cpc10_d']]
57-
covarience = np.cov(s_subset_for_cov, rowvar=False)
58-
invconv = np.linalg.inv(covarience)
64+
distance_columns = [
65+
"elevation", "slope", "access",
66+
"cpc0_u", "cpc0_d",
67+
"cpc5_u", "cpc5_d",
68+
"cpc10_u", "cpc10_d"
69+
]
70+
71+
s_subset_for_cov = s_subset[distance_columns]
72+
k_subset_for_cov = k_subset[distance_columns]
73+
74+
logging.info("Scaling the columns...")
75+
76+
# Scale the columns as per matchit
77+
# https://github.com/kosukeimai/MatchIt/blob/7b7b48af995ad1d2ab83a72bad9a689bbb9e8151/R/dist_functions.R#L240
78+
combined = np.concatenate([s_subset_for_cov, k_subset_for_cov])
79+
means = np.mean(combined, axis=0)
80+
stds = np.std(combined - means, axis=0)
81+
s_subset_for_cov -= means
82+
s_subset_for_cov /= stds
83+
k_subset_for_cov -= means
84+
k_subset_for_cov /= stds
85+
86+
s_subset[distance_columns] = s_subset_for_cov
87+
k_subset[distance_columns] = k_subset_for_cov
88+
89+
logging.info("Calculating pooled covariance...")
90+
covariance = pooled_scaled_cov(k_subset_for_cov, s_subset_for_cov)
91+
92+
logging.info("Calculating inverse covariance...")
93+
invconv = np.linalg.inv(covariance).astype(np.float32)
5994

6095
for _, k_row in k_subset.iterrows():
6196
# Methodology 6.5.7: find the matches.
@@ -85,12 +120,6 @@ def find_match_iteration(
85120
# * slope
86121
# * accessibility
87122
# * coarsened proportional coverage
88-
distance_columns = [
89-
"elevation", "slope", "access",
90-
"cpc0_u", "cpc0_d",
91-
"cpc5_u", "cpc5_d",
92-
"cpc10_u", "cpc10_d"
93-
]
94123
k_soft = np.array(k_row[distance_columns].to_list())
95124
just_cols = filtered_s[distance_columns].to_numpy()
96125

0 commit comments

Comments (0)