|
15 | 15 |
|
16 | 16 | logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") |
17 | 17 |
|
# Implement similar logic to the R matchit package
# https://github.com/kosukeimai/MatchIt/blob/7b7b48af995ad1d2ab83a72bad9a689bbb9e8151/R/aux_functions.R#L550
def pooled_scaled_cov(treatment, control):
    """Pooled within-group covariance of two samples, rescaled as in MatchIt.

    Each sample is centred on its own mean so that only within-group
    variation contributes, the centred rows are pooled, and the covariance
    is rescaled from n-1 to n-2 degrees of freedom because two group means
    were estimated.

    Unlike an in-place `-=` implementation, this does NOT mutate its
    arguments, and it works for integer-dtype inputs as well (the
    subtraction promotes to float instead of raising on in-place casting).

    Args:
        treatment: 2-D array-like of shape (n_t, k), one row per treatment unit.
        control: 2-D array-like of shape (n_c, k), one row per control unit.

    Returns:
        (k, k) covariance matrix (a 0-d array when k == 1, per np.cov).
    """
    # Non-mutating centring: build new arrays rather than writing back
    # into the caller's data.
    treatment = treatment - np.mean(treatment, axis=0)
    control = control - np.mean(control, axis=0)
    combined_scaled = np.concatenate([treatment, control])
    size = len(combined_scaled)
    # np.cov already divides by (size - 1); correct the divisor to (size - 2).
    return np.cov(combined_scaled, rowvar=False) * (size-1) / (size-2)

18 | 27 | def find_match_iteration( |
19 | 28 | k_parquet_filename: str, |
20 | 29 | s_parquet_filename: str, |
@@ -52,10 +61,36 @@ def find_match_iteration( |
52 | 61 | # As well as all the LUC columns for later use |
53 | 62 | luc_columns = [x for x in s_set.columns if x.startswith('luc')] |
54 | 63 |
|
55 | | - s_subset_for_cov = s_subset[['elevation', 'slope', 'access', \ |
56 | | - 'cpc0_u', 'cpc0_d', 'cpc5_u', 'cpc5_d', 'cpc10_u', 'cpc10_d']] |
57 | | - covarience = np.cov(s_subset_for_cov, rowvar=False) |
58 | | - invconv = np.linalg.inv(covarience) |
| 64 | + distance_columns = [ |
| 65 | + "elevation", "slope", "access", |
| 66 | + "cpc0_u", "cpc0_d", |
| 67 | + "cpc5_u", "cpc5_d", |
| 68 | + "cpc10_u", "cpc10_d" |
| 69 | + ] |
| 70 | + |
| 71 | + s_subset_for_cov = s_subset[distance_columns] |
| 72 | + k_subset_for_cov = k_subset[distance_columns] |
| 73 | + |
| 74 | + logging.info("Scaling the columns...") |
| 75 | + |
| 76 | + # Scale the columns as per matchit |
| 77 | + # https://github.com/kosukeimai/MatchIt/blob/7b7b48af995ad1d2ab83a72bad9a689bbb9e8151/R/dist_functions.R#L240 |
| 78 | + combined = np.concatenate([s_subset_for_cov, k_subset_for_cov]) |
| 79 | + means = np.mean(combined, axis=0) |
| 80 | + stds = np.std(combined - means, axis=0) |
| 81 | + s_subset_for_cov -= means |
| 82 | + s_subset_for_cov /= stds |
| 83 | + k_subset_for_cov -= means |
| 84 | + k_subset_for_cov /= stds |
| 85 | + |
| 86 | + s_subset[distance_columns] = s_subset_for_cov |
| 87 | + k_subset[distance_columns] = k_subset_for_cov |
| 88 | + |
| 89 | + logging.info("Calculating pooled covariance...") |
| 90 | + covariance = pooled_scaled_cov(k_subset_for_cov, s_subset_for_cov) |
| 91 | + |
| 92 | + logging.info("Calculating inverse covariance...") |
| 93 | + invconv = np.linalg.inv(covariance).astype(np.float32) |
59 | 94 |
|
60 | 95 | for _, k_row in k_subset.iterrows(): |
61 | 96 | # Methodology 6.5.7: find the matches. |
@@ -85,12 +120,6 @@ def find_match_iteration( |
85 | 120 | # * slope |
86 | 121 | # * accessibility |
87 | 122 | # * coarsened proportional coverage |
88 | | - distance_columns = [ |
89 | | - "elevation", "slope", "access", |
90 | | - "cpc0_u", "cpc0_d", |
91 | | - "cpc5_u", "cpc5_d", |
92 | | - "cpc10_u", "cpc10_d" |
93 | | - ] |
94 | 123 | k_soft = np.array(k_row[distance_columns].to_list()) |
95 | 124 | just_cols = filtered_s[distance_columns].to_numpy() |
96 | 125 |
|
|
0 commit comments