Skip to content

Commit 4a7abce

Browse files
author
swinersha
committed
fix: speeds up find pairs by sampling M to the size of K
1 parent 088048e commit 4a7abce

2 files changed

Lines changed: 80 additions & 277 deletions

File tree

methods/matching/cluster_find_pairs_interactive.py

Lines changed: 48 additions & 255 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from sklearn.cluster import KMeans
88
import matplotlib.pyplot as plt
99
import geopandas as gpd
10+
from pyproj import Proj, transform
1011
import os
1112
import time
1213
import sys
@@ -108,14 +109,53 @@ def calculate_smd(group1, group2):
108109

109110
# Define the start year
110111
t0 = 2012 # READ THIS IN
111-
match_years = [t0-10, t0-5, t0]
112112

113113
# Read in the data
114114
boundary = gpd.read_file('/maps/aew85/projects/1201.geojson')
115115

116-
k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet')
116+
k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k.parquet')
117+
# k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet')
117118
m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/1201/matches.parquet')
118119

120+
121+
t0 = 2018
122+
boundary = gpd.read_file('/maps/aew85/projects/ona.geojson')
123+
k_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/fastfp_test_ona/k.parquet')
124+
m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/fastfp_test_ona/matches.parquet')
125+
126+
if(m_pixels.shape[0] > (k_pixels.shape[0])):
127+
m_sub_size = int(k_pixels.shape[0]) # First down sample M as it is ~230 million points
128+
m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False)
129+
m_pixels = m_pixels.iloc[m_random_indices]
130+
131+
# # Calculate the central coordinates (centroid)
132+
# central_lat = m_pixels['lat'].mean()
133+
# central_lon = m_pixels['lng'].mean()
134+
# aeqd_proj = f"+proj=aeqd +lat_0={central_lat} +lon_0={central_lon} +datum=WGS84"
135+
136+
# # Convert the DataFrame to a GeoDataFrame
137+
# m_gdf = gpd.GeoDataFrame(m_pixels, geometry=gpd.points_from_xy(m_pixels.lng, m_pixels.lat))
138+
# # Set the original CRS to WGS84 (EPSG:4326)
139+
# m_gdf.set_crs(epsg=4326, inplace=True)
140+
141+
# # Transform the GeoDataFrame to the AEQD projection
142+
# m_gdf_aeqd = m_gdf.to_crs(aeqd_proj)
143+
144+
# # Extract the transformed coordinates
145+
# gdf_aeqd['aeqd_x'] = gdf_aeqd.geometry.x
146+
# gdf_aeqd['aeqd_y'] = gdf_aeqd.geometry.y
147+
148+
# # Define the grid resolution in meters
149+
# grid_resolution_m = 5000 # 5 km
150+
151+
# # Calculate grid cell indices
152+
# gdf_aeqd['grid_x'] = (gdf_aeqd['aeqd_x'] // grid_resolution_m).astype(int)
153+
# gdf_aeqd['grid_y'] = (gdf_aeqd['aeqd_y'] // grid_resolution_m).astype(int)
154+
155+
# # Print the first few rows to verify
156+
# print(gdf_aeqd.head())
157+
158+
119159
# concat m and k
120160
km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
121161
m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
@@ -124,8 +164,6 @@ def calculate_smd(group1, group2):
124164
exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt']
125165
exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')]
126166

127-
match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
128-
129167
# Extract only the continuous columns
130168
continuous_columns = km_pixels.columns.difference(exclude_columns)
131169
km_pixels_selected = km_pixels[continuous_columns]
@@ -178,7 +216,7 @@ def calculate_smd(group1, group2):
178216
plt.ylabel('PCA Component 2')
179217
plt.legend()
180218
plt.show()
181-
plt.savefig('Figures/cluster_centres_faiss_1.png')
219+
plt.savefig('Figures/ona_cluster_centres_faiss_1.png')
182220
plt.close() # Close the plot to free up memory
183221

184222
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -202,11 +240,14 @@ def calculate_smd(group1, group2):
202240
fig.delaxes(axes[j])
203241

204242
plt.tight_layout()
205-
plt.savefig('Figures/cluster_faiss_1_facet.png')
243+
plt.savefig('Figures/Ona_cluster_faiss_1_facet.png')
206244
plt.close()
207245

208246
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
209247

248+
match_years = [t0-10, t0-5, t0]
249+
match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
250+
210251
# Extract K and M pixels
211252
k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt']
212253
m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl']
@@ -216,7 +257,7 @@ def calculate_smd(group1, group2):
216257
m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy()
217258

218259
k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION)
219-
m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION)
260+
m_sub_size = int(m_pixels.shape[0] * 1)
220261

221262
# Define indexs for the samples from K and M
222263
k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False)
@@ -330,252 +371,4 @@ def calculate_smd(group1, group2):
330371
smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
331372
print(smd_df)
332373

333-
smd_results = []
334-
for column in columns_to_compare:
335-
smd, mean1, mean2, pooled_std = calculate_smd(k_pixels[column], m_pixels[column])
336-
smd_results.append((column, smd, mean1, mean2, pooled_std))
337374

338-
# Convert the results to a DataFrame for better readability
339-
smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
340-
print(smd_df)
341-
342-
343-
344-
345-
346-
347-
348-
def find_match_iteration(
349-
k_parquet_filename: str,
350-
m_parquet_filename: str,
351-
start_year: int,
352-
output_folder: str,
353-
idx_and_seed: tuple[int, int]
354-
) -> None:
355-
logging.info("Find match iteration %d of %d", idx_and_seed[0] + 1, REPEAT_MATCH_FINDING)
356-
rng = np.random.default_rng(idx_and_seed[1])
357-
358-
logging.info("Loading K from %s", k_parquet_filename)
359-
k_pixels = pd.read_parquet(k_parquet_filename)
360-
361-
logging.info("Loading M from %s", m_parquet_filename)
362-
m_pixels = pd.read_parquet(m_parquet_filename)
363-
364-
# concat m and k
365-
km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
366-
m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
367-
368-
# Find the continuous columns
369-
exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt']
370-
exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')]
371-
continuous_columns = km_pixels.columns.difference(exclude_columns)
372-
# Categorical columns
373-
match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
374-
375-
logging.info("Starting PCA transformation of k and m union. km_pixels.shape: %a", {km_pixels.shape})
376-
# PCA transform and conversion to 32 bit ints for continuous only
377-
km_pca = to_pca_int32(km_pixels[continuous_columns])
378-
logging.info("Done PCA transformation")
379-
380-
# Extract K and M pixels - this might be unnecessary I just wanted to make sure
381-
# K and M were in the same order here and in the PCA transform
382-
k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt']
383-
m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl']
384-
# Extract K and M PCA transforms
385-
k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy()
386-
m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy()
387-
388-
# Sample from K and M
389-
k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION)
390-
m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION)
391-
# Define indexs for the samples from K and M
392-
k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False)
393-
m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False)
394-
# Take random samples from K and M pixels
395-
k_sub = k_pixels.iloc[k_random_indices]
396-
m_sub = m_pixels.iloc[m_random_indices]
397-
# Take corresponding random samples from the PCA transformed K and M
398-
k_sub_pca = k_pca[k_random_indices,:]
399-
m_sub_pca = m_pca[m_random_indices,:]
400-
401-
logging.info("Samples taken from K and M. k_sub.shape: %a; m_sub.shape: %a", {k_sub.shape, m_sub.shape})
402-
403-
# Identify the unique combinations of luc columns
404-
k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats))
405-
406-
pairs_list = []
407-
matchless_list = []
408-
409-
logging.info("Starting greedy matching... k_sub.shape: %s, m_sub.shape: %s",
410-
k_sub.shape, m_sub.shape)
411-
412-
start_time = time.time()
413-
for i in range(0, k_cat_combinations.shape[0]):
414-
# i = 6 # ith element of the unique combinations of the luc time series in k
415-
# for in range()
416-
k_cat_comb = k_cat_combinations.iloc[i]
417-
k_cat = k_sub[(k_sub[match_cats] == k_cat_comb).all(axis=1)]
418-
k_cat_pca = k_sub_pca[(k_sub[match_cats] == k_cat_comb).all(axis=1)]
419-
420-
# Find the subset in km_pixels that matches this combination
421-
m_cat = m_sub[(m_sub[match_cats] == k_cat_comb).all(axis=1)]
422-
m_cat_pca = m_sub_pca[(m_sub[match_cats] == k_cat_comb).all(axis=1)]
423-
424-
if VERBOSE:
425-
print('ksub_cat:' + str(k_cat.shape[0]))
426-
print('msub_cat:' + str(m_cat.shape[0]))
427-
428-
# If there is no suitable match for the pre-project luc time series
429-
# Then it may be preferable to just take the luc state at t0
430-
# m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)]
431-
# m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)]
432-
# For if there are no matches return nothing
433-
434-
if(m_cat.shape[0] < k_cat.shape[0] * 5):
435-
# print("M insufficient for matching. Set VERBOSE to True for more details.")
436-
# Append the matchless DataFrame to the list
437-
matchless_list.append(k_cat)
438-
continue
439-
440-
# Find the matches
441-
matches_index = loop_match(m_cat_pca, k_cat_pca)
442-
m_cat_matches = m_cat.iloc[matches_index]
443-
444-
# i = 0
445-
# matched = pd.concat([k_cat.iloc[i], m_cat.iloc[matches[i]]], axis=1, ignore_index=True)
446-
# matched.columns = ['trt', 'ctrl']
447-
# matched
448-
#Looks great!
449-
columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope']
450-
# Calculate SMDs for the specified columns
451-
smd_results = []
452-
for column in columns_to_compare:
453-
smd, mean1, mean2, pooled_std = calculate_smd(k_cat[column], m_cat_matches[column])
454-
smd_results.append((column, smd, mean1, mean2, pooled_std))
455-
456-
# Convert the results to a DataFrame for better readability
457-
smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
458-
459-
if VERBOSE:
460-
# Print the results
461-
print("categorical combination:")
462-
print(k_cat_comb)
463-
# Count how many items in 'column1' are not equal to the specified integer value
464-
print("LUC flips in K:")
465-
(k_cat['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum()
466-
print("LUC flips in matches:")
467-
(m_cat_matches['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum()
468-
print("Standardized Mean Differences:")
469-
print(smd_df)
470-
471-
# Join the pairs into one dataframe:
472-
k_cat = k_cat.reset_index(drop = True)
473-
m_cat_matches = m_cat_matches.reset_index(drop = True)
474-
pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1)
475-
476-
# Append the resulting DataFrame to the list
477-
pairs_list.append(pairs_df)
478-
479-
# Combine all the DataFrames in the list into a single DataFrame
480-
pairs = pd.concat(pairs_list, ignore_index=True)
481-
matchless = pd.concat(matchless_list, ignore_index=True)
482-
483-
logging.info("Finished greedy matching... pairs.shape: %s, matchless.shape: %s",
484-
pairs.shape, matchless.shape)
485-
486-
logging.info("Starting storing matches...")
487-
pairs.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet'))
488-
matchless.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet'))
489-
490-
logging.info("Finished find match iteration")
491-
492-
493-
def find_pairs(
494-
k_parquet_filename: str,
495-
m_parquet_filename: str,
496-
start_year: int,
497-
seed: int,
498-
output_folder: str,
499-
processes_count: int
500-
) -> None:
501-
logging.info("Starting find pairs")
502-
os.makedirs(output_folder, exist_ok=True)
503-
504-
rng = np.random.default_rng(seed)
505-
iteration_seeds = zip(range(REPEAT_MATCH_FINDING), rng.integers(0, 1000000, REPEAT_MATCH_FINDING))
506-
507-
with Pool(processes=processes_count) as pool:
508-
pool.map(
509-
partial(
510-
find_match_iteration,
511-
k_parquet_filename,
512-
m_parquet_filename,
513-
start_year,
514-
output_folder
515-
),
516-
iteration_seeds
517-
)
518-
519-
520-
def main():
521-
# If you use the default multiprocess model then you risk deadlocks when logging (which we
522-
# have hit). Spawn is the default on macOS, but not on Linux.
523-
set_start_method("spawn")
524-
525-
parser = argparse.ArgumentParser(description="Takes K and M and finds 100 sets of matches.")
526-
parser.add_argument(
527-
"--k",
528-
type=str,
529-
required=True,
530-
dest="k_filename",
531-
help="Parquet file containing pixels from K as generated by calculate_k.py"
532-
)
533-
parser.add_argument(
534-
"--m",
535-
type=str,
536-
required=True,
537-
dest="m_filename",
538-
help="Parquet file containing pixels from M as generated by build_m_table.py"
539-
)
540-
parser.add_argument(
541-
"--start_year",
542-
type=int,
543-
required=True,
544-
dest="start_year",
545-
help="Year project started."
546-
)
547-
parser.add_argument(
548-
"--seed",
549-
type=int,
550-
required=True,
551-
dest="seed",
552-
help="Random number seed, to ensure experiments are repeatable."
553-
)
554-
parser.add_argument(
555-
"--output",
556-
type=str,
557-
required=True,
558-
dest="output_directory_path",
559-
help="Directory into which output matches will be written. Will be created if it does not exist."
560-
)
561-
parser.add_argument(
562-
"-j",
563-
type=int,
564-
required=False,
565-
default=round(cpu_count() / 2),
566-
dest="processes_count",
567-
help="Number of concurrent threads to use."
568-
)
569-
args = parser.parse_args()
570-
571-
find_pairs(
572-
args.k_filename,
573-
args.m_filename,
574-
args.start_year,
575-
args.seed,
576-
args.output_directory_path,
577-
args.processes_count
578-
)
579-
580-
if __name__ == "__main__":
581-
main()

0 commit comments

Comments
 (0)