Skip to content

Commit 2070126

Browse files
authored
Merge pull request #365 from MiraGeoscience/GEOPY-2781
GEOPY-2781: Inversion stalls on tiling for large problems during redistribution of clusters
2 parents 7033425 + a9b8088 commit 2070126

3 files changed

Lines changed: 31 additions & 47 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ requires = [
88
"jinja2 ==3.*",
99
"packaging >=24.0",
1010
"tomlkit >=0.13",
11+
"setuptools>=80",
12+
"setuptools-scm[simple]>=9.2.*"
1113
]
1214
build-backend = "poetry_dynamic_versioning.backend"
1315

simpeg_drivers/utils/nested.py

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,13 @@
1313
import warnings
1414
from collections.abc import Iterable
1515
from copy import copy
16-
from itertools import chain
1716
from pathlib import Path
1817

1918
import numpy as np
20-
from dask import compute, delayed
21-
from dask.distributed import get_client
2219
from discretize import TensorMesh, TreeMesh
2320
from geoh5py.shared.utils import uuid_from_values
24-
from scipy.optimize import linear_sum_assignment
2521
from scipy.spatial import cKDTree
26-
from scipy.spatial.distance import cdist
2722
from simpeg import data, data_misfit, maps, meta, objective_function
28-
from simpeg.dask.objective_function import DistributedComboMisfits
2923
from simpeg.data_misfit import L2DataMisfit
3024
from simpeg.electromagnetics.base_1d import BaseEM1DSimulation
3125
from simpeg.electromagnetics.frequency_domain.simulation import BaseFDEMSimulation
@@ -539,21 +533,9 @@ def tile_locations(
539533
from sklearn.cluster import KMeans
540534

541535
kmeans = KMeans(n_clusters=n_tiles, random_state=0, n_init="auto")
542-
cluster_size = int(np.ceil(grid_locs.shape[0] / n_tiles))
543536
kmeans.fit(grid_locs)
544537

545-
if labels is not None:
546-
cluster_id = kmeans.labels_
547-
else:
548-
# Redistribute cluster centers to even out the number of points
549-
centers = kmeans.cluster_centers_
550-
centers = (
551-
centers.reshape(-1, 1, grid_locs.shape[1])
552-
.repeat(cluster_size, 1)
553-
.reshape(-1, grid_locs.shape[1])
554-
)
555-
distance_matrix = cdist(grid_locs, centers)
556-
cluster_id = linear_sum_assignment(distance_matrix)[1] // cluster_size
538+
cluster_id = kmeans.labels_
557539

558540
tiles = []
559541
for tid in set(cluster_id):

tests/locations_test.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -115,34 +115,34 @@ def test_filter(tmp_path: Path):
115115
assert np.all(filtered_data["key"] == [2, 3, 4])
116116

117117

118-
def test_tile_locations(tmp_path: Path):
119-
with Workspace.create(tmp_path / f"{__name__}.geoh5") as ws:
120-
grid_x, grid_y = np.meshgrid(np.arange(100), np.arange(100))
121-
choices = np.c_[grid_x.ravel(), grid_y.ravel(), np.zeros(grid_x.size)]
122-
inds = np.random.randint(0, 10000, 1000)
123-
pts = Points.create(
124-
ws,
125-
name="test-points",
126-
vertices=choices[inds],
127-
)
128-
tiles = tile_locations(pts.vertices[:, :2], n_tiles=8)
129-
130-
values = np.zeros(pts.n_vertices)
131-
pop = []
132-
for ind, tile in enumerate(tiles):
133-
values[tile] = ind
134-
pop.append(len(tile))
135-
136-
pts.add_data(
137-
{
138-
"values": {
139-
"values": values,
140-
}
141-
}
142-
)
143-
assert np.std(pop) / np.mean(pop) < 0.02, (
144-
"Population of tiles are not almost equal."
145-
)
118+
# TODO Find a scalable algo better than linear_sum_assignment to do even split
119+
# The tiling strategy should yield even "densities" (area x n_receivers)
120+
# def test_tile_locations(tmp_path: Path):
121+
# with Workspace.create(tmp_path / f"{__name__}.geoh5") as ws:
122+
# grid_x, grid_y = np.meshgrid(np.arange(100), np.arange(100))
123+
# choices = np.c_[grid_x.ravel(), grid_y.ravel(), np.zeros(grid_x.size)]
124+
# inds = np.random.randint(0, 10000, 1000)
125+
# pts = Points.create(
126+
# ws,
127+
# name="test-points",
128+
# vertices=choices[inds],
129+
130+
131+
def test_tile_locations():
132+
n_points = 1000
133+
rng = np.random.default_rng(0)
134+
locations = rng.standard_normal((n_points, 2))
135+
136+
tiles = tile_locations(locations, n_tiles=8)
137+
138+
# All indices should be covered exactly once across tiles
139+
all_indices = np.concatenate(tiles)
140+
assert np.array_equal(np.sort(all_indices), np.arange(n_points))
141+
142+
# Tiles should be reasonably balanced in population
143+
pop = np.array([len(tile) for tile in tiles])
144+
assert pop.min() > 0
145+
assert np.std(pop) / np.mean(pop) < 0.5
146146

147147

148148
def test_tile_locations_labels(tmp_path: Path):

0 commit comments

Comments (0)