Commit d913fa9

dask.distributed: do not rechunk
1 parent: ccba3dd

3 files changed

Lines changed: 186 additions & 31 deletions

doc/whats-new.rst

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@ v2.1.0 (unreleased)
   of text messages for all differences in numpy, pandas, and xarray objects
 - New function :func:`display_diffs` that displays differences
   in Jupyter notebooks
-- Fixed issue that would cause excessive RAM usage when comparing Dask arrays with
-  2+ dimensions using a distributed scheduler
+- Fixed issues that would cause slowdowns and excessive RAM usage when comparing Dask
+  arrays with 2+ dimensions using a distributed scheduler
 - Added support for P2P rechunk in Dask distributed
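A usage illustration only, not part of the commit: the scenario behind the two changelog entries above is comparing differently chunked 2+ dimensional DataArrays under a distributed scheduler. Array sizes, chunk sizes, and the in-process Client are arbitrary choices for this sketch.

import numpy as np
import xarray
from dask.distributed import Client
from recursive_diff import recursive_diff

client = Client(processes=False)  # in-process distributed scheduler, for illustration

a = np.zeros((500, 400))
b = a.copy()
b[3, 7] = 1.0

lhs = xarray.DataArray(a, dims=["x", "y"]).chunk({"x": 100, "y": 100})
rhs = xarray.DataArray(b, dims=["x", "y"]).chunk({"x": 250, "y": 50})  # deliberately different chunks

# Yields a single difference at x=3, y=7; after this commit the comparison
# avoids the rechunking and excessive RAM usage described above.
for diff in recursive_diff(lhs, rhs):
    print(diff)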

recursive_diff/core.py

Lines changed: 119 additions & 26 deletions
@@ -6,7 +6,9 @@

 from __future__ import annotations

+import itertools
 import math
+import operator
 import re
 import typing
 from collections.abc import Callable, Collection, Generator, Hashable
@@ -422,10 +424,15 @@ def _diff_dataarrays(
     lhs, rhs = xarray.align(lhs, rhs, join="inner")

     is_dask = lhs.chunks is not None or rhs.chunks is not None
-    if is_dask and lhs.chunks is None:
-        lhs = lhs.chunk(dict(zip(rhs.dims, rhs.chunks)))  # type: ignore[arg-type]
-    elif is_dask and rhs.chunks is None:
-        rhs = rhs.chunk(dict(zip(lhs.dims, lhs.chunks)))  # type: ignore[arg-type]
+    if is_dask:
+        import dask.array as da
+
+        # Ensure that both lhs and rhs are Dask arrays and that they
+        # have aligned chunks
+        lhs_data, rhs_data = da.broadcast_arrays(lhs.data, rhs.data)
+        lhs = lhs.copy(deep=False, data=lhs_data)
+        rhs = rhs.copy(deep=False, data=rhs_data)
+        assert lhs.chunks == rhs.chunks

     # Generate a bit-mask of the differences
     # For Dask-backed arrays, this operation is delayed.
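For context, a minimal standalone sketch, not from the commit, of the chunk alignment that da.broadcast_arrays performs in the hunk above: a plain NumPy operand is converted to a Dask array and both operands end up with identical chunks, so the bit-mask computed next is chunk-aligned from the start.

import dask.array as da
import numpy as np

lhs = np.arange(12).reshape(3, 4)             # NumPy-backed operand
rhs = da.ones((3, 4), chunks=(2, 2))          # Dask-backed operand

lhs_b, rhs_b = da.broadcast_arrays(lhs, rhs)  # both are now Dask arrays
assert lhs_b.chunks == rhs_b.chunks           # chunks are aligned
mask = lhs_b != rhs_b                         # delayed bit-mask, no rechunking step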
@@ -483,33 +490,37 @@ def _diff_dataarrays(
     # non-brief dim, with potentially repeated indices
     # All of the arrays will have the same size, which is the number of differences.
     # For Dask-backed arrays, this whole operation is delayed.
+    diffs_idx: tuple[np.ndarray | Array, ...]

     if brief_axes:
         diffs_count = mask.astype(int).sum(axis=tuple(brief_axes))
         mask = diffs_count > 0
-        if mask.ndim:
-            diffs_count = diffs_count[mask]
+    if is_dask:
+        assert isinstance(mask, Array)
+        # a[mask] is very slow in Dask for 2+ dimensional arrays because it needs to
+        # preserve the order of the returned elements, so it involves rechunking. Under
+        # the assumption that the number of differences is << the number of total
+        # elements, filter each chunk independently and then full-sort the results by
+        # index.
+        diffs_idx, sort_indices = _fast_dask_nonzero(mask)
+        if brief_axes:
+            if mask.ndim:
+                diffs_count = _fast_dask_mask(diffs_count, mask, sort_indices)
+        else:
+            diffs_lhs = _fast_dask_mask(lhs.data, mask, sort_indices)
+            diffs_rhs = _fast_dask_mask(rhs.data, mask, sort_indices)
     else:
-        diffs_lhs = lhs.data[mask]
-        diffs_rhs = rhs.data[mask]
-
-    diffs_idx = []
-    for axis, size in enumerate(mask.shape):
-        idx_shape = (1,) * axis + (-1,) + (1,) * (mask.ndim - axis - 1)
-        if is_dask:
-            import dask.array as da
-
-            assert isinstance(mask, da.Array)
-            idx = da.arange(size, chunks=mask.chunks[axis])
-            idx = idx.reshape(idx_shape)
-            idx = da.broadcast_to(idx, mask.shape, chunks=mask.chunks)
+        assert isinstance(mask, (np.ndarray, np.generic))
+        if brief_axes:
+            if mask.ndim:
+                diffs_idx = np.nonzero(mask)
+                diffs_count = diffs_count[mask]
+            else:
+                diffs_idx = np.array([], dtype=int)
         else:
-            idx = np.arange(size)
-            idx = idx.reshape(idx_shape)
-            idx = np.broadcast_to(idx, mask.shape)
-
-        idx = idx[mask]
-        diffs_idx.append(idx)
+            diffs_idx = np.nonzero(mask)
+            diffs_lhs = lhs.data[mask]
+            diffs_rhs = rhs.data[mask]

     msg_prefix = "".join(f"[{elem}]" for elem in path)

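The comment above ("filter each chunk independently and then full-sort the results by index") leans on np.lexsort treating its last key as the primary one. A NumPy-only illustration of that reordering step, not part of the commit:

import numpy as np

# (row, col) indices of differences as they arrive, grouped chunk by chunk,
# i.e. not yet in C order
nz = np.array(
    [[0, 2, 1, 0],   # row index of each difference
     [3, 0, 1, 0]],  # column index of each difference
)
# Reversing the key order makes the row indices the primary sort key
order = np.lexsort(nz[::-1, :])
print(nz[:, order])  # [[0 0 1 2]
                     #  [0 3 1 0]]  i.e. (0,0), (0,3), (1,1), (2,0): C order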
@@ -542,7 +553,7 @@ def _diff_dataarrays(

         rel_delta = da.map_blocks(_rel_delta, diffs_lhs, diffs_rhs, dtype=float)
     else:
-        rel_delta = _rel_delta(diffs_lhs, diffs_rhs)
+        rel_delta = _rel_delta(diffs_lhs, diffs_rhs)  # type: ignore[arg-type]
     args = (diffs_lhs, diffs_rhs, abs_delta, rel_delta, *diffs_coords)
     build_df = partial(
         _build_dataframe,
@@ -581,6 +592,88 @@ def _diff_dataarrays(
     yield from pp_func(*args)


+def _fast_dask_nonzero(mask: Array) -> tuple[tuple[Array, ...], Array]:
+    """Variant of da.nonzero(mask), which is much faster when the number of
+    nonzero elements is much smaller than the total.
+
+    Returns
+
+    - tuple of single-chunk arrays of shape (mask.ndim, number of differences),
+      ordered as it would be returned by da.nonzero(mask)
+    - single-chunk array of shape (number of differences, ) which is to be used
+      by _fast_dask_mask to reorder the output.
+    """
+    import dask
+    import dask.array as da
+
+    chunk_offsets: list[list[int]] = [
+        [0, *np.cumsum(c[:-1]).tolist()] for c in mask.chunks
+    ]
+    f = dask.delayed(_fast_dask_nonzero_chunk, pure=True)
+    delayeds = [
+        f(chunk, chunk_offset)
+        for chunk, chunk_offset in zip(
+            mask.to_delayed().reshape(-1),
+            itertools.product(*chunk_offsets),
+        )
+    ]
+    rechunked = dask.delayed(np.concatenate, pure=True)(delayeds, axis=1)
+    nz = da.from_delayed(
+        rechunked,
+        shape=(mask.ndim, math.nan),
+        dtype=int,
+        meta=np.array([[]], dtype=int),
+    )
+    sort_indices = nz[::-1, :].map_blocks(
+        np.lexsort,
+        dtype=int,
+        meta=np.array([], dtype=int),
+        drop_axis=0,
+    )
+
+    nz_sorted = nz.T.map_blocks(
+        operator.getitem,
+        sort_indices,
+        dtype=int,
+        meta=np.array([[]], dtype=int),
+    ).T
+    return tuple(nz_sorted), sort_indices
+
+
+def _fast_dask_nonzero_chunk(
+    mask_chunk: np.ndarray, offset: tuple[int, ...]
+) -> np.ndarray:
+    nz_indices = np.stack(np.nonzero(mask_chunk))
+    return nz_indices + np.array(offset)[:, None]
+
+
+def _fast_dask_mask(a: Array, mask: Array, sort_indices: Array) -> Array:
+    """Variant of a[mask], which does not preserve the order of the returned elements,
+    which is much faster on Dask for 2+ dimensional arrays because it does not need
+    rechunking. Applying this function to multiple identically shaped **and chunked**
+    arrays with the same mask will return objects in the same order.
+    """
+    import dask
+    import dask.array as da
+
+    f = dask.delayed(operator.getitem, pure=True)
+    delayeds = [
+        f(a_i, mask_i)
+        for a_i, mask_i in zip(
+            a.to_delayed().reshape(-1),
+            mask.to_delayed().reshape(-1),
+        )
+    ]
+    rechunked = dask.delayed(np.concatenate, pure=True)(delayeds)
+    sorted = dask.delayed(operator.getitem, pure=True)(rechunked, sort_indices)
+    return da.from_delayed(
+        sorted,
+        shape=(math.nan,),
+        dtype=a.dtype,
+        meta=np.array([], dtype=a.dtype),
+    )
+
+
 def _build_dataframe(
     column_names: list[str], index_names: list[str], *args: np.ndarray
 ) -> pd.DataFrame:

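A small end-to-end sketch, not part of the commit, of how the two helpers added above compose. It assumes they can be imported from recursive_diff.core exactly as defined in this diff; data and chunks are arbitrary example values.

import dask.array as da
import numpy as np

from recursive_diff.core import _fast_dask_mask, _fast_dask_nonzero

data = da.from_array(np.arange(12).reshape(3, 4), chunks=(2, 2))
mask = data % 5 == 0  # True at (0, 0), (1, 1) and (2, 2)

idx, sort_indices = _fast_dask_nonzero(mask)        # per-dimension indices, C order
values = _fast_dask_mask(data, mask, sort_indices)  # masked values, same order

print([i.compute().tolist() for i in idx])  # [[0, 1, 2], [0, 1, 2]]
print(values.compute().tolist())            # [0, 5, 10]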
recursive_diff/tests/test_recursive_diff.py

Lines changed: 65 additions & 3 deletions
@@ -83,9 +83,10 @@ def __repr__(self):
         return f"Square({self.side})"


-def check(lhs, rhs, *expect, **kwargs):
-    expect = sorted(expect)
-    actual = sorted(recursive_diff(lhs, rhs, **kwargs))
+def check(lhs, rhs, *expect, order=False, **kwargs):
+    f = list if order else sorted
+    expect = f(expect)
+    actual = f(recursive_diff(lhs, rhs, **kwargs))
     assert actual == expect

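For clarity, the new order flag only switches the comparison between expected and actual messages from sorted to positional; a tiny illustration, not part of the commit, of what "f = list if order else sorted" amounts to:

expect = ("b", "a")
actual = ["a", "b"]
assert sorted(expect) == sorted(actual)  # order=False (default): accepted
assert list(expect) != actual            # order=True: same messages, wrong order, rejected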
@@ -1165,6 +1166,67 @@ def test_dask_dataarray(chunk_lhs, chunk_rhs):
     check(lhs, rhs, "[data][x=2]: c != d")


+@requires_dask
+@pytest.mark.parametrize(
+    "chunk_lhs,chunk_rhs",
+    [
+        (None, None),
+        (None, -1),
+        (None, 2),
+        ({"x": 3, "y": 1}, {"x": 2, "y": 2}),
+    ],
+)
+def test_dask_dataarray_2d(chunk_lhs, chunk_rhs):
+    lhs = xarray.DataArray([[0, 1, 2], [3, 4, 5]], dims=["x", "y"])
+    rhs = xarray.DataArray([[0, 1, 2], [3, 4, 6]], dims=["x", "y"])
+    if chunk_lhs:
+        lhs = lhs.chunk(chunk_lhs)
+    if chunk_rhs:
+        rhs = rhs.chunk(chunk_rhs)
+
+    check(lhs, rhs, "[data][x=1, y=2]: 5 != 6 (abs: 1.0e+00, rel: 2.0e-01)")
+
+
+def test_dask_dataarray_ordered(chunk):
+    """Test that differences are reported in C order and that the order is not
+    influenced by Dask chunks.
+    """
+    lhs = xarray.DataArray(np.arange(2 * 3 * 4).reshape(2, 3, 4), dims=["x", "y", "z"])
+    rhs = lhs + 1
+    if chunk:
+        lhs = lhs.chunk({"x": 2, "y": 2, "z": 3})
+        rhs = rhs.chunk({"x": 2, "y": 2, "z": 3})
+    check(
+        lhs,
+        rhs,
+        "[data][x=0, y=0, z=0]: 0 != 1 (abs: 1.0e+00, rel: nan)",
+        "[data][x=0, y=0, z=1]: 1 != 2 (abs: 1.0e+00, rel: 1.0e+00)",
+        "[data][x=0, y=0, z=2]: 2 != 3 (abs: 1.0e+00, rel: 5.0e-01)",
+        "[data][x=0, y=0, z=3]: 3 != 4 (abs: 1.0e+00, rel: 3.3e-01)",
+        "[data][x=0, y=1, z=0]: 4 != 5 (abs: 1.0e+00, rel: 2.5e-01)",
+        "[data][x=0, y=1, z=1]: 5 != 6 (abs: 1.0e+00, rel: 2.0e-01)",
+        "[data][x=0, y=1, z=2]: 6 != 7 (abs: 1.0e+00, rel: 1.7e-01)",
+        "[data][x=0, y=1, z=3]: 7 != 8 (abs: 1.0e+00, rel: 1.4e-01)",
+        "[data][x=0, y=2, z=0]: 8 != 9 (abs: 1.0e+00, rel: 1.2e-01)",
+        "[data][x=0, y=2, z=1]: 9 != 10 (abs: 1.0e+00, rel: 1.1e-01)",
+        "[data][x=0, y=2, z=2]: 10 != 11 (abs: 1.0e+00, rel: 1.0e-01)",
+        "[data][x=0, y=2, z=3]: 11 != 12 (abs: 1.0e+00, rel: 9.1e-02)",
+        "[data][x=1, y=0, z=0]: 12 != 13 (abs: 1.0e+00, rel: 8.3e-02)",
+        "[data][x=1, y=0, z=1]: 13 != 14 (abs: 1.0e+00, rel: 7.7e-02)",
+        "[data][x=1, y=0, z=2]: 14 != 15 (abs: 1.0e+00, rel: 7.1e-02)",
+        "[data][x=1, y=0, z=3]: 15 != 16 (abs: 1.0e+00, rel: 6.7e-02)",
+        "[data][x=1, y=1, z=0]: 16 != 17 (abs: 1.0e+00, rel: 6.2e-02)",
+        "[data][x=1, y=1, z=1]: 17 != 18 (abs: 1.0e+00, rel: 5.9e-02)",
+        "[data][x=1, y=1, z=2]: 18 != 19 (abs: 1.0e+00, rel: 5.6e-02)",
+        "[data][x=1, y=1, z=3]: 19 != 20 (abs: 1.0e+00, rel: 5.3e-02)",
+        "[data][x=1, y=2, z=0]: 20 != 21 (abs: 1.0e+00, rel: 5.0e-02)",
+        "[data][x=1, y=2, z=1]: 21 != 22 (abs: 1.0e+00, rel: 4.8e-02)",
+        "[data][x=1, y=2, z=2]: 22 != 23 (abs: 1.0e+00, rel: 4.5e-02)",
+        "[data][x=1, y=2, z=3]: 23 != 24 (abs: 1.0e+00, rel: 4.3e-02)",
+        order=True,
+    )
+
+
 @requires_dask
 def test_dask_dataarray_discards_data():
     """Test that chunked Dask datasets are loaded into memory and then
