Commit 5890e1d

Improve matmul heuristic - fall back to legacy matmul (nv-legate#1022)
* re-add legacy 3D matrix multiply
* remove tall-skinny matmul workaround
* align C++ dot matmul code with Python
* remove deprecated test; also ensure batched code is executed during 1-proc tests
* add comments, fix memory approximation
1 parent 1b0a6ac commit 5890e1d

7 files changed: 201 additions & 271 deletions
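
The core of the change is the new fallback predicate in cupynumeric/_thunk/deferred.py below. As a standalone restatement (a sketch, not the committed code: cache_bytes and testing are illustrative stand-ins for the settings.matmul_cache_size() and settings.test() lookups), the decision reads:

def use_legacy_matmul_sketch(
    num_procs: int, m: int, n: int, k: int, itemsize: int,
    cache_bytes: int = 128 * 1024 * 1024, testing: bool = False,
) -> bool:
    # a single processor always takes the legacy 3D path outside tests
    if not testing and num_procs == 1:
        return True
    # if one (m x k) and one (k x n) operand fit in the aggregate cache
    # budget, k-batching would never trigger, so stay on the legacy path
    return (m + n) * k * itemsize < cache_bytes * num_procs

For example, 1024 x 1024 float64 operands on 4 processors give (1024 + 1024) * 1024 * 8 = 16 MiB, well under the 512 MiB aggregate budget, so the legacy path runs; at 16384 x 16384 the same product is 4 GiB and the batched path is taken.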


cupynumeric/_thunk/deferred.py

Lines changed: 126 additions & 131 deletions
@@ -1857,101 +1857,141 @@ def contract(
             assert n == rhs2.shape[1]
             assert k == rhs2.shape[0]
 
-            def rounding_divide(
-                lhs: tuple[int, ...], rhs: tuple[int, ...]
-            ) -> tuple[int, ...]:
-                return tuple(
-                    (lh + rh - 1) // rh for (lh, rh) in zip(lhs, rhs)
-                )
+            # decide whether to run full 3D matmul vs k-batched
+            # choose batched version only if memory exceeds threshold
+            def use_legacy_matmul(
+                num_procs: int, m: int, n: int, k: int, itemsize: int
+            ) -> bool:
+                # runtime.num_procs == 1 --> legacy matmul
+                if not settings.test() and num_procs == 1:
+                    return True
+
+                # approximate whether batching would actually be triggered here
+                return (
+                    m + n
+                ) * k * itemsize < settings.matmul_cache_size() * num_procs
+
+            use_3d_matmul = use_legacy_matmul(
+                runtime.num_procs, m, n, k, rhs1_thunk.dtype.itemsize
+            )
+
+            if use_3d_matmul:
+                lhs = lhs.promote(1, k)
+                rhs1 = rhs1.promote(2, n)
+                rhs2 = rhs2.promote(0, m)
 
-            # TODO: better heuristics
-            def choose_2d_color_shape(
-                shape: tuple[int, int],
-            ) -> tuple[int, int]:
-                # 1M elements, we should probably even go larger
-                MIN_MATRIX_SIZE = 1 << 20
-                # If the matrix is too small don't partition it at all
-                if (not settings.test()) and shape[0] * shape[
-                    1
-                ] <= MIN_MATRIX_SIZE:
-                    return (1, 1)
-
-                # start with 1D and re-balance by powers of 2
-                # (don't worry about other primes)
-                color_shape = (runtime.num_procs, 1)
-                while (
-                    shape[0] / color_shape[0]
-                    < 2 * shape[1] / color_shape[1]
-                    and color_shape[0] % 2 == 0
-                ):
-                    color_shape = (color_shape[0] // 2, color_shape[1] * 2)
-
-                return color_shape
-
-            # TODO: better heuristics?
-            def choose_batchsize(
-                tilesize: tuple[int, ...], k: int, itemsize: int
-            ) -> int:
-                # don't batch in case we only have 1 proc
-                if runtime.num_procs == 1:
-                    return k
-
-                # default corresponds to 128MB (to store A and B tile)
-                from ..settings import settings
-
-                assert len(tilesize) >= 2
-                max_elements_per_tile = (
-                    settings.matmul_cache_size() // itemsize
+                task = legate_runtime.create_auto_task(
+                    self.library, CuPyNumericOpCode.MATMUL
                 )
-                total_elements_rhs = (tilesize[0] + tilesize[1]) * k
-                num_batches = rounding_divide(
-                    (total_elements_rhs,), (max_elements_per_tile,)
-                )[0]
-                batch_size = rounding_divide((k,), (num_batches,))[0]
-
-                return batch_size
-
-            # choose color-shape/k_batch_size
-            initial_color_shape = choose_2d_color_shape((m, n))
-            tile_shape = rounding_divide((m, n), initial_color_shape)
-            color_shape = rounding_divide((m, n), tile_shape)
-            k_batch_size = choose_batchsize(
-                tile_shape, k, rhs1_thunk.dtype.itemsize
-            )
-            k_color = rounding_divide((k,), (k_batch_size,))
+                p_lhs = task.add_reduction(lhs, ReductionOpKind.ADD)
+                p_rhs1 = task.add_input(rhs1)
+                p_rhs2 = task.add_input(rhs2)
 
-            # initial partition of lhs defined py tile-shape
-            tiled_lhs = lhs.partition_by_tiling(tile_shape)
-            tiled_rhs1 = rhs1.partition_by_tiling(
-                (tile_shape[0], k_batch_size)
-            )
-            tiled_rhs2 = rhs2.partition_by_tiling(
-                (k_batch_size, tile_shape[1])
-            )
+                # specify unbatched matrix multiplication:
+                unbatched = 1
+                task.add_scalar_arg(unbatched, ty.uint32)
+
+                task.add_constraint(align(p_lhs, p_rhs1))
+                task.add_constraint(align(p_lhs, p_rhs2))
+                task.execute()
+
+            else:
+                # batched matmul
+                #
+
+                def rounding_divide(
+                    lhs: tuple[int, ...], rhs: tuple[int, ...]
+                ) -> tuple[int, ...]:
+                    return tuple(
+                        (lh + rh - 1) // rh for (lh, rh) in zip(lhs, rhs)
+                    )
+
+                # manually create 2d color shape with num_procs colors
+                def choose_2d_color_shape(
+                    shape: tuple[int, int],
+                ) -> tuple[int, int]:
+                    # start with 1D and re-balance by powers of 2
+                    # (don't worry about other primes)
+                    color_shape = (runtime.num_procs, 1)
+                    while (
+                        shape[0] / color_shape[0]
+                        < 2 * shape[1] / color_shape[1]
+                        and color_shape[0] % 2 == 0
+                    ):
+                        color_shape = (
+                            color_shape[0] // 2,
+                            color_shape[1] * 2,
+                        )
 
-            def run_matmul_for_batch(
-                tiled_lhs: LogicalStorePartition,
-                tiled_rhs1: LogicalStorePartition,
-                tiled_rhs2: LogicalStorePartition,
-                i: int,
-            ) -> None:
-                manual_task = legate_runtime.create_manual_task(
-                    self.library, CuPyNumericOpCode.MATMUL, color_shape
+                    return color_shape
+
+                # For a given tilesize choose a batchsize to split the
+                # k-dimension into parts that will keep the partitions
+                # of A and B below the settings.matmul_cache_size()
+                def choose_batchsize(
+                    tilesize: tuple[int, ...], k: int, itemsize: int
+                ) -> int:
+                    # don't batch in case we only have 1 proc
+                    if runtime.num_procs == 1:
+                        return k
+
+                    assert len(tilesize) >= 2
+                    # default corresponds to 128MB (to store A and B tile)
+                    max_elements_per_tile = (
+                        settings.matmul_cache_size() // itemsize
+                    )
+                    total_elements_rhs = (tilesize[0] + tilesize[1]) * k
+                    num_batches = rounding_divide(
+                        (total_elements_rhs,), (max_elements_per_tile,)
+                    )[0]
+                    # even out batches
+                    batch_size = rounding_divide((k,), (num_batches,))[0]
+
+                    return batch_size
+
+                # choose color-shape/k_batch_size
+                initial_color_shape = choose_2d_color_shape((m, n))
+                tile_shape = rounding_divide((m, n), initial_color_shape)
+                color_shape = rounding_divide((m, n), tile_shape)
+                k_batch_size = choose_batchsize(
+                    tile_shape, k, rhs1_thunk.dtype.itemsize
                 )
+                k_color = rounding_divide((k,), (k_batch_size,))
 
-                manual_task.add_output(tiled_lhs)
-                manual_task.add_input(tiled_lhs)
-                manual_task.add_input(
-                    tiled_rhs1, (dimension(0), constant(i))
+                # initial partition of lhs defined py tile-shape
+                tiled_lhs = lhs.partition_by_tiling(tile_shape)
+                tiled_rhs1 = rhs1.partition_by_tiling(
+                    (tile_shape[0], k_batch_size)
                 )
-                manual_task.add_input(
-                    tiled_rhs2, (constant(i), dimension(1))
+                tiled_rhs2 = rhs2.partition_by_tiling(
+                    (k_batch_size, tile_shape[1])
                 )
 
-                manual_task.execute()
-
-            for i in range(0, k_color[0]):
-                run_matmul_for_batch(tiled_lhs, tiled_rhs1, tiled_rhs2, i)
+                def run_matmul_for_batch(
+                    tiled_lhs: LogicalStorePartition,
+                    tiled_rhs1: LogicalStorePartition,
+                    tiled_rhs2: LogicalStorePartition,
+                    i: int,
+                ) -> None:
+                    manual_task = legate_runtime.create_manual_task(
+                        self.library, CuPyNumericOpCode.MATMUL, color_shape
+                    )
+
+                    manual_task.add_output(tiled_lhs)
+                    manual_task.add_input(tiled_lhs)
+                    manual_task.add_input(
+                        tiled_rhs1, (dimension(0), constant(i))
+                    )
+                    manual_task.add_input(
+                        tiled_rhs2, (constant(i), dimension(1))
+                    )
+
+                    manual_task.execute()
+
+                for i in range(0, k_color[0]):
+                    run_matmul_for_batch(
+                        tiled_lhs, tiled_rhs1, tiled_rhs2, i
+                    )
 
         else:
             assert False

@@ -4216,48 +4256,3 @@ def stencil_hint(
         legate_runtime.prefetch_bloated_instances(
             self.base, low_offsets, high_offsets, False
         )
-
-    @auto_convert("rhs1_thunk", "rhs2_thunk")
-    def ts_matmul(self, rhs1_thunk: Any, rhs2_thunk: Any) -> Any:
-        lhs_thunk: NumPyThunk = self
-
-        # Clear output array
-        lhs_thunk.fill(np.array(0, dtype=lhs_thunk.dtype))
-        lhs = lhs_thunk.base  # type: ignore
-
-        rhs1 = rhs1_thunk.base
-        rhs2 = rhs2_thunk.base
-
-        m = lhs.shape[0]
-        n = lhs.shape[1]
-        k = rhs1.shape[1]
-        unbatched = 1
-
-        assert m == rhs1.shape[0]
-        assert n == rhs2.shape[1]
-        assert k == rhs2.shape[0]
-        lhs = lhs.promote(1, k)
-        rhs1 = rhs1.promote(2, n)
-        rhs2 = rhs2.promote(0, m)
-
-        task = legate_runtime.create_auto_task(
-            self.library, CuPyNumericOpCode.MATMUL
-        )
-        p_lhs = task.add_reduction(lhs, ReductionOpKind.ADD)
-        p_rhs1 = task.add_input(rhs1)
-        p_rhs2 = task.add_input(rhs2)
-        #
-        # specify unbatched matrix multiplication:
-        #
-        task.add_scalar_arg(unbatched, ty.uint32)
-
-        task.add_constraint(align(p_lhs, p_rhs1))
-        task.add_constraint(align(p_lhs, p_rhs2))
-        #
-        # additional constraints:
-        #
-        # task.add_constraint(broadcast(p_rhs1, (0,)))
-        # task.add_constraint(broadcast(p_rhs2, (1,)))
-        task.add_constraint(broadcast(p_lhs))
-        #
-        task.execute()
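
To see the arithmetic of the k-batched branch concretely, here is a worked trace with hypothetical sizes (square 16384 x 16384 float64 operands, a (1, 4) color shape, and the default 128 MiB budget; none of these numbers come from the commit):

def rounding_divide(lhs, rhs):
    # elementwise ceiling division, as in the diff above
    return tuple((lh + rh - 1) // rh for lh, rh in zip(lhs, rhs))

k = 16384
itemsize = 8  # float64
tile_shape = (16384, 4096)  # (m, n) split by a (1, 4) color shape
max_elements_per_tile = (128 * 1024 * 1024) // itemsize  # 16,777,216
total_elements_rhs = (tile_shape[0] + tile_shape[1]) * k  # 335,544,320
num_batches = rounding_divide(
    (total_elements_rhs,), (max_elements_per_tile,)
)[0]  # 20
k_batch_size = rounding_divide((k,), (num_batches,))[0]  # 820

The k dimension is then swept in ceil(16384 / 820) = 20 manual MATMUL launches, each reading a 16384 x 820 slab of A and an 820 x 4096 slab of B per color.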

cupynumeric/_thunk/eager.py

Lines changed: 0 additions & 3 deletions
@@ -2147,9 +2147,6 @@ def stencil_hint(
         if self.deferred is not None:
             self.deferred.stencil_hint(low_offsets, high_offsets)
 
-    def ts_matmul(self, rhs1_thunk: Any, rhs2_thunk: Any) -> Any:
-        np.matmul(rhs1_thunk.array, rhs2_thunk.array, out=self.array)
-
     def in1d(
         self,
         ar2: Any,

cupynumeric/_thunk/thunk.py

Lines changed: 0 additions & 3 deletions
@@ -1512,9 +1512,6 @@ def stencil_hint(
         self, low_offsets: tuple[int, ...], high_offsets: tuple[int, ...]
     ) -> None: ...
 
-    @abstractmethod
-    def ts_matmul(self, rhs1_thunk: Any, rhs2_thunk: Any) -> Any: ...
-
     @abstractmethod
     def in1d(
         self,

cupynumeric/linalg/linalg.py

Lines changed: 5 additions & 34 deletions
@@ -39,7 +39,7 @@
 from .._module import dot, empty_like, eye, matmul, ndarray
 from .._module.array_rearrange import flip
 from .._module.creation_matrices import diag
-from .._module.creation_shape import empty, zeros, zeros_like
+from .._module.creation_shape import zeros, zeros_like
 from .._module.ssc_sorting import argsort
 from .._ufunc.math import add, sqrt as _sqrt
 from ._exception import LinAlgError

@@ -1573,25 +1573,10 @@ def tssvd(a: ndarray) -> tuple[ndarray, ...]:
     if a.ndim != 2 or a.size <= 1:
         raise ValueError(f"Invalid input shape for tssvd: {a.shape}")
 
-    m_info = get_machine()
-
     # A.T*A:
     #
-    # unbatched way (there's a bug resulting in 0-matrix, it seems):
-    # {
-    m = a.shape[0]
-    n = a.shape[1]
-
     # TODO: Grammian API:
-    #
-    a2 = empty(shape=(n, n), dtype=a.dtype)
-    ah = a.transpose().conj()
-    a2._thunk.ts_matmul(ah._thunk, a._thunk)
-    # }
-    #
-    # batched way (slower, but passes):
-    #
-    # a2 = matmul(a.transpose().conj(), a)
+    a2 = a.transpose().conj() @ a
 
     # eigen-vals, eigen-vecs of A.T*A:
     #

@@ -1610,14 +1595,7 @@ def tssvd(a: ndarray) -> tuple[ndarray, ...]:
     # generate index permutation, pi
     # via sort-by-key decreasingly:
     #
-    d_indices = zeros(shape=(n,), dtype=np.int64)
-    with m_info[0]:  # !
-        d_indices = argsort(svals)
-    #
-    # reverse:
-    #
-    # d_indices = d_indices[::-1]  # Error: not implemented
-    d_indices = flip(d_indices)
+    d_indices = flip(argsort(svals))
 
     # V.T:
     #

@@ -1628,14 +1606,7 @@ def tssvd(a: ndarray) -> tuple[ndarray, ...]:
 
     # U = A*V*inv(S):
     #
-    # B = matmul(ev, Sinv)
-    # u = matmul(a, B)
-
-    B = empty(shape=(n, n), dtype=a.dtype)
-    B._thunk.ts_matmul(ev._thunk, Sinv._thunk)
-
-    u = empty(shape=(m, n), dtype=a.dtype)
-    u._thunk.ts_matmul(a._thunk, B._thunk)
+    u = a @ (ev @ Sinv)
 
     # re-arrange svals decreasingly:
     #

@@ -1644,7 +1615,7 @@ def tssvd(a: ndarray) -> tuple[ndarray, ...]:
 
     # permute columns of U with pi:
     #
     # u = u[:, d_indices]
-    u = matmul(u, eye(u.shape[1])[d_indices].T)
+    u = u @ eye(u.shape[1])[d_indices].T
 
     # permute rows of V.T with pi:
     #
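
With ts_matmul gone, tssvd now composes ordinary matmuls end to end. For reference, a plain-NumPy sketch of the same Grammian-based tall-skinny SVD recipe (an illustration assuming full column rank, not the library's code):

import numpy as np

def tssvd_sketch(a: np.ndarray):
    # Grammian A^H A is only n x n for a tall-skinny m x n input
    a2 = a.conj().T @ a
    evals, ev = np.linalg.eigh(a2)  # ascending eigenvalues
    svals = np.sqrt(np.clip(evals, 0.0, None))
    order = np.flip(np.argsort(svals))  # decreasing singular values
    svals, ev = svals[order], ev[:, order]
    u = a @ (ev / svals)  # U = A V S^-1 (divide each column of V)
    return u, svals, ev.conj().T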

examples/tssvd.py

Lines changed: 2 additions & 0 deletions
@@ -63,6 +63,8 @@ def run_tssvd(m, n, perform_check, timing):
     if timing:
         print(f"TSSVD elapsed Time: {total:.3f} ms")
 
+    return total
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
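
Since run_tssvd now returns the elapsed time, callers can collect timings across problem sizes. A hypothetical sweep (assuming timing is enabled so total is computed):

for m in (1 << 12, 1 << 14, 1 << 16):
    elapsed = run_tssvd(m, 64, perform_check=False, timing=True)
    print(f"m={m:>6}: {elapsed:.3f} ms")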

0 commit comments