Skip to content

Commit 04b1084

Browse files
authored
Enable cusolvermp (nv-legate#587)
1 parent 76ce882 commit 04b1084

13 files changed

Lines changed: 105 additions & 33 deletions

File tree

conda/conda-build/build.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ if [ -z "$CPU_ONLY" ]; then
2222
# cutensor, relying on the conda cutensor package
2323
CMAKE_ARGS+="
2424
-Dcutensor_DIR=$PREFIX
25-
-DCMAKE_CUDA_ARCHITECTURES=all-major"
25+
-DCMAKE_CUDA_ARCHITECTURES=all-major
26+
-DCUSOLVERMP_DIR=$PREFIX"
2627
else
2728
# When we build without cuda, we need to provide the location of curand
2829
CMAKE_ARGS+="

conda/conda-build/meta.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ requirements:
143143
- libcurand-dev
144144
- libcufile-dev
145145
- cuda-version ={{ cuda_version }}
146+
- libcusolvermp-dev
147+
- libcal-dev
146148
{% endif %}
147149

148150
run:

continuous_integration/scripts/build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ build_release_product() {
1010

1111
local conda_build_args=();
1212
# The channel sequence below needs to be preserved
13+
# The ucc140 label contains the provisional packages for UCC 1.4.0
14+
# TODO(marcinz): Needs to be removed when the real UCC 1.4.0 packages are available
15+
conda_build_args+=(-c legate/label/ucc140);
1316
conda_build_args+=(-c https://conda.anaconda.org/${CONDA_CHANNEL}/label/${CONDA_LABEL});
1417
conda_build_args+=(-c legate/label/ucc140);
1518
conda_build_args+=(-c conda-forge);

continuous_integration/scripts/test

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,21 @@ test_cupynumeric() {
5959

6060
cd "${REPO_DIR}";
6161

62+
export WORKERS=""
63+
if command -v nvidia-smi &> /dev/null; then
64+
gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
65+
if [ "$gpu_count" -ge 2 ]; then
66+
export WORKERS="-j 1"
67+
fi
68+
fi
69+
6270
case "$1" in
6371
"test")
6472
echo "Executing tests..."
6573
shift;
6674
setup_test_env;
6775
run_legate_issue;
68-
./test.py -vv --timeout 300 "$@"
76+
./test.py ${WORKERS} -vv --timeout 300 "$@"
6977
;;
7078
"mypy")
7179
echo "Installing and executing mypy..."

cupynumeric/linalg/_solve.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import legate.core.types as ty
2020
from legate.core import broadcast, get_legate_runtime
21+
from legate.settings import settings
2122

2223
from ..config import CuPyNumericOpCode
2324
from ..runtime import runtime
@@ -46,8 +47,8 @@ def solve_single(library: Library, a: LogicalStore, b: LogicalStore) -> None:
4647
task.execute()
4748

4849

49-
MIN_SOLVE_TILE_SIZE = 512
50-
MIN_SOLVE_MATRIX_SIZE = 2048
50+
MIN_SOLVE_TILE_SIZE = 2 if settings.test() else 512
51+
MIN_SOLVE_MATRIX_SIZE = 4 if settings.test() else 2048
5152

5253

5354
def mp_solve(
@@ -59,14 +60,24 @@ def mp_solve(
5960
b: LogicalStore,
6061
output: LogicalStore,
6162
) -> None:
62-
task = get_legate_runtime().create_auto_task(
63-
library, CuPyNumericOpCode.MP_SOLVE
63+
# coloring via num_procs to get utilization
64+
initial_color_shape_x = runtime.num_gpus
65+
tilesize_x = (n + initial_color_shape_x - 1) // initial_color_shape_x
66+
color_shape_x = (n + tilesize_x - 1) // tilesize_x
67+
68+
task = get_legate_runtime().create_manual_task(
69+
library, CuPyNumericOpCode.MP_SOLVE, (color_shape_x, 1)
6470
)
6571
task.throws_exception(LinAlgError)
66-
task.add_input(a)
67-
task.add_input(b)
68-
task.add_output(output)
69-
task.add_alignment(output, b)
72+
73+
tiled_a = a.partition_by_tiling((tilesize_x, n))
74+
tiled_b = b.partition_by_tiling((tilesize_x, nrhs))
75+
tiled_output = output.partition_by_tiling((tilesize_x, nrhs))
76+
77+
task.add_input(tiled_a)
78+
task.add_input(tiled_b)
79+
task.add_output(tiled_output)
80+
7081
task.add_scalar_arg(n, ty.int64)
7182
task.add_scalar_arg(nrhs, ty.int64)
7283
task.add_scalar_arg(nb, ty.int64)

cupynumeric/linalg/linalg.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -961,17 +961,23 @@ def _thunk_solve(
961961
b = b.astype(dtype)
962962

963963
if output is not None:
964-
out = output
965-
if out.shape != b.shape:
964+
if output.shape != b.shape:
966965
raise ValueError(
967966
f"Output shape mismatch: expected {b.shape}, "
968-
f"but found {out.shape}"
967+
f"but found {output.shape}"
969968
)
970-
elif out.dtype != b.dtype:
969+
elif output.dtype != b.dtype:
971970
raise TypeError(
972971
f"Output type mismatch: expected {b.dtype}, "
973-
f"but found {out.dtype}"
972+
f"but found {output.dtype}"
974973
)
974+
975+
expand_b = b.ndim == 1
976+
if expand_b:
977+
b = b.reshape((b.shape[0], 1))
978+
979+
if output is not None:
980+
out = output.reshape(b.shape)
975981
else:
976982
out = ndarray(
977983
shape=b.shape,
@@ -981,7 +987,12 @@ def _thunk_solve(
981987
b,
982988
),
983989
)
990+
984991
out._thunk.solve(a._thunk, b._thunk)
992+
993+
if expand_b:
994+
out = out.reshape((b.shape[0],))
995+
985996
return out
986997

987998

src/cupynumeric/mapper.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,20 @@ std::optional<std::size_t> CuPyNumericMapper::allocation_pool_size(
399399
}
400400
}
401401
}
402+
case CUPYNUMERIC_MP_POTRF:
403+
case CUPYNUMERIC_MP_SOLVE: {
404+
switch (memory_kind) {
405+
case legate::mapping::StoreTarget::FBMEM: [[fallthrough]];
406+
case legate::mapping::StoreTarget::ZCMEM: {
407+
return std::nullopt;
408+
}
409+
case legate::mapping::StoreTarget::SYSMEM: [[fallthrough]];
410+
case legate::mapping::StoreTarget::SOCKETMEM: {
411+
LEGATE_ABORT("CPU tasks shouldn't reach here");
412+
return 0;
413+
}
414+
}
415+
}
402416
case CUPYNUMERIC_NONZERO: {
403417
auto&& input = task.input(0);
404418
auto&& output = task.output(0);

src/cupynumeric/matrix/mp_potrf.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ static inline void mp_potrf_template(
5454
&device_buffer_size,
5555
&host_buffer_size));
5656

57+
// ensure non-empty buffers
58+
device_buffer_size = std::max(device_buffer_size, 1ul);
59+
host_buffer_size = std::max(host_buffer_size, 1ul);
60+
5761
auto device_buffer = create_buffer<int8_t>(device_buffer_size, Memory::Kind::GPU_FB_MEM);
5862
auto host_buffer = create_buffer<int8_t>(host_buffer_size, Memory::Kind::Z_COPY_MEM);
5963
auto info = create_buffer<int32_t>(1, Memory::Kind::Z_COPY_MEM);

src/cupynumeric/matrix/mp_potrf.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,12 @@ class MpPotrfTask : public CuPyNumericTask<MpPotrfTask> {
2525
static inline const auto TASK_CONFIG =
2626
legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_MP_POTRF}};
2727

28+
static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true).with_concurrent(true);
29+
2830
public:
2931
#if LEGATE_DEFINED(LEGATE_USE_CUDA)
3032
static void gpu_variant(legate::TaskContext context);
3133
#endif
3234
};
3335

34-
} // namespace cupynumeric
36+
} // namespace cupynumeric

src/cupynumeric/matrix/mp_solve.cu

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,14 @@ static inline void mp_solve_template(cal_comm_t comm,
8686
&getrs_device_buffer_size,
8787
&getrs_host_buffer_size));
8888

89-
auto device_buffer = create_buffer<int8_t>(
90-
std::max(getrf_device_buffer_size, getrs_device_buffer_size), Memory::Kind::GPU_FB_MEM);
91-
auto host_buffer = create_buffer<int8_t>(std::max(getrf_host_buffer_size, getrs_host_buffer_size),
92-
Memory::Kind::Z_COPY_MEM);
93-
auto info = create_buffer<int32_t>(1, Memory::Kind::Z_COPY_MEM);
89+
// ensure non-empty buffers
90+
size_t device_buffer_size =
91+
std::max(std::max(getrf_device_buffer_size, getrs_device_buffer_size), 1ul);
92+
size_t host_buffer_size = std::max(std::max(getrf_host_buffer_size, getrs_host_buffer_size), 1ul);
93+
94+
auto device_buffer = create_buffer<int8_t>(device_buffer_size, Memory::Kind::GPU_FB_MEM);
95+
auto host_buffer = create_buffer<int8_t>(host_buffer_size, Memory::Kind::Z_COPY_MEM);
96+
auto info = create_buffer<int32_t>(1, Memory::Kind::Z_COPY_MEM);
9497

9598
// initialize to zero
9699
info[0] = 0;
@@ -105,9 +108,9 @@ static inline void mp_solve_template(cal_comm_t comm,
105108
nullptr,
106109
cudaTypeToDataType<VAL>::type,
107110
device_buffer.ptr(0),
108-
getrf_device_buffer_size,
111+
device_buffer_size,
109112
host_buffer.ptr(0),
110-
getrf_host_buffer_size,
113+
host_buffer_size,
111114
info.ptr(0)));
112115

113116
if (info[0] != 0) {
@@ -129,9 +132,9 @@ static inline void mp_solve_template(cal_comm_t comm,
129132
b_desc,
130133
cudaTypeToDataType<VAL>::type,
131134
device_buffer.ptr(0),
132-
getrs_device_buffer_size,
135+
device_buffer_size,
133136
host_buffer.ptr(0),
134-
getrs_host_buffer_size,
137+
host_buffer_size,
135138
info.ptr(0)));
136139

137140
// TODO: We need a deferred exception to avoid this synchronization

0 commit comments

Comments (0)