Skip to content

Commit 0330a98

Browse files
authored
CusolverMP re-org to accommodate updated version 0.7 (nv-legate#1013)
* CusolverMP re-org to accommodate ncclComm_t. * CusolverMP re-org clean-up. * CAL dependencies clean-up. * build deps clean-up. * Addressed reviews on meta.yaml and task.add_cal_com... removal. * Removed cal.h inclusion.
1 parent 5890e1d commit 0330a98

8 files changed

Lines changed: 44 additions & 77 deletions

File tree

conda/conda-build/meta.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,7 @@ requirements:
161161
- libcurand-dev
162162
- libcufile-dev
163163
- cuda-version ={{ cuda_version }}
164-
- libcusolvermp-dev
165-
- libcal-dev
164+
- libcusolvermp-dev >=0.7
166165
{% endif %}
167166

168167
run:

cupynumeric/linalg/_cholesky.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ def mp_potrf(
9595
task.add_scalar_arg(n, ty.int64)
9696
task.add_scalar_arg(nb, ty.int64)
9797
task.add_nccl_communicator() # for repartitioning
98-
task.add_cal_communicator()
9998
task.execute()
10099

101100

cupynumeric/linalg/_solve.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ def mp_solve(
9292
task.add_scalar_arg(nrhs, ty.int64)
9393
task.add_scalar_arg(nb, ty.int64)
9494
task.add_nccl_communicator() # for repartitioning
95-
task.add_cal_communicator()
9695
task.execute()
9796

9897

src/cupynumeric/cuda_help.h

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
#include <cusolverDn.h>
2828
#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP)
2929
#include <cusolverMp.h>
30-
#include <cal.h>
3130
#endif
3231
#include <cuda_runtime.h>
3332
#include <cufft.h>
@@ -108,24 +107,6 @@ __host__ inline void check_cusolver(cusolverStatus_t status, const char* file, i
108107
}
109108
}
110109

111-
#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP)
112-
__host__ inline void check_cal(calError_t status, const char* file, int line)
113-
{
114-
if (status != CAL_OK) {
115-
fprintf(stderr,
116-
"Internal libcal failure with error code %d in file %s at line %d\n",
117-
status,
118-
file,
119-
line);
120-
#ifdef DEBUG_CUPYNUMERIC
121-
assert(false);
122-
#else
123-
exit(status);
124-
#endif
125-
}
126-
}
127-
#endif
128-
129110
__host__ inline void check_cutensor(cutensorStatus_t result, const char* file, int line)
130111
{
131112
if (result != CUTENSOR_STATUS_SUCCESS) {
@@ -179,12 +160,6 @@ __host__ inline void check_nccl(ncclResult_t error, const char* file, int line)
179160
cupynumeric::check_cusolver(__result__, __FILE__, __LINE__); \
180161
} while (false)
181162

182-
#define CHECK_CAL(expr) \
183-
do { \
184-
calError_t __result__ = (expr); \
185-
cupynumeric::check_cal(__result__, __FILE__, __LINE__); \
186-
} while (false)
187-
188163
#define CHECK_CUTENSOR(expr) \
189164
do { \
190165
cutensorStatus_t __result__ = (expr); \

src/cupynumeric/matrix/mp_potrf.cu

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ namespace cupynumeric {
2424
using namespace Legion;
2525
using namespace legate;
2626

27-
template <typename VAL>
28-
static inline void mp_potrf_template(cal_comm_t comm,
27+
template <typename VAL, typename comm_t>
28+
static inline void mp_potrf_template(comm_t comm,
2929
int nprow,
3030
int npcol,
3131
int64_t n,
@@ -92,7 +92,7 @@ static inline void mp_potrf_template(cal_comm_t comm,
9292
info.ptr(0)));
9393

9494
// TODO: We need a deferred exception to avoid this synchronization
95-
CHECK_CAL(cal_stream_sync(comm, stream));
95+
CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream));
9696
CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
9797

9898
CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(desc));
@@ -108,8 +108,9 @@ struct MpPotrfImplBody<VariantKind::GPU, Type::Code::FLOAT32> {
108108
TaskContext context;
109109
explicit MpPotrfImplBody(TaskContext context) : context(context) {}
110110

111+
template <typename comm_t>
111112
void operator()(
112-
cal_comm_t comm, int nprow, int npcol, int64_t n, int64_t nb, float* array, int64_t lld)
113+
comm_t comm, int nprow, int npcol, int64_t n, int64_t nb, float* array, int64_t lld)
113114
{
114115
auto stream = context.get_task_stream();
115116
mp_potrf_template(comm, nprow, npcol, n, nb, array, lld, stream);
@@ -121,8 +122,9 @@ struct MpPotrfImplBody<VariantKind::GPU, Type::Code::FLOAT64> {
121122
TaskContext context;
122123
explicit MpPotrfImplBody(TaskContext context) : context(context) {}
123124

125+
template <typename comm_t>
124126
void operator()(
125-
cal_comm_t comm, int nprow, int npcol, int64_t n, int64_t nb, double* array, int64_t lld)
127+
comm_t comm, int nprow, int npcol, int64_t n, int64_t nb, double* array, int64_t lld)
126128
{
127129
auto stream = context.get_task_stream();
128130
mp_potrf_template(comm, nprow, npcol, n, nb, array, lld, stream);
@@ -134,13 +136,9 @@ struct MpPotrfImplBody<VariantKind::GPU, Type::Code::COMPLEX64> {
134136
TaskContext context;
135137
explicit MpPotrfImplBody(TaskContext context) : context(context) {}
136138

137-
void operator()(cal_comm_t comm,
138-
int nprow,
139-
int npcol,
140-
int64_t n,
141-
int64_t nb,
142-
complex<float>* array,
143-
int64_t lld)
139+
template <typename comm_t>
140+
void operator()(
141+
comm_t comm, int nprow, int npcol, int64_t n, int64_t nb, complex<float>* array, int64_t lld)
144142
{
145143
auto stream = context.get_task_stream();
146144
mp_potrf_template(comm, nprow, npcol, n, nb, reinterpret_cast<cuComplex*>(array), lld, stream);
@@ -152,13 +150,9 @@ struct MpPotrfImplBody<VariantKind::GPU, Type::Code::COMPLEX128> {
152150
TaskContext context;
153151
explicit MpPotrfImplBody(TaskContext context) : context(context) {}
154152

155-
void operator()(cal_comm_t comm,
156-
int nprow,
157-
int npcol,
158-
int64_t n,
159-
int64_t nb,
160-
complex<double>* array,
161-
int64_t lld)
153+
template <typename comm_t>
154+
void operator()(
155+
comm_t comm, int nprow, int npcol, int64_t n, int64_t nb, complex<double>* array, int64_t lld)
162156
{
163157
auto stream = context.get_task_stream();
164158
mp_potrf_template(

src/cupynumeric/matrix/mp_potrf_template.inl

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@
2525
#include "cupynumeric/cuda_help.h"
2626
#include "cupynumeric/utilities/repartition.h"
2727

28-
#include <cal.h>
29-
3028
namespace cupynumeric {
3129

3230
using namespace Legion;
@@ -64,12 +62,13 @@ struct MpPotrfImpl {
6462
auto input_shape = input_array.shape<2>();
6563
auto output_shape = output_array.shape<2>();
6664

65+
auto* p_nccl_comm = comms[0].get<ncclComm_t*>();
6766
int rank, num_ranks;
68-
auto nccl_comm = comms[0];
69-
auto cal_comm = comms[1].get<cal_comm_t>();
70-
assert(cal_comm);
71-
CHECK_CAL(cal_comm_get_rank(cal_comm, &rank));
72-
CHECK_CAL(cal_comm_get_size(cal_comm, &num_ranks));
67+
assert(p_nccl_comm);
68+
auto nccl_comm = *p_nccl_comm;
69+
CHECK_NCCL(ncclCommUserRank(nccl_comm, &rank));
70+
CHECK_NCCL(ncclCommCount(nccl_comm, &num_ranks));
71+
7372
assert(launch_domain.get_volume() == num_ranks);
7473
assert(launch_domain.get_dim() <= 2);
7574

@@ -115,10 +114,10 @@ struct MpPotrfImpl {
115114
auto volume = num_rows * num_cols;
116115

117116
auto [buffer_2dbc, volume_2dbc, lld_2dbc] = repartition_matrix_2dbc(
118-
input_arr, volume, false, offset_r, offset_c, lld, nprow, npcol, nb, nb, nccl_comm, context);
117+
input_arr, volume, false, offset_r, offset_c, lld, nprow, npcol, nb, nb, comms[0], context);
119118

120119
MpPotrfImplBody<KIND, CODE>{context}(
121-
cal_comm, nprow, npcol, n, nb, buffer_2dbc.ptr(0), lld_2dbc);
120+
nccl_comm, nprow, npcol, n, nb, buffer_2dbc.ptr(0), lld_2dbc);
122121

123122
repartition_matrix_block(buffer_2dbc,
124123
volume_2dbc,
@@ -136,7 +135,7 @@ struct MpPotrfImpl {
136135
false,
137136
offset_r,
138137
offset_c,
139-
nccl_comm,
138+
comms[0],
140139
context);
141140
}
142141

src/cupynumeric/matrix/mp_solve.cu

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ namespace cupynumeric {
2424
using namespace Legion;
2525
using namespace legate;
2626

27-
template <typename VAL>
28-
static inline void mp_solve_template(cal_comm_t comm,
27+
template <typename VAL, typename comm_t>
28+
static inline void mp_solve_template(comm_t comm,
2929
int nprow,
3030
int npcol,
3131
int64_t n,
@@ -145,7 +145,7 @@ static inline void mp_solve_template(cal_comm_t comm,
145145
info.ptr(0)));
146146

147147
// TODO: We need a deferred exception to avoid this synchronization
148-
CHECK_CAL(cal_stream_sync(comm, stream));
148+
CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream));
149149
CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
150150

151151
CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(a_desc));
@@ -163,7 +163,8 @@ struct MpSolveImplBody<VariantKind::GPU, Type::Code::FLOAT32> {
163163
TaskContext context;
164164
explicit MpSolveImplBody(TaskContext context) : context(context) {}
165165

166-
void operator()(cal_comm_t comm,
166+
template <typename comm_t>
167+
void operator()(comm_t comm,
167168
int nprow,
168169
int npcol,
169170
int64_t n,
@@ -184,7 +185,8 @@ struct MpSolveImplBody<VariantKind::GPU, Type::Code::FLOAT64> {
184185
TaskContext context;
185186
explicit MpSolveImplBody(TaskContext context) : context(context) {}
186187

187-
void operator()(cal_comm_t comm,
188+
template <typename comm_t>
189+
void operator()(comm_t comm,
188190
int nprow,
189191
int npcol,
190192
int64_t n,
@@ -205,7 +207,8 @@ struct MpSolveImplBody<VariantKind::GPU, Type::Code::COMPLEX64> {
205207
TaskContext context;
206208
explicit MpSolveImplBody(TaskContext context) : context(context) {}
207209

208-
void operator()(cal_comm_t comm,
210+
template <typename comm_t>
211+
void operator()(comm_t comm,
209212
int nprow,
210213
int npcol,
211214
int64_t n,
@@ -236,7 +239,8 @@ struct MpSolveImplBody<VariantKind::GPU, Type::Code::COMPLEX128> {
236239
TaskContext context;
237240
explicit MpSolveImplBody(TaskContext context) : context(context) {}
238241

239-
void operator()(cal_comm_t comm,
242+
template <typename comm_t>
243+
void operator()(comm_t comm,
240244
int nprow,
241245
int npcol,
242246
int64_t n,

src/cupynumeric/matrix/mp_solve_template.inl

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@
2525
#include "cupynumeric/cuda_help.h"
2626
#include "cupynumeric/utilities/repartition.h"
2727

28-
#include <cal.h>
29-
3028
namespace cupynumeric {
3129

3230
using namespace Legion;
@@ -63,13 +61,13 @@ struct MpSolveImpl {
6361
{
6462
using VAL = type_of<CODE>;
6563

66-
auto nccl_comm = comms[0];
67-
auto cal_comm = comms[1].get<cal_comm_t>();
68-
64+
auto* p_nccl_comm = comms[0].get<ncclComm_t*>();
6965
int rank, num_ranks;
70-
assert(cal_comm);
71-
CHECK_CAL(cal_comm_get_rank(cal_comm, &rank));
72-
CHECK_CAL(cal_comm_get_size(cal_comm, &num_ranks));
66+
assert(p_nccl_comm);
67+
auto nccl_comm = *p_nccl_comm;
68+
CHECK_NCCL(ncclCommUserRank(nccl_comm, &rank));
69+
CHECK_NCCL(ncclCommCount(nccl_comm, &num_ranks));
70+
7371
assert(launch_domain.get_volume() == num_ranks);
7472
assert(launch_domain.get_dim() <= 2);
7573

@@ -127,7 +125,7 @@ struct MpSolveImpl {
127125
npcol,
128126
nb,
129127
nb,
130-
nccl_comm,
128+
comms[0],
131129
context);
132130

133131
auto b_offset_r = b_shape.lo[0];
@@ -144,10 +142,10 @@ struct MpSolveImpl {
144142
npcol,
145143
nb,
146144
nb,
147-
nccl_comm,
145+
comms[0],
148146
context);
149147

150-
MpSolveImplBody<KIND, CODE>{context}(cal_comm,
148+
MpSolveImplBody<KIND, CODE>{context}(nccl_comm,
151149
nprow,
152150
npcol,
153151
n,
@@ -177,7 +175,7 @@ struct MpSolveImpl {
177175
false, // x_shape is enforced col-major
178176
b_offset_r,
179177
b_offset_c,
180-
nccl_comm,
178+
comms[0],
181179
context);
182180
}
183181

0 commit comments

Comments
 (0)