Skip to content

Commit 04b1084

Browse files
authored
Enable cusolvermp (nv-legate#587)
1 parent 76ce882 commit 04b1084

13 files changed

Lines changed: 105 additions & 33 deletions

File tree

conda/conda-build/build.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ if [ -z "$CPU_ONLY" ]; then
2222
# cutensor, relying on the conda cutensor package
2323
CMAKE_ARGS+="
2424
-Dcutensor_DIR=$PREFIX
25-
-DCMAKE_CUDA_ARCHITECTURES=all-major"
25+
-DCMAKE_CUDA_ARCHITECTURES=all-major
26+
-DCUSOLVERMP_DIR=$PREFIX"
2627
else
2728
# When we build without cuda, we need to provide the location of curand
2829
CMAKE_ARGS+="

conda/conda-build/meta.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ requirements:
143143
- libcurand-dev
144144
- libcufile-dev
145145
- cuda-version ={{ cuda_version }}
146+
- libcusolvermp-dev
147+
- libcal-dev
146148
{% endif %}
147149

148150
run:

continuous_integration/scripts/build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ build_release_product() {
1010

1111
local conda_build_args=();
1212
# The channel sequence below needs to be preserved
13+
# The ucc140 label contains the provisional packages for UCC 1.4.0
14+
# TODO(marcinz): Needs to be removed when the real UCC 1.4.0 packages are available
15+
conda_build_args+=(-c legate/label/ucc140);
1316
conda_build_args+=(-c https://conda.anaconda.org/${CONDA_CHANNEL}/label/${CONDA_LABEL});
1417
conda_build_args+=(-c legate/label/ucc140);
1518
conda_build_args+=(-c conda-forge);

continuous_integration/scripts/test

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,21 @@ test_cupynumeric() {
5959

6060
cd "${REPO_DIR}";
6161

62+
export WORKERS=""
63+
if command -v nvidia-smi &> /dev/null; then
64+
gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
65+
if [ "$gpu_count" -ge 2 ]; then
66+
export WORKERS="-j 1"
67+
fi
68+
fi
69+
6270
case "$1" in
6371
"test")
6472
echo "Executing tests..."
6573
shift;
6674
setup_test_env;
6775
run_legate_issue;
68-
./test.py -vv --timeout 300 "$@"
76+
./test.py ${WORKERS} -vv --timeout 300 "$@"
6977
;;
7078
"mypy")
7179
echo "Installing and executing mypy..."

cupynumeric/linalg/_solve.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import legate.core.types as ty
2020
from legate.core import broadcast, get_legate_runtime
21+
from legate.settings import settings
2122

2223
from ..config import CuPyNumericOpCode
2324
from ..runtime import runtime
@@ -46,8 +47,8 @@ def solve_single(library: Library, a: LogicalStore, b: LogicalStore) -> None:
4647
task.execute()
4748

4849

49-
MIN_SOLVE_TILE_SIZE = 512
50-
MIN_SOLVE_MATRIX_SIZE = 2048
50+
MIN_SOLVE_TILE_SIZE = 2 if settings.test() else 512
51+
MIN_SOLVE_MATRIX_SIZE = 4 if settings.test() else 2048
5152

5253

5354
def mp_solve(
@@ -59,14 +60,24 @@ def mp_solve(
5960
b: LogicalStore,
6061
output: LogicalStore,
6162
) -> None:
62-
task = get_legate_runtime().create_auto_task(
63-
library, CuPyNumericOpCode.MP_SOLVE
63+
# coloring via num_procs to get utilization
64+
initial_color_shape_x = runtime.num_gpus
65+
tilesize_x = (n + initial_color_shape_x - 1) // initial_color_shape_x
66+
color_shape_x = (n + tilesize_x - 1) // tilesize_x
67+
68+
task = get_legate_runtime().create_manual_task(
69+
library, CuPyNumericOpCode.MP_SOLVE, (color_shape_x, 1)
6470
)
6571
task.throws_exception(LinAlgError)
66-
task.add_input(a)
67-
task.add_input(b)
68-
task.add_output(output)
69-
task.add_alignment(output, b)
72+
73+
tiled_a = a.partition_by_tiling((tilesize_x, n))
74+
tiled_b = b.partition_by_tiling((tilesize_x, nrhs))
75+
tiled_output = output.partition_by_tiling((tilesize_x, nrhs))
76+
77+
task.add_input(tiled_a)
78+
task.add_input(tiled_b)
79+
task.add_output(tiled_output)
80+
7081
task.add_scalar_arg(n, ty.int64)
7182
task.add_scalar_arg(nrhs, ty.int64)
7283
task.add_scalar_arg(nb, ty.int64)

cupynumeric/linalg/linalg.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -961,17 +961,23 @@ def _thunk_solve(
961961
b = b.astype(dtype)
962962

963963
if output is not None:
964-
out = output
965-
if out.shape != b.shape:
964+
if output.shape != b.shape:
966965
raise ValueError(
967966
f"Output shape mismatch: expected {b.shape}, "
968-
f"but found {out.shape}"
967+
f"but found {output.shape}"
969968
)
970-
elif out.dtype != b.dtype:
969+
elif output.dtype != b.dtype:
971970
raise TypeError(
972971
f"Output type mismatch: expected {b.dtype}, "
973-
f"but found {out.dtype}"
972+
f"but found {output.dtype}"
974973
)
974+
975+
expand_b = b.ndim == 1
976+
if expand_b:
977+
b = b.reshape((b.shape[0], 1))
978+
979+
if output is not None:
980+
out = output.reshape(b.shape)
975981
else:
976982
out = ndarray(
977983
shape=b.shape,
@@ -981,7 +987,12 @@ def _thunk_solve(
981987
b,
982988
),
983989
)
990+
984991
out._thunk.solve(a._thunk, b._thunk)
992+
993+
if expand_b:
994+
out = out.reshape((b.shape[0],))
995+
985996
return out
986997

987998

src/cupynumeric/mapper.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,20 @@ std::optional<std::size_t> CuPyNumericMapper::allocation_pool_size(
399399
}
400400
}
401401
}
402+
case CUPYNUMERIC_MP_POTRF:
403+
case CUPYNUMERIC_MP_SOLVE: {
404+
switch (memory_kind) {
405+
case legate::mapping::StoreTarget::FBMEM: [[fallthrough]];
406+
case legate::mapping::StoreTarget::ZCMEM: {
407+
return std::nullopt;
408+
}
409+
case legate::mapping::StoreTarget::SYSMEM: [[fallthrough]];
410+
case legate::mapping::StoreTarget::SOCKETMEM: {
411+
LEGATE_ABORT("CPU tasks shouldn't reach here");
412+
return 0;
413+
}
414+
}
415+
}
402416
case CUPYNUMERIC_NONZERO: {
403417
auto&& input = task.input(0);
404418
auto&& output = task.output(0);

src/cupynumeric/matrix/mp_potrf.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ static inline void mp_potrf_template(
5454
&device_buffer_size,
5555
&host_buffer_size));
5656

57+
// ensure non-empty buffers
58+
device_buffer_size = std::max(device_buffer_size, 1ul);
59+
host_buffer_size = std::max(host_buffer_size, 1ul);
60+
5761
auto device_buffer = create_buffer<int8_t>(device_buffer_size, Memory::Kind::GPU_FB_MEM);
5862
auto host_buffer = create_buffer<int8_t>(host_buffer_size, Memory::Kind::Z_COPY_MEM);
5963
auto info = create_buffer<int32_t>(1, Memory::Kind::Z_COPY_MEM);

src/cupynumeric/matrix/mp_potrf.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,12 @@ class MpPotrfTask : public CuPyNumericTask<MpPotrfTask> {
2525
static inline const auto TASK_CONFIG =
2626
legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_MP_POTRF}};
2727

28+
static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true).with_concurrent(true);
29+
2830
public:
2931
#if LEGATE_DEFINED(LEGATE_USE_CUDA)
3032
static void gpu_variant(legate::TaskContext context);
3133
#endif
3234
};
3335

34-
} // namespace cupynumeric
36+
} // namespace cupynumeric

src/cupynumeric/matrix/mp_solve.cu

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,14 @@ static inline void mp_solve_template(cal_comm_t comm,
8686
&getrs_device_buffer_size,
8787
&getrs_host_buffer_size));
8888

89-
auto device_buffer = create_buffer<int8_t>(
90-
std::max(getrf_device_buffer_size, getrs_device_buffer_size), Memory::Kind::GPU_FB_MEM);
91-
auto host_buffer = create_buffer<int8_t>(std::max(getrf_host_buffer_size, getrs_host_buffer_size),
92-
Memory::Kind::Z_COPY_MEM);
93-
auto info = create_buffer<int32_t>(1, Memory::Kind::Z_COPY_MEM);
89+
// ensure non-empty buffers
90+
size_t device_buffer_size =
91+
std::max(std::max(getrf_device_buffer_size, getrs_device_buffer_size), 1ul);
92+
size_t host_buffer_size = std::max(std::max(getrf_host_buffer_size, getrs_host_buffer_size), 1ul);
93+
94+
auto device_buffer = create_buffer<int8_t>(device_buffer_size, Memory::Kind::GPU_FB_MEM);
95+
auto host_buffer = create_buffer<int8_t>(host_buffer_size, Memory::Kind::Z_COPY_MEM);
96+
auto info = create_buffer<int32_t>(1, Memory::Kind::Z_COPY_MEM);
9497

9598
// initialize to zero
9699
info[0] = 0;
@@ -105,9 +108,9 @@ static inline void mp_solve_template(cal_comm_t comm,
105108
nullptr,
106109
cudaTypeToDataType<VAL>::type,
107110
device_buffer.ptr(0),
108-
getrf_device_buffer_size,
111+
device_buffer_size,
109112
host_buffer.ptr(0),
110-
getrf_host_buffer_size,
113+
host_buffer_size,
111114
info.ptr(0)));
112115

113116
if (info[0] != 0) {
@@ -129,9 +132,9 @@ static inline void mp_solve_template(cal_comm_t comm,
129132
b_desc,
130133
cudaTypeToDataType<VAL>::type,
131134
device_buffer.ptr(0),
132-
getrs_device_buffer_size,
135+
device_buffer_size,
133136
host_buffer.ptr(0),
134-
getrs_host_buffer_size,
137+
host_buffer_size,
135138
info.ptr(0)));
136139

137140
// TODO: We need a deferred exception to avoid this synchronization

0 commit comments

Comments (0)