Skip to content

Commit c44a397

Browse files
Merge pull request #345 from KernelTuner/fix-cuda-python
Use the new cuda-python modules
2 parents 5160017 + 9bb2fcb commit c44a397

7 files changed

Lines changed: 130 additions & 97 deletions

File tree

kernel_tuner/backends/nvcuda.py

Lines changed: 45 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -5,18 +5,23 @@
55

66
from kernel_tuner.backends.backend import GPUBackend
77
from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
8-
from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc
8+
from kernel_tuner.util import SkippableFailure
9+
from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc
910

1011
# embedded in try block to be able to generate documentation
1112
# and run tests without cuda-python installed
1213
try:
13-
from cuda import cuda, cudart, nvrtc
14+
from cuda.bindings import driver, runtime, nvrtc
1415
except ImportError:
15-
cuda = None
16+
try:
17+
# backward compatibility hack for older cuda-python versions
18+
from cuda import cuda as driver, cudart as runtime, nvrtc as nvrtc
19+
except ImportError:
20+
driver = None
1621

1722

1823
class CudaFunctions(GPUBackend):
19-
"""Class that groups the Cuda functions on maintains state about the device."""
24+
"""Class that groups the Cuda functions and it maintains state about the device."""
2025

2126
def __init__(self, device=0, iterations=7, compiler_options=None, observers=None):
2227
"""Instantiate CudaFunctions object used for interacting with the CUDA device.
@@ -38,34 +43,30 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
3843
"""
3944
self.allocations = []
4045
self.texrefs = []
41-
if not cuda:
46+
if not driver:
4247
raise ImportError(
4348
"cuda-python not installed, install using 'pip install cuda-python', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#cuda-and-pycuda."
4449
)
4550

4651
# initialize and select device
47-
err = cuda.cuInit(0)
52+
err = driver.cuInit(0)
4853
cuda_error_check(err)
49-
err, self.device = cuda.cuDeviceGet(device)
54+
err, self.device = driver.cuDeviceGet(device)
5055
cuda_error_check(err)
51-
err, self.context = cuda.cuDevicePrimaryCtxRetain(device)
56+
err, self.context = driver.cuDevicePrimaryCtxRetain(device)
5257
cuda_error_check(err)
5358
if CudaFunctions.last_selected_device != device:
54-
err = cuda.cuCtxSetCurrent(self.context)
59+
err = driver.cuCtxSetCurrent(self.context)
5560
cuda_error_check(err)
5661
CudaFunctions.last_selected_device = device
5762

5863
# compute capabilities and device properties
59-
err, major = cudart.cudaDeviceGetAttribute(
60-
cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device
61-
)
64+
err, major = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device)
6265
cuda_error_check(err)
63-
err, minor = cudart.cudaDeviceGetAttribute(
64-
cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device
65-
)
66+
err, minor = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device)
6667
cuda_error_check(err)
67-
err, self.max_threads = cudart.cudaDeviceGetAttribute(
68-
cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
68+
err, self.max_threads = runtime.cudaDeviceGetAttribute(
69+
runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
6970
)
7071
cuda_error_check(err)
7172
self.cc = f"{major}{minor}"
@@ -78,11 +79,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
7879
self.compiler_options_bytes.append(str(option).encode("UTF-8"))
7980

8081
# create a stream and events
81-
err, self.stream = cuda.cuStreamCreate(0)
82+
err, self.stream = driver.cuStreamCreate(0)
8283
cuda_error_check(err)
83-
err, self.start = cuda.cuEventCreate(0)
84+
err, self.start = driver.cuEventCreate(0)
8485
cuda_error_check(err)
85-
err, self.end = cuda.cuEventCreate(0)
86+
err, self.end = driver.cuEventCreate(0)
8687
cuda_error_check(err)
8788

8889
# default dynamically allocated shared memory size, can be overwritten using smem_args
@@ -95,11 +96,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
9596
observer.register_device(self)
9697

9798
# collect environment information
98-
err, device_properties = cudart.cudaGetDeviceProperties(device)
99+
err, device_properties = runtime.cudaGetDeviceProperties(device)
99100
cuda_error_check(err)
100101
env = dict()
101102
env["device_name"] = device_properties.name.decode()
102-
env["cuda_version"] = cuda.CUDA_VERSION
103+
env["cuda_version"] = driver.CUDA_VERSION
103104
env["compute_capability"] = self.cc
104105
env["iterations"] = self.iterations
105106
env["compiler_options"] = self.compiler_options
@@ -109,8 +110,8 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
109110

110111
def __del__(self):
111112
for device_memory in self.allocations:
112-
if isinstance(device_memory, cuda.CUdeviceptr):
113-
err = cuda.cuMemFree(device_memory)
113+
if isinstance(device_memory, driver.CUdeviceptr):
114+
err = driver.cuMemFree(device_memory)
114115
cuda_error_check(err)
115116

116117
def ready_argument_list(self, arguments):
@@ -128,7 +129,7 @@ def ready_argument_list(self, arguments):
128129
for arg in arguments:
129130
# if arg is a numpy array copy it to device
130131
if isinstance(arg, np.ndarray):
131-
err, device_memory = cuda.cuMemAlloc(arg.nbytes)
132+
err, device_memory = driver.cuMemAlloc(arg.nbytes)
132133
cuda_error_check(err)
133134
self.allocations.append(device_memory)
134135
gpu_args.append(device_memory)
@@ -164,38 +165,30 @@ def compile(self, kernel_instance):
164165
if not any(["--std=" in opt for opt in self.compiler_options]):
165166
self.compiler_options.append("--std=c++11")
166167
if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
167-
compiler_options.append(
168-
f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
169-
)
168+
compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
170169
if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
171170
self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
172171

173-
err, program = nvrtc.nvrtcCreateProgram(
174-
str.encode(kernel_string), b"CUDAProgram", 0, [], []
175-
)
172+
err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
176173
try:
177174
cuda_error_check(err)
178-
err = nvrtc.nvrtcCompileProgram(
179-
program, len(compiler_options), compiler_options
180-
)
175+
err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
181176
cuda_error_check(err)
182177
err, size = nvrtc.nvrtcGetPTXSize(program)
183178
cuda_error_check(err)
184179
buff = b" " * size
185180
err = nvrtc.nvrtcGetPTX(program, buff)
186181
cuda_error_check(err)
187-
err, self.current_module = cuda.cuModuleLoadData(np.char.array(buff))
188-
if err == cuda.CUresult.CUDA_ERROR_INVALID_PTX:
182+
err, self.current_module = driver.cuModuleLoadData(np.char.array(buff))
183+
if err == driver.CUresult.CUDA_ERROR_INVALID_PTX:
189184
raise SkippableFailure("uses too much shared data")
190185
else:
191186
cuda_error_check(err)
192-
err, self.func = cuda.cuModuleGetFunction(
193-
self.current_module, str.encode(kernel_name)
194-
)
187+
err, self.func = driver.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
195188
cuda_error_check(err)
196189

197190
# get the number of registers per thread used in this kernel
198-
num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func)
191+
num_regs = driver.cuFuncGetAttribute(driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func)
199192
assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccessful: code {num_regs[0]}"
200193
self.num_regs = num_regs[1]
201194

@@ -210,26 +203,26 @@ def compile(self, kernel_instance):
210203

211204
def start_event(self):
212205
"""Records the event that marks the start of a measurement."""
213-
err = cudart.cudaEventRecord(self.start, self.stream)
206+
err = runtime.cudaEventRecord(self.start, self.stream)
214207
cuda_error_check(err)
215208

216209
def stop_event(self):
217210
"""Records the event that marks the end of a measurement."""
218-
err = cudart.cudaEventRecord(self.end, self.stream)
211+
err = runtime.cudaEventRecord(self.end, self.stream)
219212
cuda_error_check(err)
220213

221214
def kernel_finished(self):
222215
"""Returns True if the kernel has finished, False otherwise."""
223-
err = cudart.cudaEventQuery(self.end)
224-
if err[0] == cudart.cudaError_t.cudaSuccess:
216+
err = runtime.cudaEventQuery(self.end)
217+
if err[0] == runtime.cudaError_t.cudaSuccess:
225218
return True
226219
else:
227220
return False
228221

229222
@staticmethod
230223
def synchronize():
231224
"""Halts execution until device has finished its tasks."""
232-
err = cudart.cudaDeviceSynchronize()
225+
err = runtime.cudaDeviceSynchronize()
233226
cuda_error_check(err)
234227

235228
def copy_constant_memory_args(self, cmem_args):
@@ -243,9 +236,9 @@ def copy_constant_memory_args(self, cmem_args):
243236
:type cmem_args: dict( string: numpy.ndarray, ... )
244237
"""
245238
for k, v in cmem_args.items():
246-
err, symbol, _ = cuda.cuModuleGetGlobal(self.current_module, str.encode(k))
239+
err, symbol, _ = driver.cuModuleGetGlobal(self.current_module, str.encode(k))
247240
cuda_error_check(err)
248-
err = cuda.cuMemcpyHtoD(symbol, v, v.nbytes)
241+
err = driver.cuMemcpyHtoD(symbol, v, v.nbytes)
249242
cuda_error_check(err)
250243

251244
def copy_shared_memory_args(self, smem_args):
@@ -284,12 +277,12 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
284277
stream = self.stream
285278
arg_types = list()
286279
for arg in gpu_args:
287-
if isinstance(arg, cuda.CUdeviceptr):
280+
if isinstance(arg, driver.CUdeviceptr):
288281
arg_types.append(None)
289282
else:
290283
arg_types.append(np.ctypeslib.as_ctypes_type(arg.dtype))
291284
kernel_args = (tuple(gpu_args), tuple(arg_types))
292-
err = cuda.cuLaunchKernel(
285+
err = driver.cuLaunchKernel(
293286
func,
294287
grid[0],
295288
grid[1],
@@ -318,7 +311,7 @@ def memset(allocation, value, size):
318311
:type size: int
319312
320313
"""
321-
err = cudart.cudaMemset(allocation, value, size)
314+
err = runtime.cudaMemset(allocation, value, size)
322315
cuda_error_check(err)
323316

324317
@staticmethod
@@ -331,7 +324,7 @@ def memcpy_dtoh(dest, src):
331324
:param src: A GPU memory allocation unit
332325
:type src: cuda.CUdeviceptr
333326
"""
334-
err = cuda.cuMemcpyDtoH(dest, src, dest.nbytes)
327+
err = driver.cuMemcpyDtoH(dest, src, dest.nbytes)
335328
cuda_error_check(err)
336329

337330
@staticmethod
@@ -344,7 +337,7 @@ def memcpy_htod(dest, src):
344337
:param src: A numpy array in host memory to store the data
345338
:type src: numpy.ndarray
346339
"""
347-
err = cuda.cuMemcpyHtoD(dest, src, src.nbytes)
340+
err = driver.cuMemcpyHtoD(dest, src, src.nbytes)
348341
cuda_error_check(err)
349342

350343
units = {"time": "ms"}

kernel_tuner/observers/nvcuda.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,16 @@
11
import numpy as np
22

33
try:
4-
from cuda import cudart
4+
from cuda.bindings import runtime
55
except ImportError:
6-
cuda = None
6+
try:
7+
# backward compatibility hack for older cuda-python versions
8+
from cuda import cudart as runtime
9+
except ImportError:
10+
cuda = None
711

812
from kernel_tuner.observers.observer import BenchmarkObserver
9-
from kernel_tuner.util import cuda_error_check
13+
from kernel_tuner.utils.nvcuda import cuda_error_check
1014

1115

1216
class CudaRuntimeObserver(BenchmarkObserver):
@@ -21,7 +25,7 @@ def __init__(self, dev):
2125

2226
def after_finish(self):
2327
# Time is measured in milliseconds
24-
err, time = cudart.cudaEventElapsedTime(self.start, self.end)
28+
err, time = runtime.cudaEventElapsedTime(self.start, self.end)
2529
cuda_error_check(err)
2630
self.times.append(time)
2731

kernel_tuner/util.py

Lines changed: 0 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -38,10 +38,6 @@
3838
import cupy as cp
3939
except ImportError:
4040
cp = np
41-
try:
42-
from cuda import cuda, cudart, nvrtc
43-
except ImportError:
44-
cuda = None
4541

4642
from kernel_tuner.observers.nvml import NVMLObserver
4743

@@ -642,14 +638,6 @@ def get_total_timings(results, env, overhead_time):
642638
return env
643639

644640

645-
NVRTC_VALID_CC = np.array(["50", "52", "53", "60", "61", "62", "70", "72", "75", "80", "87", "89", "90", "90a"])
646-
647-
648-
def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
649-
"""Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
650-
return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="52")
651-
652-
653641
def print_config(config, tuning_options, runner):
654642
"""Print the configuration string with tunable parameters and benchmark results."""
655643
print_config_output(tuning_options.tune_params, config, runner.quiet, tuning_options.metrics, runner.units)
@@ -1315,19 +1303,3 @@ def dump_cache(obj: str, tuning_options):
13151303
if isinstance(tuning_options.cache, dict) and tuning_options.cachefile:
13161304
with open(tuning_options.cachefile, "a") as cachefile:
13171305
cachefile.write(obj)
1318-
1319-
1320-
def cuda_error_check(error):
1321-
"""Checking the status of CUDA calls using the NVIDIA cuda-python backend."""
1322-
if isinstance(error, cuda.CUresult):
1323-
if error != cuda.CUresult.CUDA_SUCCESS:
1324-
_, name = cuda.cuGetErrorName(error)
1325-
raise RuntimeError(f"CUDA error: {name.decode()}")
1326-
elif isinstance(error, cudart.cudaError_t):
1327-
if error != cudart.cudaError_t.cudaSuccess:
1328-
_, name = cudart.getErrorName(error)
1329-
raise RuntimeError(f"CUDART error: {name.decode()}")
1330-
elif isinstance(error, nvrtc.nvrtcResult):
1331-
if error != nvrtc.nvrtcResult.NVRTC_SUCCESS:
1332-
_, desc = nvrtc.nvrtcGetErrorString(error)
1333-
raise RuntimeError(f"NVRTC error: {desc.decode()}")

kernel_tuner/utils/nvcuda.py

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,63 @@
1+
"""Module for kernel tuner cuda-python utility functions."""
2+
3+
import numpy as np
4+
5+
try:
6+
from cuda.bindings import driver, runtime, nvrtc
7+
except ImportError:
8+
cuda = None
9+
10+
NVRTC_VALID_CC = np.array(
11+
[
12+
"50",
13+
"52",
14+
"53",
15+
"60",
16+
"61",
17+
"62",
18+
"70",
19+
"72",
20+
"75",
21+
"80",
22+
"87",
23+
"89",
24+
"90",
25+
"90a",
26+
"100",
27+
"100f",
28+
"100a",
29+
"101",
30+
"101f",
31+
"101a",
32+
"103",
33+
"103f",
34+
"103a",
35+
"120",
36+
"120f",
37+
"120a",
38+
"121",
39+
"121f",
40+
"121a",
41+
]
42+
)
43+
44+
45+
def cuda_error_check(error):
46+
"""Checking the status of CUDA calls using the NVIDIA cuda-python backend."""
47+
if isinstance(error, driver.CUresult):
48+
if error != driver.CUresult.CUDA_SUCCESS:
49+
_, name = driver.cuGetErrorName(error)
50+
raise RuntimeError(f"CUDA Driver error: {name.decode()}")
51+
elif isinstance(error, runtime.cudaError_t):
52+
if error != runtime.cudaError_t.cudaSuccess:
53+
_, name = runtime.cudaGetErrorName(error)
54+
raise RuntimeError(f"CUDA Runtime error: {name.decode()}")
55+
elif isinstance(error, nvrtc.nvrtcResult):
56+
if error != nvrtc.nvrtcResult.NVRTC_SUCCESS:
57+
_, desc = nvrtc.nvrtcGetErrorString(error)
58+
raise RuntimeError(f"NVRTC error: {desc.decode()}")
59+
60+
61+
def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
62+
"""Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
63+
return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="75")

0 commit comments

Comments (0)