55
66from kernel_tuner .backends .backend import GPUBackend
77from kernel_tuner .observers .nvcuda import CudaRuntimeObserver
8- from kernel_tuner .util import SkippableFailure , cuda_error_check , to_valid_nvrtc_gpu_arch_cc
8+ from kernel_tuner .util import SkippableFailure
9+ from kernel_tuner .utils .nvcuda import cuda_error_check , to_valid_nvrtc_gpu_arch_cc
910
1011# embedded in try block to be able to generate documentation
1112# and run tests without cuda-python installed
1213try :
13- from cuda import cuda , cudart , nvrtc
14+ from cuda . bindings import driver , runtime , nvrtc
1415except ImportError :
15- cuda = None
16+ try :
17+ # backward compatibility hack for older cuda-python versions
18+ from cuda import cuda as driver , cudart as runtime , nvrtc as nvrtc
19+ except ImportError :
20+ driver = None
1621
1722
1823class CudaFunctions (GPUBackend ):
19- """Class that groups the Cuda functions on maintains state about the device."""
24+ """Class that groups the Cuda functions and it maintains state about the device."""
2025
2126 def __init__ (self , device = 0 , iterations = 7 , compiler_options = None , observers = None ):
2227 """Instantiate CudaFunctions object used for interacting with the CUDA device.
@@ -38,34 +43,30 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
3843 """
3944 self .allocations = []
4045 self .texrefs = []
41- if not cuda :
46+ if not driver :
4247 raise ImportError (
4348 "cuda-python not installed, install using 'pip install cuda-python', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#cuda-and-pycuda."
4449 )
4550
4651 # initialize and select device
47- err = cuda .cuInit (0 )
52+ err = driver .cuInit (0 )
4853 cuda_error_check (err )
49- err , self .device = cuda .cuDeviceGet (device )
54+ err , self .device = driver .cuDeviceGet (device )
5055 cuda_error_check (err )
51- err , self .context = cuda .cuDevicePrimaryCtxRetain (device )
56+ err , self .context = driver .cuDevicePrimaryCtxRetain (device )
5257 cuda_error_check (err )
5358 if CudaFunctions .last_selected_device != device :
54- err = cuda .cuCtxSetCurrent (self .context )
59+ err = driver .cuCtxSetCurrent (self .context )
5560 cuda_error_check (err )
5661 CudaFunctions .last_selected_device = device
5762
5863 # compute capabilities and device properties
59- err , major = cudart .cudaDeviceGetAttribute (
60- cudart .cudaDeviceAttr .cudaDevAttrComputeCapabilityMajor , device
61- )
64+ err , major = runtime .cudaDeviceGetAttribute (runtime .cudaDeviceAttr .cudaDevAttrComputeCapabilityMajor , device )
6265 cuda_error_check (err )
63- err , minor = cudart .cudaDeviceGetAttribute (
64- cudart .cudaDeviceAttr .cudaDevAttrComputeCapabilityMinor , device
65- )
66+ err , minor = runtime .cudaDeviceGetAttribute (runtime .cudaDeviceAttr .cudaDevAttrComputeCapabilityMinor , device )
6667 cuda_error_check (err )
67- err , self .max_threads = cudart .cudaDeviceGetAttribute (
68- cudart .cudaDeviceAttr .cudaDevAttrMaxThreadsPerBlock , device
68+ err , self .max_threads = runtime .cudaDeviceGetAttribute (
69+ runtime .cudaDeviceAttr .cudaDevAttrMaxThreadsPerBlock , device
6970 )
7071 cuda_error_check (err )
7172 self .cc = f"{ major } { minor } "
@@ -78,11 +79,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
7879 self .compiler_options_bytes .append (str (option ).encode ("UTF-8" ))
7980
8081 # create a stream and events
81- err , self .stream = cuda .cuStreamCreate (0 )
82+ err , self .stream = driver .cuStreamCreate (0 )
8283 cuda_error_check (err )
83- err , self .start = cuda .cuEventCreate (0 )
84+ err , self .start = driver .cuEventCreate (0 )
8485 cuda_error_check (err )
85- err , self .end = cuda .cuEventCreate (0 )
86+ err , self .end = driver .cuEventCreate (0 )
8687 cuda_error_check (err )
8788
8889 # default dynamically allocated shared memory size, can be overwritten using smem_args
@@ -95,11 +96,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
9596 observer .register_device (self )
9697
9798 # collect environment information
98- err , device_properties = cudart .cudaGetDeviceProperties (device )
99+ err , device_properties = runtime .cudaGetDeviceProperties (device )
99100 cuda_error_check (err )
100101 env = dict ()
101102 env ["device_name" ] = device_properties .name .decode ()
102- env ["cuda_version" ] = cuda .CUDA_VERSION
103+ env ["cuda_version" ] = driver .CUDA_VERSION
103104 env ["compute_capability" ] = self .cc
104105 env ["iterations" ] = self .iterations
105106 env ["compiler_options" ] = self .compiler_options
@@ -109,8 +110,8 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
109110
110111 def __del__ (self ):
111112 for device_memory in self .allocations :
112- if isinstance (device_memory , cuda .CUdeviceptr ):
113- err = cuda .cuMemFree (device_memory )
113+ if isinstance (device_memory , driver .CUdeviceptr ):
114+ err = driver .cuMemFree (device_memory )
114115 cuda_error_check (err )
115116
116117 def ready_argument_list (self , arguments ):
@@ -128,7 +129,7 @@ def ready_argument_list(self, arguments):
128129 for arg in arguments :
129130 # if arg is a numpy array copy it to device
130131 if isinstance (arg , np .ndarray ):
131- err , device_memory = cuda .cuMemAlloc (arg .nbytes )
132+ err , device_memory = driver .cuMemAlloc (arg .nbytes )
132133 cuda_error_check (err )
133134 self .allocations .append (device_memory )
134135 gpu_args .append (device_memory )
@@ -164,38 +165,30 @@ def compile(self, kernel_instance):
164165 if not any (["--std=" in opt for opt in self .compiler_options ]):
165166 self .compiler_options .append ("--std=c++11" )
166167 if not any ([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options ]):
167- compiler_options .append (
168- f"--gpu-architecture=compute_{ to_valid_nvrtc_gpu_arch_cc (self .cc )} " .encode ("UTF-8" )
169- )
168+ compiler_options .append (f"--gpu-architecture=compute_{ to_valid_nvrtc_gpu_arch_cc (self .cc )} " .encode ("UTF-8" ))
170169 if not any (["--gpu-architecture=" in opt or "-arch" in opt for opt in self .compiler_options ]):
171170 self .compiler_options .append (f"--gpu-architecture=compute_{ to_valid_nvrtc_gpu_arch_cc (self .cc )} " )
172171
173- err , program = nvrtc .nvrtcCreateProgram (
174- str .encode (kernel_string ), b"CUDAProgram" , 0 , [], []
175- )
172+ err , program = nvrtc .nvrtcCreateProgram (str .encode (kernel_string ), b"CUDAProgram" , 0 , [], [])
176173 try :
177174 cuda_error_check (err )
178- err = nvrtc .nvrtcCompileProgram (
179- program , len (compiler_options ), compiler_options
180- )
175+ err = nvrtc .nvrtcCompileProgram (program , len (compiler_options ), compiler_options )
181176 cuda_error_check (err )
182177 err , size = nvrtc .nvrtcGetPTXSize (program )
183178 cuda_error_check (err )
184179 buff = b" " * size
185180 err = nvrtc .nvrtcGetPTX (program , buff )
186181 cuda_error_check (err )
187- err , self .current_module = cuda .cuModuleLoadData (np .char .array (buff ))
188- if err == cuda .CUresult .CUDA_ERROR_INVALID_PTX :
182+ err , self .current_module = driver .cuModuleLoadData (np .char .array (buff ))
183+ if err == driver .CUresult .CUDA_ERROR_INVALID_PTX :
189184 raise SkippableFailure ("uses too much shared data" )
190185 else :
191186 cuda_error_check (err )
192- err , self .func = cuda .cuModuleGetFunction (
193- self .current_module , str .encode (kernel_name )
194- )
187+ err , self .func = driver .cuModuleGetFunction (self .current_module , str .encode (kernel_name ))
195188 cuda_error_check (err )
196189
197190 # get the number of registers per thread used in this kernel
198- num_regs = cuda .cuFuncGetAttribute (cuda .CUfunction_attribute .CU_FUNC_ATTRIBUTE_NUM_REGS , self .func )
191+ num_regs = driver .cuFuncGetAttribute (driver .CUfunction_attribute .CU_FUNC_ATTRIBUTE_NUM_REGS , self .func )
199192 assert num_regs [0 ] == 0 , f"Retrieving number of registers per thread unsuccesful: code { num_regs [0 ]} "
200193 self .num_regs = num_regs [1 ]
201194
@@ -210,26 +203,26 @@ def compile(self, kernel_instance):
210203
211204 def start_event (self ):
212205 """Records the event that marks the start of a measurement."""
213- err = cudart .cudaEventRecord (self .start , self .stream )
206+ err = runtime .cudaEventRecord (self .start , self .stream )
214207 cuda_error_check (err )
215208
216209 def stop_event (self ):
217210 """Records the event that marks the end of a measurement."""
218- err = cudart .cudaEventRecord (self .end , self .stream )
211+ err = runtime .cudaEventRecord (self .end , self .stream )
219212 cuda_error_check (err )
220213
221214 def kernel_finished (self ):
222215 """Returns True if the kernel has finished, False otherwise."""
223- err = cudart .cudaEventQuery (self .end )
224- if err [0 ] == cudart .cudaError_t .cudaSuccess :
216+ err = runtime .cudaEventQuery (self .end )
217+ if err [0 ] == runtime .cudaError_t .cudaSuccess :
225218 return True
226219 else :
227220 return False
228221
229222 @staticmethod
230223 def synchronize ():
231224 """Halts execution until device has finished its tasks."""
232- err = cudart .cudaDeviceSynchronize ()
225+ err = runtime .cudaDeviceSynchronize ()
233226 cuda_error_check (err )
234227
235228 def copy_constant_memory_args (self , cmem_args ):
@@ -243,9 +236,9 @@ def copy_constant_memory_args(self, cmem_args):
243236 :type cmem_args: dict( string: numpy.ndarray, ... )
244237 """
245238 for k , v in cmem_args .items ():
246- err , symbol , _ = cuda .cuModuleGetGlobal (self .current_module , str .encode (k ))
239+ err , symbol , _ = driver .cuModuleGetGlobal (self .current_module , str .encode (k ))
247240 cuda_error_check (err )
248- err = cuda .cuMemcpyHtoD (symbol , v , v .nbytes )
241+ err = driver .cuMemcpyHtoD (symbol , v , v .nbytes )
249242 cuda_error_check (err )
250243
251244 def copy_shared_memory_args (self , smem_args ):
@@ -284,12 +277,12 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
284277 stream = self .stream
285278 arg_types = list ()
286279 for arg in gpu_args :
287- if isinstance (arg , cuda .CUdeviceptr ):
280+ if isinstance (arg , driver .CUdeviceptr ):
288281 arg_types .append (None )
289282 else :
290283 arg_types .append (np .ctypeslib .as_ctypes_type (arg .dtype ))
291284 kernel_args = (tuple (gpu_args ), tuple (arg_types ))
292- err = cuda .cuLaunchKernel (
285+ err = driver .cuLaunchKernel (
293286 func ,
294287 grid [0 ],
295288 grid [1 ],
@@ -318,7 +311,7 @@ def memset(allocation, value, size):
318311 :type size: int
319312
320313 """
321- err = cudart .cudaMemset (allocation , value , size )
314+ err = runtime .cudaMemset (allocation , value , size )
322315 cuda_error_check (err )
323316
324317 @staticmethod
@@ -331,7 +324,7 @@ def memcpy_dtoh(dest, src):
331324 :param src: A GPU memory allocation unit
332325 :type src: cuda.CUdeviceptr
333326 """
334- err = cuda .cuMemcpyDtoH (dest , src , dest .nbytes )
327+ err = driver .cuMemcpyDtoH (dest , src , dest .nbytes )
335328 cuda_error_check (err )
336329
337330 @staticmethod
@@ -344,7 +337,7 @@ def memcpy_htod(dest, src):
344337 :param src: A numpy array in host memory to store the data
345338 :type src: numpy.ndarray
346339 """
347- err = cuda .cuMemcpyHtoD (dest , src , src .nbytes )
340+ err = driver .cuMemcpyHtoD (dest , src , src .nbytes )
348341 cuda_error_check (err )
349342
    # unit of measurement for the "time" quantity reported by this backend
    units = {"time": "ms"}