|
| 1 | +# Slightly modified from |
| 2 | +# https://github.com/NVIDIA/TensorRT/blob/c0c633cc629cc0705f0f69359f531a192e524c0f/samples/python/common.py |
| 3 | + |
| 4 | +# |
| 5 | +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 6 | +# SPDX-License-Identifier: Apache-2.0 |
| 7 | +# |
| 8 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 9 | +# you may not use this file except in compliance with the License. |
| 10 | +# You may obtain a copy of the License at |
| 11 | +# |
| 12 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 13 | +# |
| 14 | +# Unless required by applicable law or agreed to in writing, software |
| 15 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 16 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 17 | +# See the License for the specific language governing permissions and |
| 18 | +# limitations under the License. |
| 19 | +# |
| 20 | + |
| 21 | +import argparse |
| 22 | +import os |
| 23 | +import ctypes |
| 24 | +from typing import Optional, List |
| 25 | + |
| 26 | +import numpy as np |
| 27 | +import tensorrt as trt |
| 28 | +from cuda import cuda, cudart |
| 29 | + |
try:
    # Python 2 compatibility shim: FileNotFoundError was only introduced in
    # Python 3.3, so alias it to IOError on interpreters that lack it.
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

# Network-creation flag bitmask requesting an explicit-batch network
# (NetworkDefinitionCreationFlag values are bit positions, hence the shift).
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
| 37 | + |
| 38 | + |
def check_cuda_err(err):
    """Raise on a non-success CUDA driver or runtime status code.

    Args:
        err: A ``cuda.CUresult`` (driver API) or ``cudart.cudaError_t``
            (runtime API) status code.

    Raises:
        RuntimeError: If ``err`` is a non-success status, or if it is of
            neither known status type.
    """
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    # NOTE: the original used a second bare `if` here, so a *successful*
    # driver-API CUresult fell through to the final `else` and raised
    # "Unknown error type". `elif` keeps the two status families exclusive.
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))
| 48 | + |
| 49 | + |
def cuda_call(call):
    """Unpack a cuda-python API result tuple, checking its status code.

    cuda-python calls return ``(status, *results)``.  The status is
    validated via ``check_cuda_err``; the payload is returned unwrapped
    when it is a single value, otherwise as-is.
    """
    status = call[0]
    payload = call[1:]
    check_cuda_err(status)
    return payload[0] if len(payload) == 1 else payload
| 56 | + |
| 57 | + |
def GiB(val):
    """Convert a size in gibibytes to bytes.

    Args:
        val: Size in GiB. Any numeric type. (The original ``val * 1 << 30``
            relied on operator precedence and raised TypeError for floats;
            multiplying by ``1 << 30`` is identical for ints and also
            supports fractional sizes such as ``GiB(0.5)``.)

    Returns:
        The equivalent number of bytes.
    """
    return val * (1 << 30)
| 60 | + |
| 61 | + |
def add_help(description):
    """Build a minimal argument parser so ``-h``/``--help`` works for a sample.

    Args:
        description (str): Description shown in the help text.

    Returns:
        argparse.Namespace: The parsed known arguments. (The original
        discarded this value into an unused local; returning it is
        backward compatible since callers ignored the ``None`` result.)
    """
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args, _ = parser.parse_known_args()
    return args
| 67 | + |
| 68 | + |
| 69 | +def find_sample_data(description="Runs a TensorRT Python sample", |
| 70 | + subfolder="", |
| 71 | + find_files=[], |
| 72 | + err_msg=""): |
| 73 | + """ |
| 74 | + Parses sample arguments. |
| 75 | +
|
| 76 | + Args: |
| 77 | + description (str): Description of the sample. |
| 78 | + subfolder (str): The subfolder containing data relevant to this sample |
| 79 | + find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path. |
| 80 | +
|
| 81 | + Returns: |
| 82 | + str: Path of data directory. |
| 83 | + """ |
| 84 | + |
| 85 | + # Standard command-line arguments for all samples. |
| 86 | + kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data") |
| 87 | + parser = argparse.ArgumentParser( |
| 88 | + description=description, |
| 89 | + formatter_class=argparse.ArgumentDefaultsHelpFormatter) |
| 90 | + parser.add_argument( |
| 91 | + "-d", |
| 92 | + "--datadir", |
| 93 | + help= |
| 94 | + "Location of the TensorRT sample data directory, and any additional data directories.", |
| 95 | + action="append", |
| 96 | + default=[kDEFAULT_DATA_ROOT], |
| 97 | + ) |
| 98 | + args, _ = parser.parse_known_args() |
| 99 | + |
| 100 | + def get_data_path(data_dir): |
| 101 | + # If the subfolder exists, append it to the path, otherwise use the provided path as-is. |
| 102 | + data_path = os.path.join(data_dir, subfolder) |
| 103 | + if not os.path.exists(data_path): |
| 104 | + if data_dir != kDEFAULT_DATA_ROOT: |
| 105 | + print("WARNING: " + data_path + " does not exist. Trying " + |
| 106 | + data_dir + " instead.") |
| 107 | + data_path = data_dir |
| 108 | + # Make sure data directory exists. |
| 109 | + if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT: |
| 110 | + print( |
| 111 | + "WARNING: {:} does not exist. Please provide the correct data path with the -d option." |
| 112 | + .format(data_path)) |
| 113 | + return data_path |
| 114 | + |
| 115 | + data_paths = [get_data_path(data_dir) for data_dir in args.datadir] |
| 116 | + return data_paths, locate_files(data_paths, find_files, err_msg) |
| 117 | + |
| 118 | + |
| 119 | +def locate_files(data_paths, filenames, err_msg=""): |
| 120 | + """ |
| 121 | + Locates the specified files in the specified data directories. |
| 122 | + If a file exists in multiple data directories, the first directory is used. |
| 123 | +
|
| 124 | + Args: |
| 125 | + data_paths (List[str]): The data directories. |
| 126 | + filename (List[str]): The names of the files to find. |
| 127 | +
|
| 128 | + Returns: |
| 129 | + List[str]: The absolute paths of the files. |
| 130 | +
|
| 131 | + Raises: |
| 132 | + FileNotFoundError if a file could not be located. |
| 133 | + """ |
| 134 | + found_files = [None] * len(filenames) |
| 135 | + for data_path in data_paths: |
| 136 | + # Find all requested files. |
| 137 | + for index, (found, filename) in enumerate(zip(found_files, filenames)): |
| 138 | + if not found: |
| 139 | + file_path = os.path.abspath(os.path.join(data_path, filename)) |
| 140 | + if os.path.exists(file_path): |
| 141 | + found_files[index] = file_path |
| 142 | + |
| 143 | + # Check that all files were found |
| 144 | + for f, filename in zip(found_files, filenames): |
| 145 | + if not f or not os.path.exists(f): |
| 146 | + raise FileNotFoundError( |
| 147 | + "Could not find {:}. Searched in data paths: {:}\n{:}".format( |
| 148 | + filename, data_paths, err_msg)) |
| 149 | + return found_files |
| 150 | + |
| 151 | + |
class HostDeviceMem:
    """Pair of host and device memory, where the host memory is wrapped in a numpy array"""

    def __init__(self,
                 size: int,
                 dtype: np.dtype,
                 name: Optional[str] = None,
                 shape: Optional[trt.Dims] = None,
                 format: Optional[trt.TensorFormat] = None):
        byte_count = size * dtype.itemsize
        # Page-locked host allocation, exposed as a flat numpy array of `size`
        # elements via a ctypes pointer of the matching element type.
        raw_host_ptr = cuda_call(cudart.cudaMallocHost(byte_count))
        elem_ptr_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
        self._host = np.ctypeslib.as_array(
            ctypes.cast(raw_host_ptr, elem_ptr_type), (size, ))
        # Matching device-side allocation of the same byte size.
        self._device = cuda_call(cudart.cudaMalloc(byte_count))
        self._nbytes = byte_count
        self.name = name
        self.shape = shape
        self.format = format
        self.dtype = dtype

    @property
    def host(self) -> np.ndarray:
        """The page-locked host buffer viewed as a flat numpy array."""
        return self._host

    @host.setter
    def host(self, arr: np.ndarray):
        # Copy `arr` into the front of the host buffer; it may be smaller
        # than the buffer, never larger.
        if arr.size > self.host.size:
            raise ValueError(
                f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
            )
        np.copyto(self.host[:arr.size], arr.flat, casting='safe')

    @property
    def device(self) -> int:
        """Raw device pointer returned by cudaMalloc."""
        return self._device

    @property
    def nbytes(self) -> int:
        """Size of each of the two buffers in bytes."""
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        """Release both the device allocation and the page-locked host buffer."""
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
| 203 | + |
| 204 | + |
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
# If engine uses dynamic shapes, specify a profile to find the maximum input & output size.
def allocate_buffers(engine: trt.ICudaEngine,
                     profile_idx: Optional[int] = None):
    """Allocate a HostDeviceMem pair for every I/O tensor of `engine`.

    With dynamic shapes, `profile_idx` selects the optimization profile
    whose maximum shape sizes the allocations.

    Returns:
        (inputs, outputs, bindings, stream): lists of HostDeviceMem for
        inputs and outputs, the device-pointer bindings list, and a newly
        created CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda_call(cudart.cudaStreamCreate())
    for idx in range(engine.num_io_tensors):
        tensor_name = engine.get_tensor_name(idx)
        tensor_format = engine.get_tensor_format(tensor_name)
        if profile_idx is None:
            shape = engine.get_tensor_shape(tensor_name)
        else:
            # get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape);
            # pick the max shape so the buffer fits any input in the profile.
            shape = engine.get_tensor_profile_shape(tensor_name,
                                                    profile_idx)[-1]
        if profile_idx is None and not all(dim >= 0 for dim in shape):
            raise ValueError(f"Binding {tensor_name} has dynamic shape, "
                             "but no profile was specified.")
        elem_count = trt.volume(shape)
        if engine.has_implicit_batch_dimension:
            elem_count *= engine.max_batch_size
        np_dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(tensor_name)))

        # Allocate the paired host/device buffers for this tensor.
        mem = HostDeviceMem(elem_count,
                            np_dtype,
                            name=tensor_name,
                            shape=shape,
                            format=tensor_format)

        # Record the device pointer for the bindings list.
        bindings.append(int(mem.device))

        # File under inputs or outputs according to the tensor's I/O mode.
        if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
            inputs.append(mem)
        else:
            outputs.append(mem)
    return inputs, outputs, bindings, stream
| 249 | + |
| 250 | + |
# Frees the resources allocated in allocate_buffers
def free_buffers(inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
                 stream: cudart.cudaStream_t):
    """Free every HostDeviceMem in `inputs` and `outputs`, then destroy `stream`."""
    for buf in inputs:
        buf.free()
    for buf in outputs:
        buf.free()
    cuda_call(cudart.cudaStreamDestroy(stream))
| 257 | + |
| 258 | + |
# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
    """Synchronously copy all of `host_arr` to device memory at `device_ptr`."""
    byte_count = host_arr.size * host_arr.itemsize
    direction = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    cuda_call(cudart.cudaMemcpy(device_ptr, host_arr, byte_count, direction))
| 265 | + |
| 266 | + |
# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
    """Synchronously copy device memory at `device_ptr` into all of `host_arr`."""
    byte_count = host_arr.size * host_arr.itemsize
    direction = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    cuda_call(cudart.cudaMemcpy(host_arr, device_ptr, byte_count, direction))
| 273 | + |
| 274 | + |
def _do_inference_base(inputs, outputs, stream, execute_async):
    """Copy inputs to the GPU, run `execute_async`, and copy outputs back.

    Args:
        inputs (List[HostDeviceMem]): Input buffers to upload.
        outputs (List[HostDeviceMem]): Output buffers to download.
        stream: CUDA stream used for the async copies and the final sync.
        execute_async (Callable[[], None]): Launches inference on `stream`.

    Returns:
        List[np.ndarray]: The host-side output arrays (valid after the
        stream synchronize).
    """
    # Transfer input data to the GPU.
    # (The original used list comprehensions purely for side effects,
    # building throwaway lists; plain loops express the intent directly.)
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    for inp in inputs:
        cuda_call(
            cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind,
                                   stream))
    # Run inference.
    execute_async()
    # Transfer predictions back from the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    for out in outputs:
        cuda_call(
            cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind,
                                   stream))
    # Synchronize the stream so the host buffers are valid before returning.
    cuda_call(cudart.cudaStreamSynchronize(stream))
    # Return only the host outputs.
    return [out.host for out in outputs]
| 296 | + |
| 297 | + |
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run implicit-batch inference via execute_async and return host outputs."""

    def _launch():
        context.execute_async(batch_size=batch_size,
                              bindings=bindings,
                              stream_handle=stream)

    return _do_inference_base(inputs, outputs, stream, _launch)
| 308 | + |
| 309 | + |
# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    """Run explicit-batch inference via execute_async_v2 and return host outputs."""

    def _launch():
        context.execute_async_v2(bindings=bindings, stream_handle=stream)

    return _do_inference_base(inputs, outputs, stream, _launch)