|
1 | 1 | # Slightly modified from |
2 | 2 | # https://github.com/NVIDIA/TensorRT/blob/c0c633cc629cc0705f0f69359f531a192e524c0f/samples/python/common.py |
| 3 | +# https://github.com/NVIDIA/TensorRT/blob/ccf119972b50299ba00d35d39f3938296e187f4e/samples/python/common_runtime.py |
3 | 4 |
|
4 | 5 | # |
5 | 6 | # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
@@ -150,8 +151,6 @@ def allocate_buffers(engine: trt.ICudaEngine, |
150 | 151 | raise ValueError(f"Binding {binding} has dynamic shape, " +\ |
151 | 152 | "but no profile was specified.") |
152 | 153 | size = trt.volume(shape) |
153 | | - if engine.has_implicit_batch_dimension: |
154 | | - size *= engine.max_batch_size |
155 | 154 | dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding))) |
156 | 155 |
|
157 | 156 | # Allocate host and device buffers |
@@ -219,23 +218,38 @@ def _do_inference_base(inputs, outputs, stream, execute_async): |
219 | 218 | return [out.host for out in outputs] |
220 | 219 |
|
221 | 220 |
|
222 | | -# This function is generalized for multiple inputs/outputs. |
223 | | -# inputs and outputs are expected to be lists of HostDeviceMem objects. |
224 | | -def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): |
225 | | - |
226 | | - def execute_async(): |
227 | | - context.execute_async(batch_size=batch_size, |
228 | | - bindings=bindings, |
229 | | - stream_handle=stream) |
230 | | - |
231 | | - return _do_inference_base(inputs, outputs, stream, execute_async) |
| 221 | +def _do_inference_base(inputs, outputs, stream, execute_async_func): |
| 222 | + # Transfer input data to the GPU. |
| 223 | + kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice |
| 224 | + [ |
| 225 | + cuda_call( |
| 226 | + cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, |
| 227 | + stream)) for inp in inputs |
| 228 | + ] |
| 229 | + # Run inference. |
| 230 | + execute_async_func() |
| 231 | + # Transfer predictions back from the GPU. |
| 232 | + kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost |
| 233 | + [ |
| 234 | + cuda_call( |
| 235 | + cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, |
| 236 | + stream)) for out in outputs |
| 237 | + ] |
| 238 | + # Synchronize the stream.
| 239 | + cuda_call(cudart.cudaStreamSynchronize(stream)) |
| 240 | + # Return only the host outputs. |
| 241 | + return [out.host for out in outputs] |
232 | 242 |
|
233 | 243 |
|
234 | | -# This function is generalized for multiple inputs/outputs for full dimension networks. |
| 244 | +# This function is generalized for multiple inputs/outputs. |
235 | 245 | # inputs and outputs are expected to be lists of HostDeviceMem objects. |
236 | | -def do_inference_v2(context, bindings, inputs, outputs, stream): |
| 246 | +def do_inference(context, engine, bindings, inputs, outputs, stream): |
237 | 247 |
|
238 | | - def execute_async(): |
239 | | - context.execute_async_v2(bindings=bindings, stream_handle=stream) |
| 248 | + def execute_async_func(): |
| 249 | + context.execute_async_v3(stream_handle=stream) |
240 | 250 |
|
241 | | - return _do_inference_base(inputs, outputs, stream, execute_async) |
| 251 | + # Set up the context tensor addresses (one per I/O tensor).
| 252 | + num_io = engine.num_io_tensors |
| 253 | + for i in range(num_io): |
| 254 | + context.set_tensor_address(engine.get_tensor_name(i), bindings[i]) |
| 255 | + return _do_inference_base(inputs, outputs, stream, execute_async_func) |
0 commit comments