Accelerating inference of an ONNX model with TensorRT: build a TensorRT engine from the ONNX file, allocate host and device buffers with PyCUDA, then copy inputs to the GPU, execute, and copy results back, timing each call.
import tensorrt as trt
import numpy as np
import pycuda.autoinit  # initializes CUDA and creates a context on import
import pycuda.driver as cuda
import time

model_path = "model.onnx"
input_size = 32

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def build_engine(model_path):
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network() as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 20
        builder.max_batch_size = 1
        with open(model_path, "rb") as f:
            if not parser.parse(f.read()):
                # surface parser errors instead of failing silently later
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        engine = builder.build_cuda_engine(network)
        return engine
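
# Optional helpers (a sketch, not part of the original gist): serialize the built
# engine to disk and reload it later, so the slow build step only runs once.
# ICudaEngine.serialize() and Runtime.deserialize_cuda_engine() are standard
# TensorRT Python API; the "model.trt" filename is an assumption.
def save_engine(engine, path="model.trt"):
    with open(path, "wb") as f:
        f.write(engine.serialize())

def load_engine(path="model.trt"):
    with open(path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())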
def alloc_buf(engine):
    # host (page-locked) cpu mem
    h_in_size = trt.volume(engine.get_binding_shape(0))
    h_out_size = trt.volume(engine.get_binding_shape(1))
    h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
    h_out_dtype = trt.nptype(engine.get_binding_dtype(1))
    in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
    out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)
    # allocate gpu mem
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)
    stream = cuda.Stream()
    return in_cpu, out_cpu, in_gpu, out_gpu, stream
def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    # async version
    # with engine.create_execution_context() as context:  # creating a context per call costs time
    #     cuda.memcpy_htod_async(in_gpu, inputs, stream)
    #     context.execute_async(1, [int(in_gpu), int(out_gpu)], stream.handle, None)
    #     cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)
    #     stream.synchronize()
    # sync version
    cuda.memcpy_htod(in_gpu, inputs)
    context.execute(1, [int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu
if __name__ == "__main__":
    inputs = np.random.random((1, 3, input_size, input_size)).astype(np.float32)
    engine = build_engine(model_path)
    context = engine.create_execution_context()
    for _ in range(10):
        t1 = time.time()
        in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
        res = inference(engine, context, inputs.reshape(-1), out_cpu, in_gpu, out_gpu, stream)
        print(res)
        print("cost time: ", time.time() - t1)
# TensorRT docker image: docker pull nvcr.io/nvidia/tensorrt:19.09-py3
# (see https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt/tags)
# NOTE: requires CUDA driver >= 418
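
To try the script end to end, you need a model.onnx whose input matches (1, 3, input_size, input_size). A minimal sketch for producing one with PyTorch's torch.onnx.export (the toy network and filename are illustrative assumptions, not part of the gist):

import torch
import torch.nn as nn

net = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3, padding=1),  # (1, 3, 32, 32) -> (1, 8, 32, 32)
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8 * 32 * 32, 10),
)
torch.onnx.export(net, torch.randn(1, 3, 32, 32), "model.onnx")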