trt_inference.py

from pytorch_model import preprocess_image, postprocess
import torch
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt

ONNX_FILE_PATH = "resnet50.onnx"

# logger to capture errors, warnings, and other information during the build and inference phases
TRT_LOGGER = trt.Logger()

# NOTE: this script uses the legacy (pre-8.x) TensorRT Python API: implicit-batch
# networks, builder.max_workspace_size, builder.fp16_mode, and build_cuda_engine
# were deprecated and later removed in TensorRT 8.


def build_engine(onnx_file_path):
    # initialize TensorRT engine and parse ONNX model
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network()
    parser = trt.OnnxParser(network, TRT_LOGGER)

    # allow TensorRT to use up to 1GB of GPU memory for tactic selection
    builder.max_workspace_size = 1 << 30
    # we have only one image in batch
    builder.max_batch_size = 1
    # use FP16 mode if possible
    if builder.platform_has_fast_fp16:
        builder.fp16_mode = True

    # parse ONNX
    with open(onnx_file_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            # surface parser errors instead of silently building a broken network
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError('Failed to parse the ONNX file')
    print('Completed parsing of ONNX file')

    # generate TensorRT engine optimized for the target platform
    print('Building an engine...')
    engine = builder.build_cuda_engine(network)
    context = engine.create_execution_context()
    print('Completed creating Engine')
    return engine, context


def main():
    # initialize TensorRT engine and parse ONNX model
    engine, context = build_engine(ONNX_FILE_PATH)

    # get sizes of input and output and allocate memory required for input data and for output data
    for binding in engine:
        if engine.binding_is_input(binding):  # we expect only one input
            input_shape = engine.get_binding_shape(binding)
            input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
            device_input = cuda.mem_alloc(input_size)
        else:  # and one output
            output_shape = engine.get_binding_shape(binding)
            # create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
            device_output = cuda.mem_alloc(host_output.nbytes)

    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()

    # preprocess input data
    host_input = np.array(preprocess_image("turkish_coffee.jpg").numpy(), dtype=np.float32, order='C')
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # run inference
    context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()

    # postprocess results
    output_data = torch.Tensor(host_output).reshape(engine.max_batch_size, output_shape[0])
    postprocess(output_data)


if __name__ == '__main__':
    main()
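
The script imports preprocess_image and postprocess from a companion pytorch_model module that is not shown on this page. Below is a minimal sketch of what that module might contain, assuming the usual torchvision ImageNet preprocessing for ResNet-50 and a simple top-5 printout; the exact transform values, file handling, and output format are assumptions, not the original code.

pytorch_model.py (hypothetical sketch)

import torch
from PIL import Image
from torchvision import transforms


def preprocess_image(img_path):
    # standard ImageNet normalization and 224x224 center crop for ResNet-50
    # (torchvision defaults; the original module may differ)
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    image = Image.open(img_path).convert('RGB')
    # add a batch dimension: (3, 224, 224) -> (1, 3, 224, 224)
    return transform(image).unsqueeze(0)


def postprocess(output_data):
    # turn raw logits into confidences and print the five most likely classes
    probabilities = torch.nn.functional.softmax(output_data[0], dim=0)
    top_prob, top_idx = torch.topk(probabilities, 5)
    for prob, idx in zip(top_prob, top_idx):
        print(f'class {idx.item()}: {prob.item() * 100:.2f}%')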