trt_sample.cpp

#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>

#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/imgcodecs.hpp>
// utilities ----------------------------------------------------------------------------------------------------------
// class to log errors, warnings, and other information during the build and inference phases
class Logger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) override
    {
        // remove this 'if' if you need more logged info
        if ((severity == Severity::kERROR) || (severity == Severity::kINTERNAL_ERROR))
        {
            std::cout << msg << "\n";
        }
    }
} gLogger;
// deleter so TensorRT objects can be managed by std::unique_ptr; pre-TensorRT-8
// objects are released with destroy() rather than delete
struct TRTDestroy
{
    template <class T>
    void operator()(T* obj) const
    {
        if (obj)
        {
            obj->destroy();
        }
    }
};

template <class T>
using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;
// calculate the number of elements in a tensor
size_t getSizeByDim(const nvinfer1::Dims& dims)
{
    size_t size = 1;
    for (int32_t i = 0; i < dims.nbDims; ++i)
    {
        size *= dims.d[i];
    }
    return size;
}
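// e.g. for an input binding with dims {3, 224, 224} (implicit-batch CHW)
// this returns 3 * 224 * 224 = 150528 elements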
// read class names, one per line
std::vector<std::string> getClassNames(const std::string& imagenet_classes)
{
    std::ifstream classes_file(imagenet_classes);
    std::vector<std::string> classes;
    if (!classes_file.good())
    {
        std::cerr << "ERROR: can't read file with class names.\n";
        return classes;
    }
    std::string class_name;
    while (std::getline(classes_file, class_name))
    {
        classes.push_back(class_name);
    }
    return classes;
}
// preprocessing stage ------------------------------------------------------------------------------------------------
void preprocessImage(const std::string& image_path, float* gpu_input, const nvinfer1::Dims& dims)
{
    // read input image (note: cv::imread returns BGR; if the model expects RGB,
    // convert with cv::cuda::cvtColor before normalizing)
    cv::Mat frame = cv::imread(image_path);
    if (frame.empty())
    {
        std::cerr << "Input image " << image_path << " load failed\n";
        return;
    }
    // upload image to GPU
    cv::cuda::GpuMat gpu_frame;
    gpu_frame.upload(frame);
    // implicit-batch engine: binding dims are CHW
    auto channels = dims.d[0];
    auto input_height = dims.d[1];
    auto input_width = dims.d[2];
    auto input_size = cv::Size(input_width, input_height);
    // resize
    cv::cuda::GpuMat resized;
    cv::cuda::resize(gpu_frame, resized, input_size, 0, 0, cv::INTER_NEAREST);
    // normalize with the standard ImageNet mean and standard deviation
    cv::cuda::GpuMat flt_image;
    resized.convertTo(flt_image, CV_32FC3, 1.f / 255.f);
    cv::cuda::subtract(flt_image, cv::Scalar(0.485f, 0.456f, 0.406f), flt_image, cv::noArray(), -1);
    cv::cuda::divide(flt_image, cv::Scalar(0.229f, 0.224f, 0.225f), flt_image, 1, -1);
    // HWC -> CHW: wrap each channel plane of the engine's input buffer in a
    // GpuMat header so cv::cuda::split writes straight into gpu_input
    std::vector<cv::cuda::GpuMat> chw;
    for (int i = 0; i < channels; ++i)
    {
        chw.emplace_back(cv::cuda::GpuMat(input_size, CV_32FC1, gpu_input + i * input_width * input_height));
    }
    cv::cuda::split(flt_image, chw);
}
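// Note: the cv::cuda calls above run (blocking) on the default CUDA stream,
// and the enqueue() call in main below also uses stream 0, so no explicit
// synchronization is needed between preprocessing and inference.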
// post-processing stage ----------------------------------------------------------------------------------------------
void postprocessResults(float* gpu_output, const nvinfer1::Dims& dims, int batch_size)
{
    // get class names
    auto classes = getClassNames("imagenet_classes.txt");
    // copy results from GPU to CPU
    std::vector<float> cpu_output(getSizeByDim(dims) * batch_size);
    cudaMemcpy(cpu_output.data(), gpu_output, cpu_output.size() * sizeof(float), cudaMemcpyDeviceToHost);
    // calculate softmax: exponentiate each logit, then normalize by the sum
    std::transform(cpu_output.begin(), cpu_output.end(), cpu_output.begin(), [](float val) { return std::exp(val); });
    auto sum = std::accumulate(cpu_output.begin(), cpu_output.end(), 0.0);
    // sort class indices by descending score to find the top predictions
    std::vector<int> indices(getSizeByDim(dims) * batch_size);
    std::iota(indices.begin(), indices.end(), 0); // generate sequence 0, 1, 2, ..., 999
    std::sort(indices.begin(), indices.end(), [&cpu_output](int i1, int i2) { return cpu_output[i1] > cpu_output[i2]; });
    // print every class with probability above 0.5%
    size_t i = 0;
    while (i < indices.size() && cpu_output[indices[i]] / sum > 0.005)
    {
        if (static_cast<int>(classes.size()) > indices[i])
        {
            std::cout << "class: " << classes[indices[i]] << " | ";
        }
        std::cout << "confidence: " << 100 * cpu_output[indices[i]] / sum << "% | index: " << indices[i] << "\n";
        ++i;
    }
}
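// Note: a numerically robust softmax would subtract the max logit before
// exponentiating, e.g. (sketch):
//   float max_val = *std::max_element(cpu_output.begin(), cpu_output.end());
//   std::transform(cpu_output.begin(), cpu_output.end(), cpu_output.begin(),
//                  [max_val](float v) { return std::exp(v - max_val); });
// The version above is fine for typical classifier logits but can overflow
// for large values.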
// initialize TensorRT engine and parse ONNX model --------------------------------------------------------------------
void parseOnnxModel(const std::string& model_path, TRTUniquePtr<nvinfer1::ICudaEngine>& engine,
                    TRTUniquePtr<nvinfer1::IExecutionContext>& context)
{
    TRTUniquePtr<nvinfer1::IBuilder> builder{nvinfer1::createInferBuilder(gLogger)};
    // createNetwork() builds an implicit-batch network, matching the
    // setMaxBatchSize()/enqueue() calls used below
    TRTUniquePtr<nvinfer1::INetworkDefinition> network{builder->createNetwork()};
    TRTUniquePtr<nvonnxparser::IParser> parser{nvonnxparser::createParser(*network, gLogger)};
    TRTUniquePtr<nvinfer1::IBuilderConfig> config{builder->createBuilderConfig()};
    // parse ONNX
    if (!parser->parseFromFile(model_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO)))
    {
        std::cerr << "ERROR: could not parse the model.\n";
        return;
    }
    // allow TensorRT to use up to 1GB of GPU memory for tactic selection
    config->setMaxWorkspaceSize(1ULL << 30);
    // use FP16 mode if possible
    if (builder->platformHasFastFp16())
    {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
    }
    // we have only one image in batch
    builder->setMaxBatchSize(1);
    // generate a TensorRT engine optimized for the target platform
    engine.reset(builder->buildEngineWithConfig(*network, *config));
    if (!engine)
    {
        std::cerr << "ERROR: could not build the engine.\n";
        return;
    }
    context.reset(engine->createExecutionContext());
}
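// Building an engine can take a while; in production it is usually serialized
// once and reloaded on later runs, e.g. (sketch using the same pre-TensorRT-8
// API as above):
//   TRTUniquePtr<nvinfer1::IHostMemory> serialized{engine->serialize()};
//   std::ofstream engine_file("model.engine", std::ios::binary);
//   engine_file.write(static_cast<const char*>(serialized->data()), serialized->size());
// and deserialized later with nvinfer1::createInferRuntime() + deserializeCudaEngine().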
// main pipeline ------------------------------------------------------------------------------------------------------
int main(int argc, char* argv[])
{
    if (argc < 3)
    {
        std::cerr << "usage: " << argv[0] << " model.onnx image.jpg\n";
        return -1;
    }
    std::string model_path(argv[1]);
    std::string image_path(argv[2]);
    int batch_size = 1;
    // initialize TensorRT engine and parse ONNX model
    TRTUniquePtr<nvinfer1::ICudaEngine> engine{nullptr};
    TRTUniquePtr<nvinfer1::IExecutionContext> context{nullptr};
    parseOnnxModel(model_path, engine, context);
    if (!engine || !context)
    {
        return -1;
    }
    // get sizes of input and output and allocate device memory for both
    std::vector<nvinfer1::Dims> input_dims;  // we expect only one input
    std::vector<nvinfer1::Dims> output_dims; // and one output
    std::vector<void*> buffers(engine->getNbBindings()); // buffers for input and output data
    for (int i = 0; i < engine->getNbBindings(); ++i)
    {
        auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
        cudaMalloc(&buffers[i], binding_size);
        if (engine->bindingIsInput(i))
        {
            input_dims.emplace_back(engine->getBindingDimensions(i));
        }
        else
        {
            output_dims.emplace_back(engine->getBindingDimensions(i));
        }
    }
    if (input_dims.empty() || output_dims.empty())
    {
        std::cerr << "Expect at least one input and one output for network\n";
        return -1;
    }
    // preprocess input data
    preprocessImage(image_path, (float*) buffers[0], input_dims[0]);
    // run inference asynchronously on the default stream
    context->enqueue(batch_size, buffers.data(), 0, nullptr);
    // postprocess results (the synchronous cudaMemcpy inside waits for enqueue to finish)
    postprocessResults((float*) buffers[1], output_dims[0], batch_size);
    // free device memory
    for (void* buf : buffers)
    {
        cudaFree(buf);
    }
    return 0;
}
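// Example usage (file names are illustrative: any ImageNet-style classifier
// exported to ONNX, an input image, and an imagenet_classes.txt with one
// label per line in the working directory):
//   ./trt_sample resnet50.onnx input.jpg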