test_trt.cu

/**
 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 * Full license terms provided in LICENSE.md file.
 */

#include <iostream>
#include <string>
#include <vector>
#include <sstream>
#include <chrono>
#include <stdexcept>
#include <fstream>

#include <opencv2/opencv.hpp>
#include <NvInfer.h>

#define MS_PER_SEC 1000.0

using namespace std;
using namespace nvinfer1;

class TestConfig;

typedef void (*preprocess_fn_t)(float *input, size_t channels, size_t height, size_t width);

float *imageToTensor(const cv::Mat &image);
void preprocessVgg(float *input, size_t channels, size_t height, size_t width);
void preprocessInception(float *input, size_t channels, size_t height, size_t width);
size_t argmax(float *input, size_t numel);
void test(const TestConfig &testConfig);

class TestConfig
{
public:
  string imagePath;
  string planPath;
  string inputNodeName;
  string outputNodeName;
  string preprocessFnName;
  string inputHeight;
  string inputWidth;
  string numOutputCategories;
  string dataType;
  string maxBatchSize;
  string workspaceSize;
  string numRuns;
  string useMappedMemory;
  string statsPath;

  TestConfig(int argc, char *argv[])
  {
    imagePath = argv[1];
    planPath = argv[2];
    inputNodeName = argv[3];
    inputHeight = argv[4];
    inputWidth = argv[5];
    outputNodeName = argv[6];
    numOutputCategories = argv[7];
    preprocessFnName = argv[8];
    numRuns = argv[9];
    dataType = argv[10];
    maxBatchSize = argv[11];
    workspaceSize = argv[12];
    useMappedMemory = argv[13];
    statsPath = argv[14];
  }

  static string UsageString()
  {
    string s = "";
    s += "imagePath: \n";
    s += "planPath: \n";
    s += "inputNodeName: \n";
    s += "inputHeight: \n";
    s += "inputWidth: \n";
    s += "outputNodeName: \n";
    s += "numOutputCategories: \n";
    s += "preprocessFnName: \n";
    s += "numRuns: \n";
    s += "dataType: \n";
    s += "maxBatchSize: \n";
    s += "workspaceSize: \n";
    s += "useMappedMemory: \n";
    s += "statsPath: \n";
    return s;
  }

  string ToString()
  {
    string s = "";
    s += "imagePath: " + imagePath + "\n";
    s += "planPath: " + planPath + "\n";
    s += "inputNodeName: " + inputNodeName + "\n";
    s += "inputHeight: " + inputHeight + "\n";
    s += "inputWidth: " + inputWidth + "\n";
    s += "outputNodeName: " + outputNodeName + "\n";
    s += "numOutputCategories: " + numOutputCategories + "\n";
    s += "preprocessFnName: " + preprocessFnName + "\n";
    s += "numRuns: " + numRuns + "\n";
    s += "dataType: " + dataType + "\n";
    s += "maxBatchSize: " + maxBatchSize + "\n";
    s += "workspaceSize: " + workspaceSize + "\n";
    s += "useMappedMemory: " + useMappedMemory + "\n";
    s += "statsPath: " + statsPath + "\n";
    return s;
  }

  static int ToInteger(string value)
  {
    int valueInt;
    stringstream ss;
    ss << value;
    ss >> valueInt;
    return valueInt;
  }

  preprocess_fn_t PreprocessFn() const
  {
    if (preprocessFnName == "preprocess_vgg")
      return preprocessVgg;
    else if (preprocessFnName == "preprocess_inception")
      return preprocessInception;
    else
      throw runtime_error("Invalid preprocessing function name.");
  }

  int InputWidth() const { return ToInteger(inputWidth); }
  int InputHeight() const { return ToInteger(inputHeight); }
  int NumOutputCategories() const { return ToInteger(numOutputCategories); }

  nvinfer1::DataType DataType() const
  {
    if (dataType == "float")
      return nvinfer1::DataType::kFLOAT;
    else if (dataType == "half")
      return nvinfer1::DataType::kHALF;
    else
      throw runtime_error("Invalid data type.");
  }

  int MaxBatchSize() const { return ToInteger(maxBatchSize); }
  int WorkspaceSize() const { return ToInteger(workspaceSize); }
  int NumRuns() const { return ToInteger(numRuns); }
  int UseMappedMemory() const { return ToInteger(useMappedMemory); }
};
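
// Example invocation (paths, node names, and values are hypothetical; they only
// illustrate the 14 positional arguments read by TestConfig above):
//
//   ./test_trt data/images/gordon_setter.jpg data/plans/inception_v1.plan \
//       input 224 224 InceptionV1/Logits/SpatialSqueeze 1001 preprocess_inception \
//       10 half 1 1048576 0 data/test_output.txt
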
class Logger : public ILogger
{
  void log(Severity severity, const char *msg) override
  {
    cout << msg << endl;
  }
} gLogger;

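// Note: this logger forwards every TensorRT message, regardless of severity, to
// stdout. Filtering on the Severity argument (e.g. printing only warnings and
// errors) is a common refinement, but is intentionally not done here.
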
int main(int argc, char *argv[])
{
  if (argc != 15)
  {
    cout << TestConfig::UsageString() << endl;
    return 0;
  }

  TestConfig testConfig(argc, argv);
  cout << "\ntestConfig: \n" << testConfig.ToString() << endl;

  test(testConfig);

  return 0;
}

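// Build sketch (compiler flags and library names are assumptions; adjust for the
// local CUDA, TensorRT, and OpenCV installation):
//
//   nvcc -std=c++11 -o test_trt test_trt.cu \
//       -lnvinfer -lopencv_core -lopencv_imgproc -lopencv_imgcodecs
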
float *imageToTensor(const cv::Mat &image)
{
  const size_t height = image.rows;
  const size_t width = image.cols;
  const size_t channels = image.channels();
  const size_t numel = height * width * channels;

  const size_t stridesCv[3] = { width * channels, channels, 1 };
  const size_t strides[3] = { height * width, width, 1 };

  float *tensor;
  cudaHostAlloc((void**)&tensor, numel * sizeof(float), cudaHostAllocMapped);

  for (int i = 0; i < height; i++)
  {
    for (int j = 0; j < width; j++)
    {
      for (int k = 0; k < channels; k++)
      {
        const size_t offsetCv = i * stridesCv[0] + j * stridesCv[1] + k * stridesCv[2];
        const size_t offset = k * strides[0] + i * strides[1] + j * strides[2];
        tensor[offset] = (float) image.data[offsetCv];
      }
    }
  }

  return tensor;
}

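// Layout note for imageToTensor above: OpenCV stores pixels as interleaved HWC
// bytes, while the network expects planar CHW floats. As a small worked example
// (hypothetical 2x2 image with 3 channels), the value at row i = 1, column j = 0,
// channel k = 2 is read from HWC offset 1*6 + 0*3 + 2 = 8 and written to CHW
// offset 2*4 + 1*2 + 0 = 10.
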
void preprocessVgg(float *tensor, size_t channels, size_t height, size_t width)
{
  const size_t strides[3] = { height * width, width, 1 };
  const float mean[3] = { 123.68, 116.78, 103.94 };

  for (int i = 0; i < height; i++)
  {
    for (int j = 0; j < width; j++)
    {
      for (int k = 0; k < channels; k++)
      {
        const size_t offset = k * strides[0] + i * strides[1] + j * strides[2];
        tensor[offset] -= mean[k];
      }
    }
  }
}

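// The constants in preprocessVgg above are the per-channel ImageNet means commonly
// used for VGG-style preprocessing (R = 123.68, G = 116.78, B = 103.94), applied
// in RGB order to match the BGR-to-RGB conversion performed in test().
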
void preprocessInception(float *tensor, size_t channels, size_t height, size_t width)
{
  const size_t numel = channels * height * width;
  for (int i = 0; i < numel; i++)
    tensor[i] = 2.0 * (tensor[i] / 255.0 - 0.5);
}

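// preprocessInception above rescales each value from [0, 255] to [-1, 1]:
// 0 maps to -1.0, 127.5 maps to 0.0, and 255 maps to 1.0, which matches the
// Inception-style input normalization.
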
size_t argmax(float *tensor, size_t numel)
{
  if (numel == 0)
    return 0;

  size_t maxIndex = 0;
  float max = tensor[0];
  for (size_t i = 0; i < numel; i++)
  {
    if (tensor[i] > max)
    {
      maxIndex = i;
      max = tensor[i];
    }
  }
  return maxIndex;
}

void test(const TestConfig &testConfig)
{
  // read the serialized engine plan from disk
  ifstream planFile(testConfig.planPath);
  stringstream planBuffer;
  planBuffer << planFile.rdbuf();
  string plan = planBuffer.str();

  IRuntime *runtime = createInferRuntime(gLogger);
  ICudaEngine *engine = runtime->deserializeCudaEngine((void*)plan.data(),
      plan.size(), nullptr);
  IExecutionContext *context = engine->createExecutionContext();

  int inputBindingIndex, outputBindingIndex;
  inputBindingIndex = engine->getBindingIndex(testConfig.inputNodeName.c_str());
  outputBindingIndex = engine->getBindingIndex(testConfig.outputNodeName.c_str());

  // load and preprocess image
  cv::Mat image = cv::imread(testConfig.imagePath, cv::IMREAD_COLOR);
  cv::cvtColor(image, image, cv::COLOR_BGR2RGB, 3);
  cv::resize(image, image, cv::Size(testConfig.InputWidth(), testConfig.InputHeight()));
  float *input = imageToTensor(image);
  testConfig.PreprocessFn()(input, 3, testConfig.InputHeight(), testConfig.InputWidth());

  // allocate memory on host / device for input / output
  float *output;
  float *inputDevice;
  float *outputDevice;
  size_t inputSize = testConfig.InputHeight() * testConfig.InputWidth() * 3 * sizeof(float);
  cudaHostAlloc(&output, testConfig.NumOutputCategories() * sizeof(float), cudaHostAllocMapped);

  if (testConfig.UseMappedMemory())
  {
    cudaHostGetDevicePointer(&inputDevice, input, 0);
    cudaHostGetDevicePointer(&outputDevice, output, 0);
  }
  else
  {
    cudaMalloc(&inputDevice, inputSize);
    cudaMalloc(&outputDevice, testConfig.NumOutputCategories() * sizeof(float));
  }

  float *bindings[2];
  bindings[inputBindingIndex] = inputDevice;
  bindings[outputBindingIndex] = outputDevice;

  // run and compute average time over numRuns iterations; the first iteration
  // is treated as a warm-up and excluded from the average
  double avgTime = 0;
  for (int i = 0; i < testConfig.NumRuns() + 1; i++)
  {
    chrono::duration<double> diff;

    if (testConfig.UseMappedMemory())
    {
      auto t0 = chrono::steady_clock::now();
      context->execute(1, (void**)bindings);
      auto t1 = chrono::steady_clock::now();
      diff = t1 - t0;
    }
    else
    {
      auto t0 = chrono::steady_clock::now();
      cudaMemcpy(inputDevice, input, inputSize, cudaMemcpyHostToDevice);
      context->execute(1, (void**)bindings);
      cudaMemcpy(output, outputDevice, testConfig.NumOutputCategories() * sizeof(float), cudaMemcpyDeviceToHost);
      auto t1 = chrono::steady_clock::now();
      diff = t1 - t0;
    }

    if (i != 0)
      avgTime += MS_PER_SEC * diff.count();
  }
  avgTime /= testConfig.NumRuns();

  // save results to file; the index is shifted so that 1000-category outputs
  // (no background class) line up with 1001-category label ids
  int maxCategoryIndex = argmax(output, testConfig.NumOutputCategories()) + 1001 - testConfig.NumOutputCategories();
  cout << "Most likely category id is " << maxCategoryIndex << endl;
  cout << "Average execution time in ms is " << avgTime << endl;

  ofstream outfile;
  outfile.open(testConfig.statsPath, ios_base::app);
  outfile << "\n" << testConfig.planPath
          << " " << avgTime;
  //        << " " << maxCategoryIndex
  //        << " " << testConfig.InputWidth()
  //        << " " << testConfig.InputHeight()
  //        << " " << testConfig.MaxBatchSize()
  //        << " " << testConfig.WorkspaceSize()
  //        << " " << testConfig.dataType
  //        << " " << testConfig.NumRuns()
  //        << " " << testConfig.UseMappedMemory();
  outfile.close();

  // release resources in reverse order of creation: context before engine,
  // engine before runtime
  cudaFree(inputDevice);
  cudaFree(outputDevice);
  cudaFreeHost(input);
  cudaFreeHost(output);
  context->destroy();
  engine->destroy();
  runtime->destroy();
}