/**
 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 * Full license terms provided in LICENSE.md file.
 */

#include <fstream>
#include <sstream>
#include <iostream>
#include <chrono>
#include <stdexcept>

#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>

#include <NvInfer.h>

#include <cuda_runtime.h>

#define MS_PER_SEC 1000.0

using namespace std;
using namespace nvinfer1;

class TestConfig;

typedef void (*preprocess_fn_t)(float *input, size_t channels, size_t height, size_t width);

float * imageToTensor(const cv::Mat & image);
void preprocessVgg(float *input, size_t channels, size_t height, size_t width);
void preprocessInception(float *input, size_t channels, size_t height, size_t width);
size_t argmax(float *input, size_t numel);
void test(const TestConfig &testConfig);


class TestConfig
{
public:
  string imagePath;
  string planPath;
  string inputNodeName;
  string outputNodeName;
  string preprocessFnName;
  string inputHeight;
  string inputWidth;
  string numOutputCategories;
  string dataType;
  string maxBatchSize;
  string workspaceSize;
  string numRuns;
  string useMappedMemory;
  string statsPath;

  TestConfig(int argc, char * argv[])
  {
    imagePath = argv[1];
    planPath = argv[2];
    inputNodeName = argv[3];
    inputHeight = argv[4];
    inputWidth = argv[5];
    outputNodeName = argv[6];
    numOutputCategories = argv[7];
    preprocessFnName = argv[8];
    numRuns = argv[9];
    dataType = argv[10];
    maxBatchSize = argv[11];
    workspaceSize = argv[12];
    useMappedMemory = argv[13];
    statsPath = argv[14];
  }

  static string UsageString()
  {
    string s = "";
    s += "imagePath: \n";
    s += "planPath: \n";
    s += "inputNodeName: \n";
    s += "inputHeight: \n";
    s += "inputWidth: \n";
    s += "outputNodeName: \n";
    s += "numOutputCategories: \n";
    s += "preprocessFnName: \n";
    s += "numRuns: \n";
    s += "dataType: \n";
    s += "maxBatchSize: \n";
    s += "workspaceSize: \n";
    s += "useMappedMemory: \n";
    s += "statsPath: \n";
    return s;
  }

  string ToString()
  {
    string s = "";
    s += "imagePath: " + imagePath + "\n";
    s += "planPath: " + planPath + "\n";
    s += "inputNodeName: " + inputNodeName + "\n";
    s += "inputHeight: " + inputHeight + "\n";
    s += "inputWidth: " + inputWidth + "\n";
    s += "outputNodeName: " + outputNodeName + "\n";
    s += "numOutputCategories: " + numOutputCategories + "\n";
    s += "preprocessFnName: " + preprocessFnName + "\n";
    s += "numRuns: " + numRuns + "\n";
    s += "dataType: " + dataType + "\n";
    s += "maxBatchSize: " + maxBatchSize + "\n";
    s += "workspaceSize: " + workspaceSize + "\n";
    s += "useMappedMemory: " + useMappedMemory + "\n";
    s += "statsPath: " + statsPath + "\n";
    return s;
  }

  static int ToInteger(string value)
  {
    int valueInt;
    stringstream ss;
    ss << value;
    ss >> valueInt;
    return valueInt;
  }

  preprocess_fn_t PreprocessFn() const
  {
    if (preprocessFnName == "preprocess_vgg")
      return preprocessVgg;
    else if (preprocessFnName == "preprocess_inception")
      return preprocessInception;
    else
      throw runtime_error("Invalid preprocessing function name.");
  }

  int InputWidth() const { return ToInteger(inputWidth); }
  int InputHeight() const { return ToInteger(inputHeight); }
  int NumOutputCategories() const { return ToInteger(numOutputCategories); }

  nvinfer1::DataType DataType() const
  {
    if (dataType == "float")
      return nvinfer1::DataType::kFLOAT;
    else if (dataType == "half")
      return nvinfer1::DataType::kHALF;
    else
      throw runtime_error("Invalid data type.");
  }

  int MaxBatchSize() const { return ToInteger(maxBatchSize); }
  int WorkspaceSize() const { return ToInteger(workspaceSize); }
  int NumRuns() const { return ToInteger(numRuns); }
  int UseMappedMemory() const { return ToInteger(useMappedMemory); }
};


class Logger : public ILogger
{
  void log(Severity severity, const char * msg) override
  {
    cout << msg << endl;
  }
} gLogger;
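// Example invocation (a sketch only; the binary name, file paths, and values
// below are hypothetical and must match your own image, serialized TensorRT
// plan, and graph node names):
//
//   ./test_trt data/dog.jpg data/plans/vgg16.plan input 224 224 output \
//       1000 preprocess_vgg 10 half 1 1048576 1 data/test_output.txt
//
// The 14 arguments are read positionally by TestConfig's constructor, in the
// order: imagePath, planPath, inputNodeName, inputHeight, inputWidth,
// outputNodeName, numOutputCategories, preprocessFnName, numRuns, dataType,
// maxBatchSize, workspaceSize, useMappedMemory, statsPath.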
int main(int argc, char * argv[])
{
  if (argc != 15)
  {
    cout << TestConfig::UsageString() << endl;
    return 0;
  }

  TestConfig testConfig(argc, argv);
  cout << "\ntestConfig: \n" << testConfig.ToString() << endl;

  test(testConfig);

  return 0;
}

// Convert an interleaved (HWC) OpenCV image into a planar (CHW) float tensor
// stored in mapped (pinned) host memory.
float *imageToTensor(const cv::Mat & image)
{
  const size_t height = image.rows;
  const size_t width = image.cols;
  const size_t channels = image.channels();
  const size_t numel = height * width * channels;

  const size_t stridesCv[3] = { width * channels, channels, 1 };
  const size_t strides[3] = { height * width, width, 1 };

  float * tensor;
  cudaHostAlloc((void**)&tensor, numel * sizeof(float), cudaHostAllocMapped);

  for (int i = 0; i < height; i++)
  {
    for (int j = 0; j < width; j++)
    {
      for (int k = 0; k < channels; k++)
      {
        const size_t offsetCv = i * stridesCv[0] + j * stridesCv[1] + k * stridesCv[2];
        const size_t offset = k * strides[0] + i * strides[1] + j * strides[2];
        tensor[offset] = (float) image.data[offsetCv];
      }
    }
  }

  return tensor;
}

// Subtract the per-channel mean from each pixel (VGG-style preprocessing).
void preprocessVgg(float * tensor, size_t channels, size_t height, size_t width)
{
  const size_t strides[3] = { height * width, width, 1 };
  const float mean[3] = { 123.68, 116.78, 103.94 };

  for (int i = 0; i < height; i++)
  {
    for (int j = 0; j < width; j++)
    {
      for (int k = 0; k < channels; k++)
      {
        const size_t offset = k * strides[0] + i * strides[1] + j * strides[2];
        tensor[offset] -= mean[k];
      }
    }
  }
}

// Scale pixel values from [0, 255] to [-1, 1] (Inception-style preprocessing).
void preprocessInception(float * tensor, size_t channels, size_t height, size_t width)
{
  const size_t numel = channels * height * width;
  for (int i = 0; i < numel; i++)
    tensor[i] = 2.0 * (tensor[i] / 255.0 - 0.5);
}

// Return the index of the largest element in the tensor.
size_t argmax(float * tensor, size_t numel)
{
  if (numel <= 0)
    return 0;

  size_t maxIndex = 0;
  float max = tensor[0];
  for (int i = 0; i < numel; i++)
  {
    if (tensor[i] > max)
    {
      maxIndex = i;
      max = tensor[i];
    }
  }

  return maxIndex;
}

void test(const TestConfig &testConfig)
{
  // read the serialized engine (plan) from disk
  ifstream planFile(testConfig.planPath);
  stringstream planBuffer;
  planBuffer << planFile.rdbuf();
  string plan = planBuffer.str();

  IRuntime *runtime = createInferRuntime(gLogger);
  ICudaEngine *engine = runtime->deserializeCudaEngine((void*)plan.data(), plan.size(), nullptr);
  IExecutionContext *context = engine->createExecutionContext();

  int inputBindingIndex, outputBindingIndex;
  inputBindingIndex = engine->getBindingIndex(testConfig.inputNodeName.c_str());
  outputBindingIndex = engine->getBindingIndex(testConfig.outputNodeName.c_str());

  // load and preprocess image
  cv::Mat image = cv::imread(testConfig.imagePath, CV_LOAD_IMAGE_COLOR);
  cv::cvtColor(image, image, cv::COLOR_BGR2RGB, 3);
  cv::resize(image, image, cv::Size(testConfig.InputWidth(), testConfig.InputHeight()));
  float *input = imageToTensor(image);
  testConfig.PreprocessFn()(input, 3, testConfig.InputHeight(), testConfig.InputWidth());

  // allocate memory on host / device for input / output
  float *output;
  float *inputDevice;
  float *outputDevice;
  size_t inputSize = testConfig.InputHeight() * testConfig.InputWidth() * 3 * sizeof(float);
  cudaHostAlloc(&output, testConfig.NumOutputCategories() * sizeof(float), cudaHostAllocMapped);

  if (testConfig.UseMappedMemory())
  {
    cudaHostGetDevicePointer(&inputDevice, input, 0);
    cudaHostGetDevicePointer(&outputDevice, output, 0);
  }
  else
  {
    cudaMalloc(&inputDevice, inputSize);
    cudaMalloc(&outputDevice, testConfig.NumOutputCategories() * sizeof(float));
  }

  float *bindings[2];
  bindings[inputBindingIndex] = inputDevice;
  bindings[outputBindingIndex] = outputDevice;
  // run and compute average time over numRuns iterations
  double avgTime = 0;
  for (int i = 0; i < testConfig.NumRuns() + 1; i++)
  {
    chrono::duration<double> diff;

    if (testConfig.UseMappedMemory())
    {
      auto t0 = chrono::steady_clock::now();
      context->execute(1, (void**)bindings);
      auto t1 = chrono::steady_clock::now();
      diff = t1 - t0;
    }
    else
    {
      auto t0 = chrono::steady_clock::now();
      cudaMemcpy(inputDevice, input, inputSize, cudaMemcpyHostToDevice);
      context->execute(1, (void**)bindings);
      cudaMemcpy(output, outputDevice, testConfig.NumOutputCategories() * sizeof(float), cudaMemcpyDeviceToHost);
      auto t1 = chrono::steady_clock::now();
      diff = t1 - t0;
    }

    // the first iteration is a warm-up run and is excluded from the average
    if (i != 0)
      avgTime += MS_PER_SEC * diff.count();
  }
  avgTime /= testConfig.NumRuns();

  // save results to file
  int maxCategoryIndex = argmax(output, testConfig.NumOutputCategories()) + 1001 - testConfig.NumOutputCategories();
  cout << "Most likely category id is " << maxCategoryIndex << endl;
  cout << "Average execution time in ms is " << avgTime << endl;

  ofstream outfile;
  outfile.open(testConfig.statsPath, ios_base::app);
  outfile << "\n" << testConfig.planPath
    << " " << avgTime;
    // << " " << maxCategoryIndex
    // << " " << testConfig.InputWidth()
    // << " " << testConfig.InputHeight()
    // << " " << testConfig.MaxBatchSize()
    // << " " << testConfig.WorkspaceSize()
    // << " " << testConfig.dataType
    // << " " << testConfig.NumRuns()
    // << " " << testConfig.UseMappedMemory();
  outfile.close();

  cudaFree(inputDevice);
  cudaFree(outputDevice);
  cudaFreeHost(input);
  cudaFreeHost(output);
  engine->destroy();
  context->destroy();
  runtime->destroy();
}