Loading TensorRT in C++ to Run a Deep Learning Model
Using TensorRT to run a trained model and produce results is an efficient way to do inference, especially in applications that demand low latency and high throughput. Below is a basic step-by-step guide showing how to perform inference with TensorRT in C++.
Step 1: Prepare the environment
- Install TensorRT: make sure the NVIDIA TensorRT library is installed.
- Prepare the model: make sure your trained model has been converted to a format TensorRT can load, typically a serialized .engine file. Tools such as onnx-tensorrt or TensorFlow-TensorRT can convert a model into a TensorRT engine, as shown in the sketch below this list.
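If you are starting from an ONNX model, a minimal sketch of the conversion looks roughly like the following. It assumes TensorRT 8.x and the ONNX parser (nvonnxparser); your_model.onnx and your_model.engine are placeholder file names, and cleanup and error handling are kept to a minimum. Alternatively, the trtexec tool that ships with TensorRT performs the same conversion from the command line.

#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <fstream>
#include <iostream>

// Minimal logger required by the TensorRT builder
class BuildLogger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

int main() {
    BuildLogger logger;

    // Create the builder and an explicit-batch network definition
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    const auto flag = 1U << static_cast<uint32_t>(
        nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(flag);

    // Parse the ONNX model into the network definition
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
    if (!parser->parseFromFile("your_model.onnx",
            static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))) {
        std::cerr << "Failed to parse the ONNX model" << std::endl;
        return 1;
    }

    // Configure and build the serialized engine
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    // config->setFlag(nvinfer1::BuilderFlag::kFP16); // optionally enable FP16
    nvinfer1::IHostMemory* serialized = builder->buildSerializedNetwork(*network, *config);

    // Write the serialized engine to disk
    // (cleanup of the builder objects is omitted for brevity)
    std::ofstream out("your_model.engine", std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());
    std::cout << "Wrote your_model.engine" << std::endl;
    return 0;
}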
Step 2: Write the C++ code
The following is a simple C++ example that loads a TensorRT engine and runs inference.
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <cuda_runtime_api.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Logger for TensorRT info/warning/error messages
class Logger : public nvinfer1::ILogger {
public:
    void log(Severity severity, const char* msg) noexcept override {
        // Filter out info-level messages
        if (severity != Severity::kINFO)
            std::cout << msg << std::endl;
    }
};

// Read a serialized engine file into a byte buffer
std::vector<char> readFile(const std::string& filepath) {
    std::ifstream file(filepath, std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        throw std::runtime_error("Unable to open file " + filepath);
    }
    size_t size = file.tellg();
    file.seekg(0, std::ios::beg);
    std::vector<char> buffer(size);
    file.read(buffer.data(), size);
    return buffer;
}

// Number of elements described by a Dims struct
size_t volume(const nvinfer1::Dims& dims) {
    size_t v = 1;
    for (int i = 0; i < dims.nbDims; ++i)
        v *= static_cast<size_t>(dims.d[i]);
    return v;
}

// Note: this example uses the TensorRT 8.x binding-index API
// (getBindingDimensions, enqueueV2); TensorRT 10 replaces it with
// named I/O tensors and enqueueV3.
void inference(const std::string& enginePath, const std::vector<float>& inputData) {
    Logger logger;

    // Read the serialized engine file
    std::vector<char> engineData = readFile(enginePath);

    // Deserialize the engine
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine* engine =
        runtime->deserializeCudaEngine(engineData.data(), engineData.size());
    if (!engine) {
        throw std::runtime_error("Failed to deserialize engine " + enginePath);
    }

    // Create execution context
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();

    // Assume binding 0 is the input and binding 1 is the output
    const size_t outputSize = volume(engine->getBindingDimensions(1));

    // Allocate GPU memory
    void* buffers[2];
    cudaMalloc(&buffers[0], inputData.size() * sizeof(float)); // Input buffer
    cudaMalloc(&buffers[1], outputSize * sizeof(float));       // Output buffer

    // Copy input data to the GPU
    cudaMemcpy(buffers[0], inputData.data(), inputData.size() * sizeof(float), cudaMemcpyHostToDevice);

    // Set dynamic input dimensions here if needed (omitted for simplicity; see the Notes section)

    // Run inference on a dedicated CUDA stream (explicit-batch API)
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    context->enqueueV2(buffers, stream, nullptr);

    // Wait for inference to finish
    cudaStreamSynchronize(stream);

    // Copy the output data back to the host
    std::vector<float> outputData(outputSize);
    cudaMemcpy(outputData.data(), buffers[1], outputSize * sizeof(float), cudaMemcpyDeviceToHost);

    // Print the output data (or process it as needed)
    std::cout << "Output data: ";
    for (size_t i = 0; i < outputSize; ++i) {
        std::cout << outputData[i] << " ";
    }
    std::cout << std::endl;

    // Clean up
    cudaStreamDestroy(stream);
    cudaFree(buffers[0]);
    cudaFree(buffers[1]);
    context->destroy();
    engine->destroy();
    runtime->destroy();
}

int main() {
    // Path to the TensorRT engine file
    std::string enginePath = "your_model.engine";

    // Example input data (must match the model's input dimensions)
    std::vector<float> inputData = { /* Populate with your input data */ };

    // Run inference
    try {
        inference(enginePath, inputData);
    } catch (const std::exception& ex) {
        std::cerr << "Error: " << ex.what() << std::endl;
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Notes
- Input data: make sure the dimensions and data type of the input match your model.
- Dynamic dimensions: if your model uses dynamic input dimensions, you must set concrete dimensions after creating the execution context (see the first sketch after this list).
- Error handling: real code should include more thorough error handling to cope with the failures that can occur (see the second sketch after this list).
- Optimization: TensorRT offers several optimization options, such as FP16 inference to reduce memory bandwidth and compute requirements; adjust them to your needs (the kFP16 builder flag appears, commented out, in the Step 1 sketch).
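For the dynamic-dimension case, a minimal sketch looks like this. It assumes the same TensorRT 8.x binding API as the example above and that binding index 0 is the input; the 1x3x224x224 shape is purely illustrative, so substitute the dimensions your own model expects.

#include <NvInfer.h>

// Bind concrete input dimensions on a context whose engine was built with
// dynamic shapes; this must happen before enqueueV2() is called.
void setDynamicInput(nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context) {
    nvinfer1::Dims inputDims = engine->getBindingDimensions(0); // binding 0 = input
    if (inputDims.d[0] == -1) {                                 // -1 marks a dynamic dimension
        context->setBindingDimensions(0, nvinfer1::Dims4{1, 3, 224, 224}); // illustrative shape
    }
}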
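For the error-handling point, one common pattern is to wrap each CUDA call in a small checker. A minimal sketch follows; the checkCuda name is only an illustration.

#include <cuda_runtime_api.h>
#include <stdexcept>
#include <string>

// Turn CUDA error codes into exceptions so calls such as cudaMalloc or
// cudaMemcpy in the example above do not fail silently.
inline void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
    }
}

// Usage: checkCuda(cudaMalloc(&buffers[0], bytes), "cudaMalloc(input)");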
Compiling and running
Make sure your build command links against the TensorRT and CUDA libraries (add -I/-L flags if they are installed outside the default search paths, and -lnvonnxparser if you also compile the engine-building sketch from Step 1). For example:
g++ -o tensorrt_inference tensorrt_inference.cpp -lnvinfer -lcudart
./tensorrt_inference
I hope this example helps you understand how to run inference with TensorRT in C++.