Loading TensorRT in C++ to Run a Deep Learning Model
Using TensorRT to run a trained model and produce results is an efficient way to do inference, especially in applications that demand low latency and high throughput. Below is a basic step-by-step guide showing how to perform inference with TensorRT in C++.
Step 1: Prepare the environment
- Install TensorRT: make sure the NVIDIA TensorRT library is installed.
- Prepare the model: make sure your trained model has been converted to a format TensorRT can load, typically a serialized .engine file. Tools such as onnx-tensorrt or TensorFlow-TensorRT can convert a model into a TensorRT engine, as shown in the sketch below this list.
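If you are starting from an ONNX model, a minimal sketch of the conversion looks roughly like the following. It assumes TensorRT 8.x and the ONNX parser (nvonnxparser); your_model.onnx and your_model.engine are placeholder file names, and cleanup and error handling are kept to a minimum. Alternatively, the trtexec tool that ships with TensorRT performs the same conversion from the command line.

#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <fstream>
#include <iostream>

// Minimal logger required by the TensorRT builder
class BuildLogger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

int main() {
    BuildLogger logger;

    // Create the builder and an explicit-batch network definition
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    const auto flag = 1U << static_cast<uint32_t>(
        nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(flag);

    // Parse the ONNX model into the network definition
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
    if (!parser->parseFromFile("your_model.onnx",
            static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))) {
        std::cerr << "Failed to parse the ONNX model" << std::endl;
        return 1;
    }

    // Configure and build the serialized engine
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    // config->setFlag(nvinfer1::BuilderFlag::kFP16); // optionally enable FP16
    nvinfer1::IHostMemory* serialized = builder->buildSerializedNetwork(*network, *config);

    // Write the serialized engine to disk
    // (cleanup of the builder objects is omitted for brevity)
    std::ofstream out("your_model.engine", std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());
    std::cout << "Wrote your_model.engine" << std::endl;
    return 0;
}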
Step 2: Write the C++ code
The following is a simple C++ example that loads a TensorRT engine and runs inference.
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <cuda_runtime_api.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Logger for TensorRT info/warning/error messages
class Logger : public nvinfer1::ILogger {
public:
    void log(Severity severity, const char* msg) noexcept override {
        // Filter out info-level messages
        if (severity != Severity::kINFO)
            std::cout << msg << std::endl;
    }
};

// Read a serialized engine file into a byte buffer
std::vector<char> readFile(const std::string& filepath) {
    std::ifstream file(filepath, std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        throw std::runtime_error("Unable to open file " + filepath);
    }
    size_t size = file.tellg();
    file.seekg(0, std::ios::beg);
    std::vector<char> buffer(size);
    file.read(buffer.data(), size);
    return buffer;
}

// Number of elements described by a Dims struct
size_t volume(const nvinfer1::Dims& dims) {
    size_t v = 1;
    for (int i = 0; i < dims.nbDims; ++i)
        v *= static_cast<size_t>(dims.d[i]);
    return v;
}

// Note: this example uses the TensorRT 8.x binding-index API
// (getBindingDimensions, enqueueV2); TensorRT 10 replaces it with
// named I/O tensors and enqueueV3.
void inference(const std::string& enginePath, const std::vector<float>& inputData) {
    Logger logger;

    // Read the serialized engine file
    std::vector<char> engineData = readFile(enginePath);

    // Deserialize the engine
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine* engine =
        runtime->deserializeCudaEngine(engineData.data(), engineData.size());
    if (!engine) {
        throw std::runtime_error("Failed to deserialize engine " + enginePath);
    }

    // Create execution context
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();

    // Assume binding 0 is the input and binding 1 is the output
    const size_t outputSize = volume(engine->getBindingDimensions(1));

    // Allocate GPU memory
    void* buffers[2];
    cudaMalloc(&buffers[0], inputData.size() * sizeof(float)); // Input buffer
    cudaMalloc(&buffers[1], outputSize * sizeof(float));       // Output buffer

    // Copy input data to the GPU
    cudaMemcpy(buffers[0], inputData.data(), inputData.size() * sizeof(float), cudaMemcpyHostToDevice);

    // Set dynamic input dimensions here if needed (omitted for simplicity; see the Notes section)

    // Run inference on a dedicated CUDA stream (explicit-batch API)
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    context->enqueueV2(buffers, stream, nullptr);

    // Wait for inference to finish
    cudaStreamSynchronize(stream);

    // Copy the output data back to the host
    std::vector<float> outputData(outputSize);
    cudaMemcpy(outputData.data(), buffers[1], outputSize * sizeof(float), cudaMemcpyDeviceToHost);

    // Print the output data (or process it as needed)
    std::cout << "Output data: ";
    for (size_t i = 0; i < outputSize; ++i) {
        std::cout << outputData[i] << " ";
    }
    std::cout << std::endl;

    // Clean up
    cudaStreamDestroy(stream);
    cudaFree(buffers[0]);
    cudaFree(buffers[1]);
    context->destroy();
    engine->destroy();
    runtime->destroy();
}

int main() {
    // Path to the TensorRT engine file
    std::string enginePath = "your_model.engine";

    // Example input data (must match the model's input dimensions)
    std::vector<float> inputData = { /* Populate with your input data */ };

    // Run inference
    try {
        inference(enginePath, inputData);
    } catch (const std::exception& ex) {
        std::cerr << "Error: " << ex.what() << std::endl;
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Notes
- Input data: make sure the dimensions and data type of the input match your model.
- Dynamic dimensions: if your model uses dynamic input dimensions, you must set concrete dimensions after creating the execution context (see the first sketch after this list).
- Error handling: real code should include more thorough error handling to cope with the failures that can occur (see the second sketch after this list).
- Optimization: TensorRT offers several optimization options, such as FP16 inference to reduce memory bandwidth and compute requirements; adjust them to your needs (the kFP16 builder flag appears, commented out, in the Step 1 sketch).
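For the dynamic-dimension case, a minimal sketch looks like this. It assumes the same TensorRT 8.x binding API as the example above and that binding index 0 is the input; the 1x3x224x224 shape is purely illustrative, so substitute the dimensions your own model expects.

#include <NvInfer.h>

// Bind concrete input dimensions on a context whose engine was built with
// dynamic shapes; this must happen before enqueueV2() is called.
void setDynamicInput(nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context) {
    nvinfer1::Dims inputDims = engine->getBindingDimensions(0); // binding 0 = input
    if (inputDims.d[0] == -1) {                                 // -1 marks a dynamic dimension
        context->setBindingDimensions(0, nvinfer1::Dims4{1, 3, 224, 224}); // illustrative shape
    }
}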
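For the error-handling point, one common pattern is to wrap each CUDA call in a small checker. A minimal sketch follows; the checkCuda name is only an illustration.

#include <cuda_runtime_api.h>
#include <stdexcept>
#include <string>

// Turn CUDA error codes into exceptions so calls such as cudaMalloc or
// cudaMemcpy in the example above do not fail silently.
inline void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
    }
}

// Usage: checkCuda(cudaMalloc(&buffers[0], bytes), "cudaMalloc(input)");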
Compiling and running
Make sure your build command links against the TensorRT and CUDA libraries (add -I/-L flags if they are installed outside the default search paths, and -lnvonnxparser if you also compile the engine-building sketch from Step 1). For example:
g++ -o tensorrt_inference tensorrt_inference.cpp -lnvinfer -lcudart
./tensorrt_inference
I hope this example helps you understand how to run inference with TensorRT in C++.