Production Deployment and Multi-Framework Support
I. Why Multi-Framework Deployment?
Core requirements for industrial deployment:
- Cross-platform execution: support inference engines such as TensorRT, OpenVINO, and CoreML
- Performance optimization: exploit engine-specific acceleration (FP16 quantization, operator fusion)
- Language independence: callable from Java, C++, Go, and other languages
- Hardware adaptation: compatible with edge devices (mobile phones, embedded systems, IoT)
II. Environment Setup and Toolchain
# onnxruntime-gpu bundles the CPU execution provider; install the CPU-only onnxruntime package instead if no CUDA GPU is available
pip install onnx onnxruntime-gpu openvino-dev tensorflow onnx-tf onnxconverter-common fastapi uvicorn pillow
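Once the packages are installed, a quick sanity check is to print the versions and the execution providers ONNX Runtime can actually see; a minimal sketch:
import onnx
import onnxruntime as ort
# 'CUDAExecutionProvider' should appear in the list if the GPU build is active
print("onnx:", onnx.__version__)
print("onnxruntime:", ort.__version__)
print("available providers:", ort.get_available_providers())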
III. Hands-On ONNX Conversion
1. Exporting a PyTorch model to ONNX
import torch
from torchvision.models import resnet18, ResNet18_Weights
# Build a sample model with ImageNet-pretrained weights
model = resnet18(weights=ResNet18_Weights.DEFAULT).eval()
# Create a dummy input with the expected shape
dummy_input = torch.randn(1, 3, 224, 224)
# Export to ONNX
torch.onnx.export(
    model,
    dummy_input,
    "resnet18.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size"},
        "output": {0: "batch_size"}
    },
    opset_version=13
)
# Verify the exported model
import onnx
onnx_model = onnx.load("resnet18.onnx")
onnx.checker.check_model(onnx_model)  # no exception means the export is valid
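To confirm the dynamic batch dimension was actually recorded, you can inspect the graph's input signature; a minimal sketch:
# The first dimension should show the symbolic name "batch_size" instead of a fixed value
for inp in onnx_model.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)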
2. Comparing inference results with ONNX Runtime
import onnxruntime as ort
import numpy as np
# Create an inference session (falls back to CPU if CUDA is unavailable)
ort_session = ort.InferenceSession(
    "resnet18.onnx",
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
)
# Prepare input data
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
# PyTorch inference
with torch.no_grad():
    torch_output = model(torch.from_numpy(input_data)).numpy()
# ONNX Runtime inference (run() returns a list with one entry per requested output)
ort_inputs = {"input": input_data}
ort_output = ort_session.run(["output"], ort_inputs)[0]
# Check numerical consistency
print("Max absolute difference:", np.max(np.abs(torch_output - ort_output)))  # should be below 1e-5
IV. Performance Optimization Techniques
1. Graph optimization and quantization
# Apply ONNX Runtime's built-in graph optimizations (constant folding, node fusion, ...)
# and write the optimized graph to disk. The onnxruntime.transformers optimizer is
# specific to Transformer architectures and does not apply to a CNN such as ResNet18.
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.optimized_model_filepath = "resnet18_optimized.onnx"
ort.InferenceSession("resnet18.onnx", sess_options)
# FP16 conversion (roughly halves model size; keep_io_types leaves the model's inputs/outputs in FP32)
from onnxconverter_common import float16
fp16_model = float16.convert_float_to_float16(onnx.load("resnet18.onnx"), keep_io_types=True)
onnx.save(fp16_model, "resnet18_fp16.onnx")
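As an additional compression option, ONNX Runtime ships a post-training quantization API; a minimal dynamic-quantization sketch (weights stored as INT8, activations quantized at runtime):
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamic quantization needs no calibration dataset
quantize_dynamic(
    "resnet18.onnx",
    "resnet18_int8.onnx",
    weight_type=QuantType.QInt8
)
For convolution-heavy models, static quantization (quantize_static, which requires a small calibration dataset) typically yields larger speedups; dynamic quantization is shown here only because it needs no extra data.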
2. Multi-threaded execution
# Configure session options; thread settings mainly affect CPU execution
options = ort.SessionOptions()
options.intra_op_num_threads = 8  # threads used inside a single operator
options.execution_mode = ort.ExecutionMode.ORT_PARALLEL  # allow independent graph branches to run in parallel
ort_session = ort.InferenceSession(
    "resnet18.onnx",
    sess_options=options,
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
)
V. Multi-Framework Support in Practice
1. Converting to TensorFlow (onnx-tf)
from onnx_tf.backend import prepare
# Convert the ONNX graph into a TensorFlow SavedModel
tf_rep = prepare(onnx_model)
tf_rep.export_graph("resnet18_tf")
# TensorFlow inference test
import tensorflow as tf
tf_model = tf.saved_model.load("resnet18_tf")
tf_output = tf_model(**{"input": tf.constant(input_data)})
print("TF output shape:", tf_output['output'].shape)
2. Converting to TensorFlow Lite (mobile deployment)
converter = tf.lite.TFLiteConverter.from_saved_model("resnet18_tf")
converter.optimizations = [tf.lite.Optimize.DEFAULT]  # enable default size/latency optimizations
tflite_model = converter.convert()
with open("resnet18.tflite", "wb") as f:
    f.write(tflite_model)
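To check the exported .tflite file, you can run it with the TensorFlow Lite interpreter; a minimal sketch:
# Run the .tflite model with the TFLite interpreter
interpreter = tf.lite.Interpreter(model_path="resnet18.tflite")
input_details = interpreter.get_input_details()
# Pin the batch dimension in case it was exported as dynamic
interpreter.resize_tensor_input(input_details[0]['index'], [1, 3, 224, 224])
interpreter.allocate_tensors()
output_details = interpreter.get_output_details()
interpreter.set_tensor(input_details[0]['index'], input_data)
interpreter.invoke()
tflite_output = interpreter.get_tensor(output_details[0]['index'])
print("TFLite output shape:", tflite_output.shape)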
3. Converting to OpenVINO (optimized for Intel hardware)
# mo is the Model Optimizer CLI installed by openvino-dev
mo --input_model resnet18.onnx --output_dir openvino_model --data_type FP16
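To run the converted IR with the OpenVINO runtime (the resnet18.xml filename below assumes mo keeps the input model's base name):
from openvino.runtime import Core

# Load and compile the IR produced by mo, then run one inference on CPU
core = Core()
ov_model = core.read_model("openvino_model/resnet18.xml")
compiled = core.compile_model(ov_model, "CPU")
result = compiled([input_data])[compiled.output(0)]
print("OpenVINO output shape:", result.shape)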
VI. Production Deployment Options
1. High-performance HTTP service (FastAPI)
from fastapi import FastAPI, File, UploadFile
import uvicorn
from PIL import Image
import numpy as np
import onnxruntime as ort

app = FastAPI()
ort_session = ort.InferenceSession("resnet18.onnx")

@app.post("/predict")
async def predict(image: UploadFile = File(...)):
    # Decode, resize, and normalize the uploaded image to the ImageNet input format
    img = Image.open(image.file).convert("RGB").resize((224, 224))
    img_array = np.array(img).astype(np.float32) / 255.0
    img_array = (img_array - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
    img_array = img_array.transpose(2, 0, 1)[np.newaxis, ...].astype(np.float32)
    outputs = ort_session.run(None, {"input": img_array})
    return {"class_id": int(np.argmax(outputs[0]))}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
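A quick way to exercise the endpoint from Python (cat.jpg is only a placeholder filename):
import requests

# Post an image file to /predict; the multipart field name must match the parameter name "image"
with open("cat.jpg", "rb") as f:
    resp = requests.post("http://localhost:8000/predict", files={"image": f})
print(resp.json())  # {"class_id": ...}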
2. Edge-device deployment (LibTorch C++)
#include <torch/script.h>
#include <iostream>

int main() {
    // Load a TorchScript module (see the export sketch below for producing resnet18.pt)
    torch::jit::script::Module model = torch::jit::load("resnet18.pt");
    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(torch::rand({1, 3, 224, 224}));
    auto output = model.forward(inputs).toTensor();
    std::cout << "Prediction: " << output.argmax() << std::endl;
    return 0;
}
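The resnet18.pt file loaded above is a TorchScript archive rather than the ONNX file; a minimal export sketch, reusing the model and dummy_input from section III:
# Trace the eager PyTorch model into TorchScript and save it for LibTorch
traced = torch.jit.trace(model, dummy_input)
traced.save("resnet18.pt")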
VII. FAQ
Q1: How do I handle a dynamic batch_size?
# Declare the dynamic dimensions at export time
torch.onnx.export(
    ...,
    dynamic_axes={
        "input": {0: "batch_size", 2: "height", 3: "width"},
        "output": {0: "batch_size"}
    }
)
Q2: What if an operator is not supported?
- Register a custom symbolic function that maps the op to an ONNX node:
from torch.onnx import register_custom_op_symbolic

def my_op_symbolic(g, input):
    # Emit a node in a custom ONNX domain for the unsupported op
    return g.op("mydomain::CustomOpName", input)

register_custom_op_symbolic('mymodule::my_op', my_op_symbolic, opset_version=13)
- Implement the operator as an ONNX Runtime custom op library (see the sketch below)
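On the runtime side, a compiled custom-op library is registered through the session options; a minimal sketch (libcustom_ops.so and the model filename are placeholders):
# Register a compiled custom operator library before creating the session
so = ort.SessionOptions()
so.register_custom_ops_library("./libcustom_ops.so")  # path to your compiled library
ort_session = ort.InferenceSession("model_with_custom_op.onnx", sess_options=so)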
Q3: How can I reduce memory usage?
# Share memory for initializers by allocating them through the device allocator
options = ort.SessionOptions()
options.add_session_config_entry("session.use_device_allocator_for_initializers", "1")
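Two more SessionOptions flags trade a little allocation speed for lower peak memory; a minimal sketch:
# Disable the CPU memory arena and memory-pattern pre-planning
options = ort.SessionOptions()
options.enable_cpu_mem_arena = False
options.enable_mem_pattern = False
ort_session = ort.InferenceSession("resnet18.onnx", sess_options=options)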