Introduction
This article involves two frameworks, at the following versions:
- NNI (Neural Network Intelligence): 3.0
- TensorRT: 10.9.0.34
NNI's documentation, Speed Up Quantized Model with TensorRT, describes how to use TensorRT to accelerate a model quantized by NNI. However, judging from NNI's source code at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/integrated_tensorrt.py:
- https://github.com/microsoft/nni/blob/767ed7f22e1e588ce76cbbecb6c6a4a76a309805/nni/compression/quantization_speedup/integrated_tensorrt.py#L14
TRT8 = 8
- https://github.com/microsoft/nni/blob/767ed7f22e1e588ce76cbbecb6c6a4a76a309805/nni/compression/quantization_speedup/integrated_tensorrt.py#L292
assert trt_version >= TRT8, "Version of TensorRT is too old, please \
update TensorRT to version >= 8.0"
NNI in fact only targets the TensorRT 8 API. TensorRT has since moved on to version 10 and NNI is no longer maintained, so it is necessary to adapt NNI to TensorRT 10.
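Incidentally, the version check above is also fragile on a two-digit major release if the major number is read naively from the version string. A minimal illustration; the string is simply what tensorrt.__version__ reports for the release used in this article:
version = "10.9.0.34"        # tensorrt.__version__ for TensorRT 10.9
int(version[0])              # -> 1: reading a single character mis-parses the major version
int(version.split('.')[0])   # -> 10: splitting on '.' recovers the real major version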
This article draws on:
- the TensorRT API Migration Guide
- personal hands-on experience
to complete the adaptation.
Adaptation Process
Modifying integrated_tensorrt.py
integrated_tensorrt.py is located at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/integrated_tensorrt.py. The individual edits are listed below; a consolidated sketch of the resulting TensorRT 10 build flow follows the list.
- L291
assert trt_version >= TRT8, "Version of TensorRT is too old, please \
update TensorRT to version >= 8.0"
Change to (parse the major version from the full dotted string so the check still passes on TensorRT 10.x):
trt_version = int(trt.__version__.split('.')[0])
assert trt_version >= TRT8, "Version of TensorRT is too old, please \
update TensorRT to version >= 8.0"
- L231-L232
builder.max_batch_size = input_shape[0]
trt_config.max_workspace_size = common.GiB(8)
Change to (both settings were removed in TensorRT 10; the workspace budget now goes through the memory-pool API):
# builder.max_batch_size = input_shape[0]
# trt_config.max_workspace_size = common.GiB(8)
trt_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 << 30)
- L255
engine = builder.build_engine(network, trt_config)
Change to (build_engine was removed; build a serialized plan and deserialize it with a Runtime):
# engine = builder.build_engine(network, trt_config)
engine_data = builder.build_serialized_network(network, trt_config)
if not engine_data:
raise RuntimeError("Failed to build serialized engine.")
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(engine_data)
- L354
engine_input_shape = self.engine.get_binding_shape(0)
Change to (the binding-index lookup was removed; tensors are now addressed by name):
# engine_input_shape = self.engine.get_binding_shape(0)
engine_input_shape = self.engine.get_tensor_shape(self.engine.get_tensor_name(0))
- L365
trt_outputs = common.do_inference_v2(self.context, bindings=self.bindings, inputs=self.inputs,
outputs=self.outputs, stream=self.stream)
Change to (do_inference_v2 now also takes the engine; see the trt_pycuda.py changes below):
trt_outputs = common.do_inference_v2(self.engine, self.context,
self.bindings,
inputs=self.inputs,
outputs=self.outputs, stream=self.stream)
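Taken together, the first three edits above (the version check, the builder config, and the engine build) amount to the TensorRT 10 build flow. The following is a hedged, self-contained sketch of that flow rather than NNI's exact function: the builder and parsed network are assumed to exist already, build_trt10_engine is an illustrative name, and the 8 GiB workspace matches the value used above.
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
TRT8 = 8

def build_trt10_engine(builder, network):
    # Parse the major version from the full dotted string, so "10.9.0.34" -> 10.
    trt_version = int(trt.__version__.split('.')[0])
    assert trt_version >= TRT8, "Version of TensorRT is too old, please update TensorRT to version >= 8.0"

    trt_config = builder.create_builder_config()
    # Replaces builder.max_batch_size / trt_config.max_workspace_size: explicit-batch
    # networks carry the batch size in their input shapes, and the workspace budget is
    # now a memory-pool limit (8 << 30 bytes == 8 GiB, the value common.GiB(8) provided).
    trt_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 << 30)

    # build_engine() is gone: build a serialized plan, then deserialize it into an engine.
    engine_data = builder.build_serialized_network(network, trt_config)
    if engine_data is None:
        raise RuntimeError("Failed to build serialized engine.")
    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(engine_data)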
Modifying trt_pycuda.py
trt_pycuda.py is located at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/trt_pycuda.py. The edits are listed below, followed by self-contained sketches of the two affected helpers.
- L77
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
Change to:
# Ref: https://docs.nvidia.com/deeplearning/tensorrt/migration-guide/index.html
# Modified for compatibility with TensorRT 10
"""
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
"""
for i in range(engine.num_io_tensors):
tensor_name = engine.get_tensor_name(i)
size = trt.volume(engine.get_tensor_shape(tensor_name))
dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))
- L93
if engine.binding_is_input(binding):
Change to:
# if engine.binding_is_input(binding):
if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
- L102
def do_inference_v2(context, bindings, inputs, outputs, stream):
Change to (the engine is needed to enumerate the I/O tensor names):
def do_inference_v2(engine, context, bindings, inputs, outputs, stream):
# Ref: https://docs.nvidia.com/deeplearning/tensorrt/migration-guide/index.html
# Set up the tensor addresses required by execute_async_v3
for i in range(engine.num_io_tensors):
context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
- L110
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
Change to (execute_async_v2 was removed; execute_async_v3 takes only the stream handle):
# context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
context.execute_async_v3(stream_handle=stream.handle)
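For orientation, the loops edited above live in trt_pycuda.py's allocate_buffers and do_inference_v2 helpers. Below is a self-contained sketch of both after porting to the tensor-name API. It is an approximation, not NNI's exact code: the HostDeviceMem wrapper follows the common TensorRT sample pattern, and the buffer sizing assumes the engine's I/O shapes are static (as they are for the fixed input shape used in this article).
import pycuda.autoinit  # noqa: F401, creates the CUDA context pycuda needs
import pycuda.driver as cuda
import tensorrt as trt

class HostDeviceMem:
    """Pairs a pinned (page-locked) host buffer with its device allocation."""
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        size = trt.volume(engine.get_tensor_shape(name))   # assumes static I/O shapes
        dtype = trt.nptype(engine.get_tensor_dtype(name))
        host_mem = cuda.pagelocked_empty(size, dtype)       # pinned host buffer
        device_mem = cuda.mem_alloc(host_mem.nbytes)        # matching device buffer
        bindings.append(int(device_mem))
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def do_inference_v2(engine, context, bindings, inputs, outputs, stream):
    # TensorRT 10: tensor addresses are registered on the context instead of being
    # passed as a bindings list to execute_async_v2.
    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
    # Host -> device copies for the inputs.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Enqueue inference; execute_async_v3 takes only the stream handle.
    context.execute_async_v3(stream_handle=stream.handle)
    # Device -> host copies for the outputs.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]
With these two helpers in place, the call site in integrated_tensorrt.py becomes the common.do_inference_v2(self.engine, self.context, ...) call shown earlier.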
Experiment
We quantize ResNet18 with NNI and then use TensorRT to verify the resulting speedup.
Experiment Setup
- Platform information
This Benchmark is running on the following Hardware:
CPU Information:
CPU Brand: Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz
CPU Architecture: X86_64
CPU Count: 40
GPU Information:
GPU Device: Tesla T4
GPU Count: 1
CUDA Version: 12.1
GPU Memory Usage:
Allocated: 0.00MB
Cached: 0.00MB
This Benchmark is running on the following Software:
PyTorch Version: 2.4.1+cu121
ONNX Version: 1.17.0
ONNXRuntime Version: 1.19.2
TensorRT Version: 10.9.0.34
- Code
import torch
import torch.nn.functional as F
import torch.nn as nn
import torchvision.models as models
model = models.resnet18(pretrained=True)
BATCH_SIZE = 32
NUM_CLASSES = 1000
INPUT_SHAPE = (BATCH_SIZE, 3, 32, 32)
OUTPUT_SHAPE = (BATCH_SIZE, NUM_CLASSES)
dummy_input = torch.randn(BATCH_SIZE, 3, 32, 32)
cnn_model_onnx_save_path = "resnet18_pytorch.onnx"
torch.onnx.export(
model,
dummy_input,  # shape = (BATCH_SIZE, 3, 32, 32)
cnn_model_onnx_save_path,
input_names=["input"],
output_names=["output"],
dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
opset_version=11,
)
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np
import torch.onnx
import torchvision.models as models
from torchvision.models import ResNet18_Weights
import torch
import time
import onnx
import onnxruntime as ort
import cpuinfo
import matplotlib.pyplot as plt
def build_engine(onnx_path, input_shape):
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
)
parser = trt.OnnxParser(network, logger)
with open(onnx_path, "rb") as f:
if not parser.parse(f.read()):
for i in range(parser.num_errors):
print(parser.get_error(i))
raise RuntimeError("Failed to parse ONNX.")
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) # 1GB
# Pin the dynamic batch dimension: min = opt = max = input_shape
profile = builder.create_optimization_profile()
input_tensor = network.get_input(0)
profile.set_shape(input_tensor.name, input_shape, input_shape, input_shape)
config.add_optimization_profile(profile)
engine_data = builder.build_serialized_network(network, config)
if not engine_data:
raise RuntimeError("Failed to build serialized engine.")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(engine_data)
return engine
# PyTorch inference
def pytorch_inference(model, dummy_input, num_runs=100):
model.eval()
# model.half()
# dummy_input = dummy_input.half()
with torch.no_grad():
# Warmup
for _ in range(10):
_ = model(dummy_input)
# Benchmark
torch.cuda.synchronize()
start = time.time()
for _ in range(num_runs):
_ = model(dummy_input)
torch.cuda.synchronize()
end = time.time()
return (end - start) / num_runs
# ONNX inference
def onnx_inference(
onnx_path,
dummy_input,
num_runs=100,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
):
session = ort.InferenceSession(onnx_path, providers=providers)
print(
f"[ONNX Inference]: Using providers: {session.get_providers()} Args: {providers}"
)
# Warmup
for _ in range(10):
_ = session.run(None, {"input": dummy_input})
# Benchmark
start = time.time()
for _ in range(num_runs):
_ = session.run(None, {"input": dummy_input})
end = time.time()
return (end - start) / num_runs
# TensorRT inference
def tensorrt_inference(engine, d_input, dummy_input, num_runs=100):
context = engine.create_execution_context()
stream = cuda.Stream()
# Allocate host and device memory for output
output_shape = (BATCH_SIZE, 1000) # Based on ResNet18 output shape
h_output = cuda.pagelocked_empty(output_shape, dtype=np.float32)
d_output = cuda.mem_alloc(h_output.nbytes)
# Create bindings
# bindings = [int(d_input), int(d_output)]
# Per https://forums.developer.nvidia.com/t/how-to-correctly-set-up-bindings-for-execute-async-v3/289924, set tensor addresses instead of passing a bindings list
context.set_tensor_address(engine.get_tensor_name(0), int(d_input))
context.set_tensor_address(engine.get_tensor_name(1), int(d_output))
# Warmup
for _ in range(10):
cuda.memcpy_htod_async(d_input, dummy_input, stream)
context.execute_async_v3(stream_handle=stream.handle)
stream.synchronize()
# Benchmark
start = time.time()
for _ in range(num_runs):
cuda.memcpy_htod_async(d_input, dummy_input, stream)
context.execute_async_v3(stream.handle)
stream.synchronize()
end = time.time()
avg_time = (end - start) / num_runs
return avg_time
def quantized_tensorrt_inference(engine, dummy_tensor, num_runs=100):
total_time = 0
# Warmup
for _ in range(10):
output, time_span = engine.inference(dummy_tensor)
# Benchmark
for _ in range(num_runs):
output, time_span = engine.inference(dummy_tensor)
total_time += time_span
avg_time = total_time / num_runs
return avg_time
# Run benchmarks
# CPU Information
cpu_info = cpuinfo.get_cpu_info()
print("This Benchmark is running on the following Hardware:")
print("CPU Information:")
print(f"CPU Brand: {cpu_info['brand_raw']}")
print(f"CPU Architecture: {cpu_info['arch']}")
print(f"CPU Count: {cpu_info['count']}")
# GPU Information
print("\nGPU Information:")
if torch.cuda.is_available():
print(f"GPU Device: {torch.cuda.get_device_name(0)}")
print(f"GPU Count: {torch.cuda.device_count()}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"GPU Memory Usage:")
print(f" Allocated: {torch.cuda.memory_allocated(0)/1024**2:.2f}MB")
print(f" Cached: {torch.cuda.memory_reserved(0)/1024**2:.2f}MB")
else:
print("No GPU available")
print("This Benchmark is running on the following Software:")
print(f"PyTorch Version: {torch.__version__}")
print(f"ONNX Version: {onnx.__version__}")
print(f"ONNXRuntime Version: {ort.__version__}")
print(f"TensorRT Version: {trt.__version__}")
# 1. Build the engine
engine = build_engine(cnn_model_onnx_save_path, INPUT_SHAPE)
input_nbytes = int(np.prod(INPUT_SHAPE) * np.float32().nbytes)
d_input = cuda.mem_alloc(input_nbytes)
# Output buffer sized for (BATCH_SIZE, NUM_CLASSES), i.e. 1000 classes
output_nbytes = int(np.prod(OUTPUT_SHAPE) * np.float32().nbytes)
d_output = cuda.mem_alloc(output_nbytes)
bindings = [int(d_input), int(d_output)]
dummy_input = np.random.rand(*INPUT_SHAPE).astype(np.float32)
# dummy_input_pytorch = torch.tensor(dummy_input).cuda()
tensorrt_time = tensorrt_inference(engine, d_input, dummy_input)
onnx_cpu_time = onnx_inference(
cnn_model_onnx_save_path, dummy_input, providers=["CPUExecutionProvider"]
)
onnx_gpu_time = onnx_inference(
cnn_model_onnx_save_path, dummy_input, providers=["CUDAExecutionProvider"]
)
onnx_tensorrt_time = onnx_inference(
cnn_model_onnx_save_path,
dummy_input,
providers=["TensorrtExecutionProvider"],
)
dummy_input_pytorch = torch.tensor(dummy_input)
pytorch_cpu_time = pytorch_inference(model.cpu(), dummy_input_pytorch.cpu())
pytorch_gpu_time = pytorch_inference(model.cuda(), dummy_input_pytorch.cuda())
from tquant.quantization.quantizer import QuantizationManager
from tquant.quantization.utils import create_optimizer
from torchvision.datasets import CIFAR10
from torchvision import transforms
config_list = [
{
"op_types": ["Conv2d", "Linear"],
"target_names": ["weight"],
"quant_dtype": "int8",
"quant_scheme": "affine",
"granularity": "default",
},
]
device = "cuda" if torch.cuda.is_available() else "cpu"
ptq_manager = QuantizationManager('ptq', model, config_list, device)
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)
calibration_loader = torch.utils.data.DataLoader(
dataset=CIFAR10(root="./data", train=True, download=True, transform=transform),
batch_size=BATCH_SIZE,
shuffle=True,
)
optimizer = create_optimizer(model, optimizer_name="SGD", lr=0.001)
scheduler = None
quantized_model, calibration_config = ptq_manager.quantize(
calibration_loader, optimizer, scheduler
)
print(calibration_config)
calib_data = None
for image, target in calibration_loader:
calib_data = image.numpy()
break
from nni.compression.quantization_speedup.calibrator import Calibrator
# TensorRT processes the calibration data in batches of BATCH_SIZE
calib = Calibrator(
calib_data,
"data/cache/calib_cache_file.cache", # Replace with your own cache file path(absolute path)
batch_size=BATCH_SIZE,
)
from nni.compression.quantization_speedup import ModelSpeedupTensorRT
quant_engine = ModelSpeedupTensorRT(
model, input_shape=INPUT_SHAPE, config=calibration_config
)
quant_engine.compress_with_calibrator(calib)
quantize_tensorrt_inference_time = quantized_tensorrt_inference(
quant_engine, dummy_input_pytorch
)
times = [
pytorch_cpu_time * 1000,
onnx_cpu_time * 1000,
pytorch_gpu_time * 1000,
onnx_gpu_time * 1000,
onnx_tensorrt_time * 1000,
tensorrt_time * 1000,
quantize_tensorrt_inference_time * 1000,
]
labels = [
"PyTorch CPU",
"ONNX CPU",
"PyTorch GPU",
"ONNX GPU",
"ONNX TensorRT",
"TensorRT",
"TensorRT(Quantized)",
]
plt.figure(figsize=(15, 10))
plt.bar(labels, times)
plt.title("Inference Time Comparison")
plt.ylabel("Time (ms)")
plt.grid(True, alpha=0.3)
for i, v in enumerate(times):
plt.text(i, v + 0.1, f"{v:.2f}ms", ha="center")
plt.show()
import numpy as np
import seaborn as sns
# Create an n x n matrix where each cell is time_row / time_col
n = len(times)
comparison_matrix = np.zeros((n, n))
for i in range(n):
for j in range(n):
comparison_matrix[i][j] = times[i] / times[j]
# Create heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(
comparison_matrix,
annot=True,
fmt=".2f",
xticklabels=labels,
yticklabels=labels,
cmap="YlOrRd",
)
plt.title("Speed Comparison Matrix (row/column)")
plt.xlabel("Framework (denominator)")
plt.ylabel("Framework (numerator)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Experiment Results
- Inference time comparison chart
- Speedup heatmap
As the charts show, the quantized ResNet18 reduces inference time compared with its floating-point counterpart running on PyTorch CPU, ONNX CPU, PyTorch GPU, ONNX GPU, ONNX TensorRT, and plain TensorRT, with gains ranging from modest to substantial depending on the platform.
Conclusion
With this, the adaptation of NNI to TensorRT 10 is complete.