Introduction
This article involves two frameworks, at the following versions:
- NNI (Neural Network Intelligence): 3.0
- TensorRT: 10.9.0.34
NNI's documentation, Speed Up Quantized Model with TensorRT, describes how to use TensorRT to accelerate a model quantized by NNI. However, judging from NNI's source code at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/integrated_tensorrt.py:
- https://github.com/microsoft/nni/blob/767ed7f22e1e588ce76cbbecb6c6a4a76a309805/nni/compression/quantization_speedup/integrated_tensorrt.py#L14
TRT8 = 8
- https://github.com/microsoft/nni/blob/767ed7f22e1e588ce76cbbecb6c6a4a76a309805/nni/compression/quantization_speedup/integrated_tensorrt.py#L292
assert trt_version >= TRT8, "Version of TensorRT is too old, please \
update TensorRT to version >= 8.0"
NNI in fact only targets the TensorRT 8 API. TensorRT has since moved on to version 10 and NNI is no longer maintained, so it is necessary to adapt NNI to TensorRT 10.
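Incidentally, the version check above is also fragile on a two-digit major release if the major number is read naively from the version string. A minimal illustration; the string is simply what tensorrt.__version__ reports for the release used in this article:
version = "10.9.0.34"        # tensorrt.__version__ for TensorRT 10.9
int(version[0])              # -> 1: reading a single character mis-parses the major version
int(version.split('.')[0])   # -> 10: splitting on '.' recovers the real major version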
This article draws on:
- the TensorRT API Migration Guide
- personal hands-on experience
to complete the adaptation.
Adaptation Process
Modifying integrated_tensorrt.py
integrated_tensorrt.py is located at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/integrated_tensorrt.py. The individual edits are listed below; a consolidated sketch of the resulting TensorRT 10 build flow follows the list.
- L291
assert trt_version >= TRT8, "Version of TensorRT is too old, please \
update TensorRT to version >= 8.0"
Change to (parse the major version from the full dotted string so the check still passes on TensorRT 10.x):
trt_version = int(trt.__version__.split('.')[0])
assert trt_version >= TRT8, "Version of TensorRT is too old, please \
update TensorRT to version >= 8.0"
- L231-L232
builder.max_batch_size = input_shape[0]
trt_config.max_workspace_size = common.GiB(8)
Change to (both settings were removed in TensorRT 10; the workspace budget now goes through the memory-pool API):
# builder.max_batch_size = input_shape[0]
# trt_config.max_workspace_size = common.GiB(8)
trt_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 << 30)
- L255
engine = builder.build_engine(network, trt_config)
Change to (build_engine was removed; build a serialized plan and deserialize it with a Runtime):
# engine = builder.build_engine(network, trt_config)
engine_data = builder.build_serialized_network(network, trt_config)
if not engine_data:
raise RuntimeError("Failed to build serialized engine.")
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(engine_data)
- L354
engine_input_shape = self.engine.get_binding_shape(0)
Change to (the binding-index lookup was removed; tensors are now addressed by name):
# engine_input_shape = self.engine.get_binding_shape(0)
engine_input_shape = self.engine.get_tensor_shape(self.engine.get_tensor_name(0))
- L365
trt_outputs = common.do_inference_v2(self.context, bindings=self.bindings, inputs=self.inputs,
outputs=self.outputs, stream=self.stream)
Change to (do_inference_v2 now also takes the engine; see the trt_pycuda.py changes below):
trt_outputs = common.do_inference_v2(self.engine, self.context,
self.bindings,
inputs=self.inputs,
outputs=self.outputs, stream=self.stream)
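Taken together, the first three edits above (the version check, the builder config, and the engine build) amount to the TensorRT 10 build flow. The following is a hedged, self-contained sketch of that flow rather than NNI's exact function: the builder and parsed network are assumed to exist already, build_trt10_engine is an illustrative name, and the 8 GiB workspace matches the value used above.
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
TRT8 = 8

def build_trt10_engine(builder, network):
    # Parse the major version from the full dotted string, so "10.9.0.34" -> 10.
    trt_version = int(trt.__version__.split('.')[0])
    assert trt_version >= TRT8, "Version of TensorRT is too old, please update TensorRT to version >= 8.0"

    trt_config = builder.create_builder_config()
    # Replaces builder.max_batch_size / trt_config.max_workspace_size: explicit-batch
    # networks carry the batch size in their input shapes, and the workspace budget is
    # now a memory-pool limit (8 << 30 bytes == 8 GiB, the value common.GiB(8) provided).
    trt_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 << 30)

    # build_engine() is gone: build a serialized plan, then deserialize it into an engine.
    engine_data = builder.build_serialized_network(network, trt_config)
    if engine_data is None:
        raise RuntimeError("Failed to build serialized engine.")
    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(engine_data)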
Modifying trt_pycuda.py
trt_pycuda.py is located at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/trt_pycuda.py. The edits are listed below, followed by self-contained sketches of the two affected helpers.
- L77
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
Change to:
# Ref: https://docs.nvidia.com/deeplearning/tensorrt/migration-guide/index.html
# Modified for compatibility with TensorRT 10
"""
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
"""
for i in range(engine.num_io_tensors):
tensor_name = engine.get_tensor_name(i)
size = trt.volume(engine.get_tensor_shape(tensor_name))
dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))
- L93
if engine.binding_is_input(binding):
Change to:
# if engine.binding_is_input(binding):
if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
- L102
def do_inference_v2(context, bindings, inputs, outputs, stream):
Change to (the engine is needed to enumerate the I/O tensor names):
def do_inference_v2(engine, context, bindings, inputs, outputs, stream):
# Ref: https://docs.nvidia.com/deeplearning/tensorrt/migration-guide/index.html
# Set up the tensor addresses required by execute_async_v3
for i in range(engine.num_io_tensors):
context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
- L110
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
Change to (execute_async_v2 was removed; execute_async_v3 takes only the stream handle):
# context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
context.execute_async_v3(stream_handle=stream.handle)
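For orientation, the loops edited above live in trt_pycuda.py's allocate_buffers and do_inference_v2 helpers. Below is a self-contained sketch of both after porting to the tensor-name API. It is an approximation, not NNI's exact code: the HostDeviceMem wrapper follows the common TensorRT sample pattern, and the buffer sizing assumes the engine's I/O shapes are static (as they are for the fixed input shape used in this article).
import pycuda.autoinit  # noqa: F401, creates the CUDA context pycuda needs
import pycuda.driver as cuda
import tensorrt as trt

class HostDeviceMem:
    """Pairs a pinned (page-locked) host buffer with its device allocation."""
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        size = trt.volume(engine.get_tensor_shape(name))   # assumes static I/O shapes
        dtype = trt.nptype(engine.get_tensor_dtype(name))
        host_mem = cuda.pagelocked_empty(size, dtype)       # pinned host buffer
        device_mem = cuda.mem_alloc(host_mem.nbytes)        # matching device buffer
        bindings.append(int(device_mem))
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def do_inference_v2(engine, context, bindings, inputs, outputs, stream):
    # TensorRT 10: tensor addresses are registered on the context instead of being
    # passed as a bindings list to execute_async_v2.
    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
    # Host -> device copies for the inputs.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Enqueue inference; execute_async_v3 takes only the stream handle.
    context.execute_async_v3(stream_handle=stream.handle)
    # Device -> host copies for the outputs.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]
With these two helpers in place, the call site in integrated_tensorrt.py becomes the common.do_inference_v2(self.engine, self.context, ...) call shown earlier.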
Experiment
We quantize ResNet18 with NNI and then use TensorRT to verify the resulting speedup.
Experiment Setup
- Platform information
This Benchmark is running on the following Hardware:
CPU Information:
CPU Brand: Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz
CPU Architecture: X86_64
CPU Count: 40
GPU Information:
GPU Device: Tesla T4
GPU Count: 1
CUDA Version: 12.1
GPU Memory Usage:
Allocated: 0.00MB
Cached: 0.00MB
This Benchmark is running on the following Software:
PyTorch Version: 2.4.1+cu121
ONNX Version: 1.17.0
ONNXRuntime Version: 1.19.2
TensorRT Version: 10.9.0.34
- Code
import torch
import torch.nn.functional as F
import torch.nn as nn
import torchvision.models as models
model = models.resnet18(pretrained=True)
BATCH_SIZE = 32
NUM_CLASSES = 1000
INPUT_SHAPE = (BATCH_SIZE, 3, 32, 32)
OUTPUT_SHAPE = (BATCH_SIZE, NUM_CLASSES)
dummy_input = torch.randn(BATCH_SIZE, 3, 32, 32)
cnn_model_onnx_save_path = "resnet18_pytorch.onnx"
torch.onnx.export(
model,
dummy_input,  # shape = (BATCH_SIZE, 3, 32, 32)
cnn_model_onnx_save_path,
input_names=["input"],
output_names=["output"],
dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
opset_version=11,
)
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np
import torch.onnx
import torchvision.models as models
from torchvision.models import ResNet18_Weights
import torch
import time
import onnx
import onnxruntime as ort
import cpuinfo
import matplotlib.pyplot as plt
def build_engine(onnx_path, input_shape):
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
)
parser = trt.OnnxParser(network, logger)
with open(onnx_path, "rb") as f:
if not parser.parse(f.read()):
for i in range(parser.num_errors):
print(parser.get_error(i))
raise RuntimeError("Failed to parse ONNX.")
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) # 1GB
# Pin the dynamic batch dimension: min = opt = max = input_shape
profile = builder.create_optimization_profile()
input_tensor = network.get_input(0)
profile.set_shape(input_tensor.name, input_shape, input_shape, input_shape)
config.add_optimization_profile(profile)
engine_data = builder.build_serialized_network(network, config)
if not engine_data:
raise RuntimeError("Failed to build serialized engine.")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(engine_data)
return engine
# PyTorch inference
def pytorch_inference(model, dummy_input, num_runs=100):
model.eval()
# model.half()
# dummy_input = dummy_input.half()
with torch.no_grad():
# Warmup
for _ in range(10):
_ = model(dummy_input)
# Benchmark
torch.cuda.synchronize()
start = time.time()
for _ in range(num_runs):
_ = model(dummy_input)
torch.cuda.synchronize()
end = time.time()
return (end - start) / num_runs
# ONNX inference
def onnx_inference(
onnx_path,
dummy_input,
num_runs=100,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
):
session = ort.InferenceSession(onnx_path, providers=providers)
print(
f"[ONNX Inference]: Using providers: {session.get_providers()} Args: {providers}"
)
# Warmup
for _ in range(10):
_ = session.run(None, {"input": dummy_input})
# Benchmark
start = time.time()
for _ in range(num_runs):
_ = session.run(None, {"input": dummy_input})
end = time.time()
return (end - start) / num_runs
# TensorRT inference
def tensorrt_inference(engine, d_input, dummy_input, num_runs=100):
context = engine.create_execution_context()
stream = cuda.Stream()
# Allocate host and device memory for output
output_shape = (BATCH_SIZE, 1000) # Based on ResNet18 output shape
h_output = cuda.pagelocked_empty(output_shape, dtype=np.float32)
d_output = cuda.mem_alloc(h_output.nbytes)
# Create bindings
# bindings = [int(d_input), int(d_output)]
# Per https://forums.developer.nvidia.com/t/how-to-correctly-set-up-bindings-for-execute-async-v3/289924, set tensor addresses instead of passing a bindings list
context.set_tensor_address(engine.get_tensor_name(0), int(d_input))
context.set_tensor_address(engine.get_tensor_name(1), int(d_output))
# Warmup
for _ in range(10):
cuda.memcpy_htod_async(d_input, dummy_input, stream)
context.execute_async_v3(stream_handle=stream.handle)
stream.synchronize()
# Benchmark
start = time.time()
for _ in range(num_runs):
cuda.memcpy_htod_async(d_input, dummy_input, stream)
context.execute_async_v3(stream.handle)
stream.synchronize()
end = time.time()
avg_time = (end - start) / num_runs
return avg_time
def quantized_tensorrt_inference(engine, dummy_tensor, num_runs=100):
total_time = 0
# Warmup
for _ in range(10):
output, time_span = engine.inference(dummy_tensor)
# Benchmark
for _ in range(num_runs):
output, time_span = engine.inference(dummy_tensor)
total_time += time_span
avg_time = total_time / num_runs
return avg_time
# Run benchmarks
# CPU Information
cpu_info = cpuinfo.get_cpu_info()
print("This Benchmark is running on the following Hardware:")
print("CPU Information:")
print(f"CPU Brand: {cpu_info['brand_raw']}")
print(f"CPU Architecture: {cpu_info['arch']}")
print(f"CPU Count: {cpu_info['count']}")
# GPU Information
print("\nGPU Information:")
if torch.cuda.is_available():
print(f"GPU Device: {torch.cuda.get_device_name(0)}")
print(f"GPU Count: {torch.cuda.device_count()}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"GPU Memory Usage:")
print(f" Allocated: {torch.cuda.memory_allocated(0)/1024**2:.2f}MB")
print(f" Cached: {torch.cuda.memory_reserved(0)/1024**2:.2f}MB")
else:
print("No GPU available")
print("This Benchmark is running on the following Software:")
print(f"PyTorch Version: {torch.__version__}")
print(f"ONNX Version: {onnx.__version__}")
print(f"ONNXRuntime Version: {ort.__version__}")
print(f"TensorRT Version: {trt.__version__}")
# 1. Build the engine
engine = build_engine(cnn_model_onnx_save_path, INPUT_SHAPE)
input_nbytes = int(np.prod(INPUT_SHAPE) * np.float32().nbytes)
d_input = cuda.mem_alloc(input_nbytes)
# Output buffer sized for (BATCH_SIZE, NUM_CLASSES), i.e. 1000 classes
output_nbytes = int(np.prod(OUTPUT_SHAPE) * np.float32().nbytes)
d_output = cuda.mem_alloc(output_nbytes)
bindings = [int(d_input), int(d_output)]
dummy_input = np.random.rand(*INPUT_SHAPE).astype(np.float32)
# dummy_input_pytorch = torch.tensor(dummy_input).cuda()
tensorrt_time = tensorrt_inference(engine, d_input, dummy_input)
onnx_cpu_time = onnx_inference(
cnn_model_onnx_save_path, dummy_input, providers=["CPUExecutionProvider"]
)
onnx_gpu_time = onnx_inference(
cnn_model_onnx_save_path, dummy_input, providers=["CUDAExecutionProvider"]
)
onnx_tensorrt_time = onnx_inference(
cnn_model_onnx_save_path,
dummy_input,
providers=["TensorrtExecutionProvider"],
)
dummy_input_pytorch = torch.tensor(dummy_input)
pytorch_cpu_time = pytorch_inference(model.cpu(), dummy_input_pytorch.cpu())
pytorch_gpu_time = pytorch_inference(model.cuda(), dummy_input_pytorch.cuda())
from tquant.quantization.quantizer import QuantizationManager
from tquant.quantization.utils import create_optimizer
from torchvision.datasets import CIFAR10
from torchvision import transforms
config_list = [
{
"op_types": ["Conv2d", "Linear"],
"target_names": ["weight"],
"quant_dtype": "int8",
"quant_scheme": "affine",
"granularity": "default",
},
]
device = "cuda" if torch.cuda.is_available() else "cpu"
ptq_manager = QuantizationManager('ptq', model, config_list, device)
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)
calibration_loader = torch.utils.data.DataLoader(
dataset=CIFAR10(root="./data", train=True, download=True, transform=transform),
batch_size=BATCH_SIZE,
shuffle=True,
)
optimizer = create_optimizer(model, optimizer_name="SGD", lr=0.001)
scheduler = None
quantized_model, calibration_config = ptq_manager.quantize(
calibration_loader, optimizer, scheduler
)
print(calibration_config)
calib_data = None
for image, target in calibration_loader:
calib_data = image.numpy()
break
from nni.compression.quantization_speedup.calibrator import Calibrator
# TensorRT processes the calibration data in batches of BATCH_SIZE
calib = Calibrator(
calib_data,
"data/cache/calib_cache_file.cache", # Replace with your own cache file path(absolute path)
batch_size=BATCH_SIZE,
)
from nni.compression.quantization_speedup import ModelSpeedupTensorRT
quant_engine = ModelSpeedupTensorRT(
model, input_shape=INPUT_SHAPE, config=calibration_config
)
quant_engine.compress_with_calibrator(calib)
quantize_tensorrt_inference_time = quantized_tensorrt_inference(
quant_engine, dummy_input_pytorch
)
times = [
pytorch_cpu_time * 1000,
onnx_cpu_time * 1000,
pytorch_gpu_time * 1000,
onnx_gpu_time * 1000,
onnx_tensorrt_time * 1000,
tensorrt_time * 1000,
quantize_tensorrt_inference_time * 1000,
]
labels = [
"PyTorch CPU",
"ONNX CPU",
"PyTorch GPU",
"ONNX GPU",
"ONNX TensorRT",
"TensorRT",
"TensorRT(Quantized)",
]
plt.figure(figsize=(15, 10))
plt.bar(labels, times)
plt.title("Inference Time Comparison")
plt.ylabel("Time (ms)")
plt.grid(True, alpha=0.3)
for i, v in enumerate(times):
plt.text(i, v + 0.1, f"{v:.2f}ms", ha="center")
plt.show()
import numpy as np
import seaborn as sns
# Create an n x n matrix where each cell is time_row / time_col
n = len(times)
comparison_matrix = np.zeros((n, n))
for i in range(n):
for j in range(n):
comparison_matrix[i][j] = times[i] / times[j]
# Create heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(
comparison_matrix,
annot=True,
fmt=".2f",
xticklabels=labels,
yticklabels=labels,
cmap="YlOrRd",
)
plt.title("Speed Comparison Matrix (row/column)")
plt.xlabel("Framework (denominator)")
plt.ylabel("Framework (numerator)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Experiment Results
- Inference time comparison chart
- Speedup heatmap
As the charts show, the quantized ResNet18 reduces inference time compared with its floating-point counterpart running on PyTorch CPU, ONNX CPU, PyTorch GPU, ONNX GPU, ONNX TensorRT, and plain TensorRT, with gains ranging from modest to substantial depending on the platform.
Conclusion
With this, the adaptation of NNI to TensorRT 10 is complete.