VSCode version
The latest Linux builds of VSCode may be unable to debug Python.
Download 1.85 instead: https://code.visualstudio.com/updates/v1_85
CUDA version
https://developer.nvidia.com/Cuda-Toolkit-archive
Because TensorRT is capped at 8.6 (explained below), the highest usable CUDA version is 12.1, and PyTorch narrows the choice further to 12.1 or 11.8.
After reinstalling the CUDA toolkit, PyTorch must be reinstalled as well.
PyTorch version
https://pytorch.org/get-started/locally/
Select the CUDA 12.1 build:
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
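After installing, it is worth confirming that the PyTorch build really matches the toolkit before moving on to TensorRT. A minimal sanity check using standard PyTorch APIs:

import torch

# The CUDA version this PyTorch build was compiled against; should print 12.1 here.
print(torch.version.cuda)
# True only when the driver and the PyTorch CUDA build are compatible.
print(torch.cuda.is_available())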
TensorRT version
https://developer.nvidia.com/tensorrt/download
CentOS 7 caps TensorRT at 8.6.
The CUDA installer's post-install message, with the path configuration and uninstall commands (the paths below are from a 12.0 install; substitute your actual version):
Please make sure that
- PATH includes /usr/local/cuda-12.0/bin
- LD_LIBRARY_PATH includes /usr/local/cuda-12.0/lib64, or, add /usr/local/cuda-12.0/lib64 to /etc/ld.so.conf and run ldconfig as root
To uninstall the CUDA Toolkit, run cuda-uninstaller in /usr/local/cuda-12.0/bin
To uninstall the NVIDIA Driver, run nvidia-uninstall
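Once TensorRT itself is installed, a quick sanity check that the Python bindings import and report the expected version:

import tensorrt as trt

# Should print 8.6.x on this setup.
print(trt.__version__)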
Errors
nn.Conv1d crash
Using the nn.Conv1d operator in PyTorch segfaults:
[1] 17446 segmentation fault python test.py
Nothing about this turns up in a web search; it is simply a version mismatch. Reinstall CUDA and PyTorch and the crash goes away.
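For reference, a minimal sketch that exercises the failing path (any small Conv1d forward pass on the GPU will do; the sizes here are arbitrary):

import torch
import torch.nn as nn

# A tiny Conv1d forward pass on the GPU; on a mismatched CUDA/PyTorch
# install, this is enough to trigger the segmentation fault.
conv = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3).cuda()
x = torch.randn(1, 3, 16, device="cuda")
print(conv(x).shape)  # torch.Size([1, 8, 14]) on a healthy install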
Verifying the TensorRT environment
Below is a complete example with a loop; run it as-is to check whether TensorRT works.
import numpy as np
import tensorrt as trt
from trt_inference import TRTInference  # the wrapper class shown at the end of this section
logger = trt.Logger(trt.Logger.WARNING)
# class MyLogger(trt.ILogger):
# def __init__(self):
# trt.ILogger.__init__(self)
# def log(self, severity, msg):
# pass # Your custom logging implementation here
# logger = MyLogger()
builder = trt.Builder(logger)
# create_network takes a bitmask of NetworkDefinitionCreationFlag values
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
num_iterations = 3
trip_limit = network.add_constant(shape=(), weights=trt.Weights(np.array([num_iterations], dtype=np.int32)))
accumulator_value = network.add_input("input1", dtype=trt.float32, shape=(2, 3))
accumulator_added_value = network.add_input("input2", dtype=trt.float32, shape=(2, 3))
loop = network.add_loop()
# setting the ITripLimit layer to stop after `num_iterations` iterations
loop.add_trip_limit(trip_limit.get_output(0), trt.TripLimit.COUNT)
# initializing the IRecurrenceLayer with an initial value
rec = loop.add_recurrence(accumulator_value)
# eltwise inputs are `accumulator_added_value` and the IRecurrenceLayer output
eltwise = network.add_elementwise(accumulator_added_value, rec.get_output(0), op=trt.ElementWiseOperation.SUM)
# wiring the IRecurrenceLayer with the output of eltwise.
# The IRecurrenceLayer output is `accumulator_value` for the first iteration, and the eltwise output for every later iteration
rec.set_input(1, eltwise.get_output(0))
# marking the IRecurrenceLayer output as the Loop output
loop_out = loop.add_loop_output(rec.get_output(0), trt.LoopOutput.LAST_VALUE)
# marking the Loop output as the network output
network.mark_output(loop_out.get_output(0))
inputs = {}
outputs = {}
expected = {}
inputs[accumulator_value.name] = np.array(
    [
        [2.7, -4.9, 23.34],
        [8.9, 10.3, -19.8],
    ])
inputs[accumulator_added_value.name] = np.array(
    [
        [1.1, 2.2, 3.3],
        [-5.7, 1.3, 4.6],
    ])
# record the output shape (same as the input shape here) for buffer reshaping
outputs[loop_out.get_output(0).name] = eltwise.get_input(0).shape
expected[loop_out.get_output(0).name] = inputs[accumulator_value.name] + inputs[accumulator_added_value.name] * num_iterations
print("Expected:", expected)
builder_config = builder.create_builder_config()
builder_config.set_flag(trt.BuilderFlag.VERSION_COMPATIBLE)
builder_config.set_flag(trt.BuilderFlag.EXCLUDE_LEAN_RUNTIME)
plan = builder.build_serialized_network(network, builder_config)
# v10_runtime = trt.Runtime(logger)
# v8_shim_runtime = v10_runtime.load_runtime('/home/mark.yj/TensorRT-8.6.1.6/bin/trtexec')
# engine = v10_runtime.deserialize_cuda_engine(plan)
trtInfer = TRTInference(plan)
r = trtInfer.infer(inputs, outputs)
print("Prediction:", r)
The TRTInference used in the code above is a wrapper class:
from collections import OrderedDict

import numpy as np
import pycuda.autoinit  # note: must be imported to initialize the CUDA context
import pycuda.driver as cuda
import tensorrt as trt
class HostDeviceMem(object):
    # Simple container pairing a page-locked host buffer with its device buffer.
    def __init__(self, host_mem, device_mem, dtype, size):
        self.host = host_mem
        self.device = device_mem
        self.dtype = dtype
        self.size = size
def allocate_buffers(engine, context, input_data):
    # Update the binding shapes to match the actual input shapes.
    for key, value in input_data.items():
        r = context.set_binding_shape(engine.get_binding_index(key), value.shape)
        if not r:
            print(f"set binding shape False: {key}")
    inputs = OrderedDict()
    outputs = OrderedDict()
    bindings = []
    for binding_idx, binding_name in enumerate(engine):
        size = trt.volume(context.get_binding_shape(binding_idx))
        dtype = trt.nptype(engine.get_binding_dtype(binding_idx))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to the bindings list.
        bindings.append(int(device_mem))
        # Sort the buffer into the input or output dict.
        if engine.binding_is_input(binding_idx):
            inputs[binding_name] = HostDeviceMem(host_mem, device_mem, dtype, size)
        else:
            outputs[binding_name] = HostDeviceMem(host_mem, device_mem, dtype, size)
    return inputs, outputs, bindings
# This function is generalized for multiple inputs/outputs for full-dimension networks.
# inputs and outputs are expected to be dicts of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs: dict, outputs: dict, stream):
    inputs, outputs = inputs.values(), outputs.values()
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
class TRTInference(object):
    def __init__(self, plan):
        TRT_LOGGER = trt.Logger()
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        # Accept either a serialized plan (bytes) or a path to an engine file.
        if isinstance(plan, str):
            with open(plan, "rb") as f:
                plan = f.read()
        self.trt_engine = self.trt_runtime.deserialize_cuda_engine(plan)
        self.context = self.trt_engine.create_execution_context()

    def infer(self, input_data: dict, output_shapes: dict):
        # Allocate memory for network inputs/outputs on both CPU and GPU.
        self.inputs, self.outputs, self.bindings = \
            allocate_buffers(self.trt_engine, self.context, input_data)
        self.stream = cuda.Stream()
        for binding_name, mem in self.inputs.items():
            # Cast to the binding's dtype and copy into the page-locked host buffer.
            input_fix = np.ascontiguousarray(input_data[binding_name].astype(mem.dtype))
            np.copyto(mem.host, input_fix.ravel())
        # Run inference; results land in the page-locked host buffers of self.outputs.
        do_inference_v2(
            self.context, bindings=self.bindings, inputs=self.inputs,
            outputs=self.outputs, stream=self.stream)
        # do_inference_v2 works with flat arrays, so reshape the outputs
        # before returning them.
        outputs_reshape = []
        for binding_name, shape in output_shapes.items():
            ot = self.outputs[binding_name]
            outputs_reshape.append(ot.host.reshape(shape))
        return outputs_reshape
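Since TRTInference.__init__ also accepts a file path (the isinstance(plan, str) branch), the serialized plan from the loop example can be written to disk and reloaded later. A minimal sketch; the file name loop.engine is arbitrary:

# Persist the serialized engine, then rebuild the wrapper from the file.
with open("loop.engine", "wb") as f:
    f.write(plan)  # `plan` is the serialized network from the loop example above
trtInfer = TRTInference("loop.engine")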