link
在使用摄像头直接检测目标时,检测的实时画面还是有点慢,下面是tensorrt加速过程记录。
一、设备
1、设备jetson agx xavier
2、jetpack4.6.1
3、tensorrt 8.2.1.8
4、conda虚拟环境 python=3.6
二、虚拟环境搭建及依赖
1、参考此博客安装torch
Nvidia jetson xavier agx 安装pytorch1.9.0 Gpu版_Ponnyao的博客-CSDN博客_xavier安装pytorch
2、安装pycuda
-
conda activate pytorch #我的虚拟环境名字是pytorch
-
pip3 install pycuda
3、虚拟环境中使用tensorrt
-
#查看tensorrt路径
-
sudo find / -name tensorrt*
-
-
#进入虚拟环境的此路径
-
cd /home/nvidia/archiconda/envs/pytorch/lib/python3.6/site-packages
-
-
#设置软连接
-
ln -s /usr/lib/python3.6/dist-packages/tensorrt
-
-
#上一步不行的话用这个
-
ln -s /usr/lib/python3.6/dist-packages/tensorrt/tensorrt.so
三、加速过程
我的项目yolov5_tensorrt-深度学习文档类资源-CSDN下载
1、下载项目
以yolov5 _6.0为例
-
mkdir yolov5_tensorrt
-
cd yolov5_tensorrt
-
git clone -b v6.0 https://github.com/ultralytics/yolov5.git
-
git clone https://github.com/wang-xinyu/tensorrtx.git
2、下载yolov5s.pt文件
下载后,放到 yolov5_tensorrt/yolov5文件夹下
https://github.com/ultralytics/yolov5/releases/tag/v6.0
3、转换模型pt->wts
-
cp yolov5_tensorrt/tensorrtx/yolov5/gen_wts.py yolov5_tensorrt/yolov5
-
cd yolov5_tensorrt/yolov5
-
python3 gen_wts.py -w yolov5s.pt -o yolov5s.wts
4、生成引擎文件
-
cd yolov5_tensorrt/tensorrtx/yolov5/
-
mkdir build
-
cd build
-
cp yolov5_tensorrt/yolov5/yolov5s.wts yolov5_tensorrt/tensorrtx/yolov5/build
-
cmake ..
-
make
-
sudo ./yolov5 -s yolov5s.wts yolov5s.engine s
生成yolov5s.engine。
5、摄像头加速
原作者只有图片加速,下面是大神修改的摄像头加速文件。
yolov5_trt_cam.py
-
"""
An example that uses TensorRT's Python api to make inferences.
"""
-
import argparse
import ctypes
import os
import random
import shutil
import sys
import threading
import time

import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401  (kept: imported for its import-time side effect)
import pycuda.driver as cuda
import tensorrt as trt
import torch
import torchvision
-
-
# Detections whose confidence score is not above this value are discarded
# in YoLov5TRT.post_process.
CONF_THRESH = 0.5
# IoU threshold handed to torchvision.ops.nms in YoLov5TRT.post_process.
IOU_THRESHOLD = 0.4
-
-
-
def get_img_path_batches(batch_size, img_dir):
    """Collect every file under ``img_dir`` and group the paths into batches.

    The directory is walked recursively with ``os.walk``; files are taken in
    the order ``os.walk`` yields them (no sorting is applied).

    param:
        batch_size: maximum number of paths per batch, must be >= 1
        img_dir:    root directory to walk
    return:
        a list of lists of paths; every batch holds ``batch_size`` paths
        except possibly the last one. Empty list when no file is found.
    raises:
        ValueError: if ``batch_size`` is smaller than 1 (a non-positive size
                    would otherwise produce an empty first batch followed by
                    one unbounded batch).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    ret = []
    batch = []
    for root, dirs, files in os.walk(img_dir):
        for name in files:
            # Current batch is full: store it and start a new one.
            if len(batch) == batch_size:
                ret.append(batch)
                batch = []
            batch.append(os.path.join(root, name))
    # Keep the trailing, possibly shorter, batch.
    if batch:
        ret.append(batch)
    return ret
-
-
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Plots one bounding box on image img (drawn in place),
                 this function comes from YoLov5 project.
    param:
        x:      a box likes [x1,y1,x2,y2]
        img:    a opencv image object
        color:  color to draw rectangle, such as (0,255,0);
                a random color is picked when omitted
        label:  str, optional text drawn in a filled box above the rectangle
        line_thickness: int, derived from the image size when omitted
    return:
        no return
    """
    tl = (
        line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    )  # line/font thickness
    color = color or [random.randint(0, 255) for _ in range(3)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        # Background box for the label, sitting on top of the upper-left corner.
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
        cv2.putText(
            img,
            label,
            (c1[0], c1[1] - 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA,
        )
-
-
-
class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.

    NOTE: ``infer`` looks up the class names in a module-level ``categories``
    list, which this file only defines inside its ``__main__`` section.
    """

    def __init__(self, engine_file_path):
        """
        description: Load a serialized TensorRT engine and allocate its I/O buffers.
        param:
            engine_file_path: path of the serialized ``.engine`` file
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                print('bingding:', binding, engine.get_binding_shape(binding))
                size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    # The last two dims of the input binding give the network size.
                    self.input_w = engine.get_binding_shape(binding)[-1]
                    self.input_h = engine.get_binding_shape(binding)[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except BaseException:
            # Do not leave the freshly created context on the context stack
            # when loading the engine or allocating buffers fails.
            self.ctx.pop()
            raise

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, input_image_path):
        """
        description: Run detection on one frame and draw the results on it.
        param:
            input_image_path: despite its name, a BGR image array (e.g. a
                              frame from cv2.VideoCapture); it is passed
                              straight to preprocess_image
        return:
            image_raw: the input image with boxes and labels drawn on it
            elapsed:   seconds spent on host<->device copies plus inference
        """
        self.input_image_path = input_image_path
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(
                input_image_path
            )
            # The single preprocessed frame is broadcast over the batch dim.
            batch_input_image = np.empty(
                shape=[self.batch_size, 3, self.input_h, self.input_w],
                dtype=np.float32,
            )
            np.copyto(batch_input_image, input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(
                batch_size=self.batch_size,
                bindings=bindings,
                stream_handle=stream.handle,
            )
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack,
            # deactivating it -- also when something above raised.
            self.ctx.pop()

        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        result_boxes, result_scores, result_classid = self.post_process(
            output, origin_h, origin_w
        )
        # Draw rectangles and labels on the original image
        for j in range(len(result_boxes)):
            box = result_boxes[j]
            plot_one_box(
                box,
                image_raw,
                label="{}:{:.2f}".format(
                    categories[int(result_classid[j])], result_scores[j]
                ),
            )
        return image_raw, end - start

    def destroy(self):
        """
        description: Release the CUDA context created in __init__.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image paths
        return:
            a generator yielding the result of cv2.imread for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        param:
            image_path_batch: unused
        return:
            a generator yielding batch_size all-zero uint8 images of shape
            [input_h, input_w, 3]
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, input_image_path):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: the BGR image array itself (the name is
                              historical; ``.shape`` and cv2.cvtColor are
                              applied to it directly)
        return:
            image:     the processed image
            image_raw: the original image
            h:         original height
            w:         original width
        """
        image_raw = input_image_path
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128).
        # The color must go in as ``value=``: the 7th positional parameter of
        # cv2.copyMakeBorder is ``dst``, so a positional tuple is not used as
        # the border color.
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2]
                     where xy1=top-left, xy2=bottom-right, undoing the
                     padding and scaling applied by preprocess_image so the
                     result is in original-image coordinates.
        param:
            origin_h: height of original image
            origin_w: width of original image
            x:        A boxes tensor, each row is a box [center_x, center_y, w, h]
        return:
            y:        A boxes tensor, each row is a box [x1, y1, x2, y2]
        """
        y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded vertically: remove the top padding from y coords.
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Image was padded horizontally: remove the left padding from x coords.
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:   A tensor likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h: height of original image
            origin_w: width of original image
        return:
            result_boxes:   final boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
            result_scores:  final scores, a tensor, each element is the score corresponding to box
            result_classid: final classid, a tensor, each element is the classid corresponding to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # to a torch Tensor
        pred = torch.Tensor(pred).cuda()
        # Get the boxes
        boxes = pred[:, :4]
        # Get the scores
        scores = pred[:, 4]
        # Get the classid
        classid = pred[:, 5]
        # Choose those boxes that score > CONF_THRESH
        si = scores > CONF_THRESH
        boxes = boxes[si, :]
        scores = scores[si]
        classid = classid[si]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes = self.xywh2xyxy(origin_h, origin_w, boxes)
        # Do nms
        indices = torchvision.ops.nms(boxes, scores, iou_threshold=IOU_THRESHOLD).cpu()
        result_boxes = boxes[indices, :].cpu()
        result_scores = scores[indices].cpu()
        result_classid = classid[indices].cpu()
        return result_boxes, result_scores, result_classid
-
-
-
class inferThread(threading.Thread):
    """Thin Thread subclass that forwards frames to a detector wrapper.

    No ``run`` method is defined here; ``infer`` executes synchronously in
    the calling thread.
    """

    def __init__(self, yolov5_wrapper):
        """
        param:
            yolov5_wrapper: object exposing ``infer(frame)`` that returns
                            ``(annotated_image, elapsed_seconds)``
        """
        threading.Thread.__init__(self)
        self.yolov5_wrapper = yolov5_wrapper

    def infer(self, frame):
        """
        description: Run the wrapped detector on one frame.
        param:
            frame: the image handed to ``yolov5_wrapper.infer``
        return:
            whatever ``yolov5_wrapper.infer`` returned, as a
            (batch_image_raw, use_time) pair
        """
        batch_image_raw, use_time = self.yolov5_wrapper.infer(frame)
        return batch_image_raw, use_time
-
-
class warmUpThread(threading.Thread):
    """Thread that runs one inference on an all-zero frame to warm up."""

    def __init__(self, yolov5_wrapper):
        """
        param:
            yolov5_wrapper: object exposing ``get_raw_image_zeros()`` and
                            ``infer(frame)``
        """
        threading.Thread.__init__(self)
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        """
        description: Infer once on a blank frame and print shape and timing.
        """
        # infer() takes a single frame, while get_raw_image_zeros() is a
        # generator of frames -- take the first one instead of passing the
        # generator itself.
        blank_frame = next(iter(self.yolov5_wrapper.get_raw_image_zeros()))
        batch_image_raw, use_time = self.yolov5_wrapper.infer(blank_frame)
        # infer() returns one image (not a list), so report its own shape.
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw.shape, use_time * 1000))
-
-
-
-
if __name__ == "__main__":
    # Command line: --engine selects the serialized engine, --save is
    # accepted for compatibility but not used below.
    parser = argparse.ArgumentParser()
    parser.add_argument('--engine', nargs='+', type=str, default="build/yolov5s.engine", help='.engine path(s)')
    parser.add_argument('--save', type=int, default=0, help='save?')
    opt = parser.parse_args()
    PLUGIN_LIBRARY = "build/libmyplugins.so"
    engine_file_path = opt.engine
    # With nargs='+' a value given on the command line arrives as a list,
    # while the default stays a plain string; only one engine is loaded, so
    # use the first path.
    if isinstance(engine_file_path, (list, tuple)):
        engine_file_path = engine_file_path[0]

    # load custom plugins
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels
    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
                  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
                  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
                  "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                  "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                  "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
                  "hair drier", "toothbrush"]
    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    cap = cv2.VideoCapture(0)
    try:
        # inferThread defines no run(); it is only used as a synchronous
        # wrapper, so it is not started.
        thread1 = inferThread(yolov5_wrapper)
        while True:
            ret, frame = cap.read()
            if not ret:
                # No frame could be grabbed (camera missing or stream ended).
                print("failed to read a frame from the camera, stopping")
                break
            img, t = thread1.infer(frame)
            cv2.imshow("result", img)
            if cv2.waitKey(1) & 0XFF == ord('q'):  # 1 millisecond
                break

    finally:
        # destroy the instance
        cap.release()
        cv2.destroyAllWindows()
        yolov5_wrapper.destroy()
参考
tensorrtx/yolov5 at master · wang-xinyu/tensorrtx · GitHub
Jetson AGX Xavier实现TensorRT加速YOLOv5进行实时检测_围白的尾巴的博客-CSDN博客