PointNet C++ inference deployment with the TensorRT framework


classification

(Figure: the exported ONNX model graph, which contains two output nodes)

As the figure above shows, the ONNX file exported directly from the model has two output nodes, which makes post-processing awkward, so the following script deletes the unneeded output node 193:

import onnx

# Load the exported model and inspect its graph
onnx_model = onnx.load("cls.onnx")
graph = onnx_model.graph

# Print the input and output node names
inputs = graph.input
for input in inputs:
    print('input', input.name)

outputs = graph.output
for output in outputs:
    print('output', output.name)

# Drop the second output (node 193) and save the modified model
graph.output.remove(outputs[1])
onnx.save(onnx_model, 'cls_modified.onnx')
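Re-running the inspection loop on cls_modified.onnx should now print a single output, confirming that node 193 has been removed.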

C++ inference code:

#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <cmath>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>


const int point_num = 1024;


void pc_normalize(std::vector<float>& points)
{
	// Compute the centroid of the cloud
	float mean_x = 0, mean_y = 0, mean_z = 0;
	for (size_t i = 0; i < point_num; ++i)
	{
		mean_x += points[3 * i];
		mean_y += points[3 * i + 1];
		mean_z += points[3 * i + 2];
	}
	mean_x /= point_num;
	mean_y /= point_num;
	mean_z /= point_num;

	// Translate the cloud so the centroid sits at the origin
	for (size_t i = 0; i < point_num; ++i)
	{
		points[3 * i] -= mean_x;
		points[3 * i + 1] -= mean_y;
		points[3 * i + 2] -= mean_z;
	}

	// Find the farthest point from the origin
	float m = 0;
	for (size_t i = 0; i < point_num; ++i)
	{
		float d = std::sqrt(points[3 * i] * points[3 * i]
			+ points[3 * i + 1] * points[3 * i + 1]
			+ points[3 * i + 2] * points[3 * i + 2]);
		if (d > m)
			m = d;
	}

	// Scale the cloud into the unit sphere
	for (size_t i = 0; i < point_num; ++i)
	{
		points[3 * i] /= m;
		points[3 * i + 1] /= m;
		points[3 * i + 2] /= m;
	}
}

class TRTLogger : public nvinfer1::ILogger 
{
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		if (severity <= Severity::kINFO) 
			printf("%s\n", msg);
	}
} logger;

std::vector<unsigned char> load_file(const std::string& file) 
{
	std::ifstream in(file, std::ios::in | std::ios::binary);
	if (!in.is_open())
		return {};

	in.seekg(0, std::ios::end);
	size_t length = in.tellg();

	std::vector<uint8_t> data;
	if (length > 0) 
	{
		in.seekg(0, std::ios::beg);
		data.resize(length);
		in.read((char*)& data[0], length);
	}
	in.close();
	return data;
}

void classifier(std::vector<float>& points)
{
	TRTLogger logger;
	nvinfer1::ICudaEngine* engine;

//#define BUILD_ENGINE

#ifdef  BUILD_ENGINE
	nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
	nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
	// Bit 0 is NetworkDefinitionCreationFlag::kEXPLICIT_BATCH
	nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
		1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

	nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
	if (!parser->parseFromFile("cls_modified.onnx", 1))
	{
		printf("Failed to parse onnx\n");
		return;
	}

	// 1 << 32 overflows a 32-bit int; use an unsigned long long literal (4 GiB)
	config->setMaxWorkspaceSize(1ULL << 32);

	engine = builder->buildEngineWithConfig(*network, *config);
	if (engine == nullptr) {
		printf("Build engine failed.\n");
		return;
	}

	nvinfer1::IHostMemory* model_data = engine->serialize();
	FILE* f = fopen("cls.engine", "wb");
	fwrite(model_data->data(), 1, model_data->size(), f);
	fclose(f);

	model_data->destroy();
	parser->destroy();
	engine->destroy();
	network->destroy();
	config->destroy();
	builder->destroy();
#endif  

	auto engine_data = load_file("cls.engine");
	nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
	engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
	if (engine == nullptr)
	{
		printf("Deserialize cuda engine failed.\n");
		runtime->destroy();
		return;
	}

	nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
	cudaStream_t stream = nullptr;
	cudaStreamCreate(&stream);

	float* input_data_host = nullptr;
	const size_t input_numel = 1 * 3 * point_num;
	cudaMallocHost(&input_data_host, input_numel * sizeof(float));
	// Transpose from interleaved xyz (N x 3) to channel-major (3 x N),
	// matching the network's expected input shape (1, 3, point_num)
	for (size_t i = 0; i < 3; i++)
	{
		for (size_t j = 0; j < point_num; j++)
		{
			input_data_host[point_num * i + j] = points[3 * j + i];
		}
	}

	float* input_data_device = nullptr;
	float output_data_host[10];
	float* output_data_device = nullptr;
	cudaMalloc(&input_data_device, input_numel * sizeof(float));
	cudaMalloc(&output_data_device, sizeof(output_data_host));
	cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream);
	float* bindings[] = { input_data_device, output_data_device };

	bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
	cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	int predict_label = std::max_element(output_data_host, output_data_host + 10) - output_data_host;
	std::cout << "\npredict_label: " << predict_label << std::endl;

	cudaStreamDestroy(stream);
	execution_context->destroy();
	engine->destroy();
	runtime->destroy();
}


int main()
{
	std::vector<float> points;
	std::ifstream infile;
	float x, y, z, nx, ny, nz;
	char ch;
	infile.open("bed_0610.txt");
	for (size_t i = 0; i < point_num; i++)
	{
		infile >> x >> ch >> y >> ch >> z >> ch >> nx >> ch >> ny >> ch >> nz;
		points.push_back(x);
		points.push_back(y);
		points.push_back(z);
	}
	infile.close();

	pc_normalize(points);

	classifier(points);

	return 0;
}
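Note that the BUILD_ENGINE block only needs to run once per GPU and TensorRT version: building an engine from ONNX is slow, but the serialized cls.engine it writes can simply be deserialized on every subsequent run.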

Alternatively, the inference engine can be built directly with trtexec.exe from TensorRT's bin directory.
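For example, a command along these lines (using the file names from this post) produces an equivalent serialized engine:

trtexec --onnx=cls_modified.onnx --saveEngine=cls.engine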
I also implemented a CUDA version of the preprocessing, but it actually seems to run slower than the CPU preprocessing. The data volume is probably just too small (only on the order of 10^3 points), and admittedly my CUDA skills are still at the beginner stage…

#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <cfloat>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>


const int point_num = 1024;
const int thread_num = 1024;
const int block_num = 1;

__global__ void array_sum(float* data, float* val, int N)
{
	// Each thread accumulates a grid-stride partial sum; the block then reduces
	// in shared memory and thread 0 atomically adds the block total into *val.
	// *val must be zero-initialized before launch.
	__shared__ double share_dTemp[thread_num];
	const int nStep = gridDim.x * blockDim.x;
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	double dTempSum = 0.0;
	for (int i = tid; i < N; i += nStep)
	{
		dTempSum += data[i];
	}
	share_dTemp[threadIdx.x] = dTempSum;
	__syncthreads();

	for (int i = blockDim.x / 2; i != 0; i /= 2)
	{
		if (threadIdx.x < i)
		{
			share_dTemp[threadIdx.x] += share_dTemp[threadIdx.x + i];
		}
		__syncthreads();
	}

	if (0 == threadIdx.x)
	{
		atomicAdd(val, share_dTemp[0]);
	}
}

__global__ void array_sub(float* data, float val, int N)
{
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	const int nStep = blockDim.x * gridDim.x;

	for (int i = tid; i < N; i += nStep)
	{
		data[i] = data[i] - val;
	}
}

__global__ void array_L2(float* in, float* out, int N)
{
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	const int nStep = blockDim.x * gridDim.x;

	// in is laid out channel-major (x[0..N), y[0..N), z[0..N));
	// out[i] is the Euclidean norm of point i
	for (int i = tid; i < N; i += nStep)
	{
		out[i] = sqrtf(in[i] * in[i] + in[i + N] * in[i + N] + in[i + 2 * N] * in[i + 2 * N]);
	}
}

__global__ void array_max(float* mem, int numbers) 
{
	int tid = threadIdx.x;
	int idx = blockIdx.x * blockDim.x + tid;
	extern __shared__ float tep[];

	// Load one element per thread; pad out-of-range slots with -FLT_MAX so
	// every thread in the block can safely reach the barriers below
	tep[tid] = (idx < numbers) ? mem[idx] : -FLT_MAX;
	__syncthreads();

	// Standard shared-memory tree reduction (no divergent __syncthreads)
	for (int s = blockDim.x / 2; s > 0; s >>= 1)
	{
		if (tid < s)
			tep[tid] = tep[tid] > tep[tid + s] ? tep[tid] : tep[tid + s];
		__syncthreads();
	}

	// Thread 0 writes this block's maximum back to mem[blockIdx.x]
	if (tid == 0) 
	{
		mem[blockIdx.x] = tep[0];
	}
}

__global__ void array_div(float* data, float val, int N)
{
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	const int nStep = blockDim.x * gridDim.x;

	for (int i = tid; i < N; i += nStep)
	{
		data[i] = data[i] / val;
	}
}

void pc_normalize_gpu(float* points)
{
	// Per-channel sums; the accumulators must be zeroed before array_sum runs
	float *mean_x = NULL, *mean_y = NULL, *mean_z = NULL;
	cudaMalloc((void**)&mean_x, sizeof(float));
	cudaMalloc((void**)&mean_y, sizeof(float));
	cudaMalloc((void**)&mean_z, sizeof(float));
	cudaMemset(mean_x, 0, sizeof(float));
	cudaMemset(mean_y, 0, sizeof(float));
	cudaMemset(mean_z, 0, sizeof(float));
	array_sum << <block_num, thread_num >> > (points + 0 * point_num, mean_x, point_num);
	array_sum << <block_num, thread_num >> > (points + 1 * point_num, mean_y, point_num);
	array_sum << <block_num, thread_num >> > (points + 2 * point_num, mean_z, point_num);

	float mx, my, mz;
	cudaMemcpy(&mx, mean_x, sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(&my, mean_y, sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(&mz, mean_z, sizeof(float), cudaMemcpyDeviceToHost);

	// Subtract the centroid from each channel
	array_sub << <block_num, thread_num >> > (points + 0 * point_num, mx / point_num, point_num);
	array_sub << <block_num, thread_num >> > (points + 1 * point_num, my / point_num, point_num);
	array_sub << <block_num, thread_num >> > (points + 2 * point_num, mz / point_num, point_num);

	// Euclidean norm of every point
	float* L2 = NULL;
	cudaMalloc((void**)&L2, sizeof(float) * point_num);
	array_L2 << <block_num, thread_num >> > (points, L2, point_num);

	// Multi-pass max reduction: each pass leaves one partial maximum per block
	int tmp_num = point_num;
	int share_size = sizeof(float) * thread_num;
	int blocks = (tmp_num + thread_num - 1) / thread_num;
	do {
		array_max << <blocks, thread_num, share_size >> > (L2, tmp_num);
		tmp_num = blocks;
		blocks = (tmp_num + thread_num - 1) / thread_num;
	} while (tmp_num > 1);

	float max;
	cudaMemcpy(&max, L2, sizeof(float), cudaMemcpyDeviceToHost);

	// Scale all three channels into the unit sphere
	array_div << <block_num, thread_num >> > (points + 0 * point_num, max, point_num);
	array_div << <block_num, thread_num >> > (points + 1 * point_num, max, point_num);
	array_div << <block_num, thread_num >> > (points + 2 * point_num, max, point_num);

	cudaFree(mean_x);
	cudaFree(mean_y);
	cudaFree(mean_z);
	cudaFree(L2);
}
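// Note: with only ~10^3 points, the many small kernel launches above (plus the
// intermediate device<->host copies) likely cost more than the arithmetic they
// replace, which would explain why this GPU path measures slower than the CPU
// pc_normalize.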

class TRTLogger : public nvinfer1::ILogger 
{
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		if (severity <= Severity::kINFO) 
			printf("%s\n", msg);
	}
} logger;

std::vector<unsigned char> load_file(const std::string& file) 
{
	std::ifstream in(file, std::ios::in | std::ios::binary);
	if (!in.is_open())
		return {};

	in.seekg(0, std::ios::end);
	size_t length = in.tellg();

	std::vector<uint8_t> data;
	if (length > 0) 
	{
		in.seekg(0, std::ios::beg);
		data.resize(length);
		in.read((char*)& data[0], length);
	}
	in.close();
	return data;
}

void classifier(std::vector<float>& points)
{
	TRTLogger logger;
	nvinfer1::ICudaEngine* engine;

//#define BUILD_ENGINE

#ifdef  BUILD_ENGINE
	nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
	nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
	// Bit 0 is NetworkDefinitionCreationFlag::kEXPLICIT_BATCH
	nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
		1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

	nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
	if (!parser->parseFromFile("cls_modified.onnx", 1))
	{
		printf("Failed to parse onnx\n");
		return;
	}

	// 1 << 32 overflows a 32-bit int; use an unsigned long long literal (4 GiB)
	config->setMaxWorkspaceSize(1ULL << 32);

	engine = builder->buildEngineWithConfig(*network, *config);
	if (engine == nullptr) {
		printf("Build engine failed.\n");
		return;
	}

	nvinfer1::IHostMemory* model_data = engine->serialize();
	FILE* f = fopen("cls.engine", "wb");
	fwrite(model_data->data(), 1, model_data->size(), f);
	fclose(f);

	model_data->destroy();
	parser->destroy();
	engine->destroy();
	network->destroy();
	config->destroy();
	builder->destroy();
#endif  

	auto engine_data = load_file("cls.engine");
	nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
	engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
	if (engine == nullptr)
	{
		printf("Deserialize cuda engine failed.\n");
		runtime->destroy();
		return;
	}

	nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
	cudaStream_t stream = nullptr;
	cudaStreamCreate(&stream);

	float* input_data_host = nullptr;
	const size_t input_numel = 1 * 3 * point_num;
	cudaMallocHost(&input_data_host, input_numel * sizeof(float));
	// Transpose from interleaved xyz (N x 3) to channel-major (3 x N)
	for (size_t i = 0; i < 3; i++)
	{
		for (size_t j = 0; j < point_num; j++)
		{
			input_data_host[point_num * i + j] = points[3 * j + i];
		}
	}

	float* input_data_device = nullptr;
	float output_data_host[10];
	float* output_data_device = nullptr;
	cudaMalloc(&input_data_device, input_numel * sizeof(float));
	cudaMalloc(&output_data_device, sizeof(output_data_host));
	cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream);
	// Normalize on the GPU. These kernels launch on the default stream, which
	// (under legacy default-stream semantics) waits for the copy queued on `stream`.
	pc_normalize_gpu(input_data_device);
	float* bindings[] = { input_data_device, output_data_device };

	bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
	cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	int predict_label = std::max_element(output_data_host, output_data_host + 10) - output_data_host;
	std::cout << "\npredict_label: " << predict_label << std::endl;

	cudaStreamDestroy(stream);
	execution_context->destroy();
	engine->destroy();
	runtime->destroy();
}


int main()
{
	std::vector<float> points;
	std::ifstream infile;
	float x, y, z, nx, ny, nz;
	char ch;
	infile.open("sofa_0020.txt");
	for (size_t i = 0; i < point_num; i++)
	{
		infile >> x >> ch >> y >> ch >> z >> ch >> nx >> ch >> ny >> ch >> nz;
		points.push_back(x);
		points.push_back(y);
		points.push_back(z);
	}
	infile.close();

	classifier(points);

	return 0;
}
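Note that, unlike the CPU version, main no longer calls pc_normalize: the normalization now runs on the GPU inside classifier, after the raw points have been copied to the device.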
