前言
在Windows下使用TCP协议,基于OpenCV C++与Yolov5实现了一个完整的实时推流的深度学习图像处理客户端与服务器端。为了达到实时传输的效果,客户端使用了多线程的方式实现;深度学习模型是基于onnxruntime的GPU推理,实现效果如下:
基于OpenCV C++的网络实时视频流传输
1. 服务器端
1. 1服务器类的实现
首先定义一个服务器类,类实现,接收发送过来的图像并进行解码,并接收客户端检测到的人脸关键点,并画到最终要显示的图像上:
#include "Server.h"
#define SIZE 100
// Default constructor: members are configured later via initialization().
Server::Server()
{
}
// Destructor: sockets and the Winsock library are expected to be released
// explicitly via free_connect(), not here.
Server::~Server()
{
}
// Initializes the server for streaming: stores the listen port and the video
// source, then starts up the Winsock library (Windows-specific) and verifies
// that version 2.2 is available.
// Returns true on success, false if Winsock could not be started or reports
// the wrong version.
bool Server::initialization(const int& port, const cv::VideoCapture& cap)
{
    m_port = port;
    m_cap = cap;
    // Request Winsock version 2.2.
    WORD w_req = MAKEWORD(2, 2);
    WSADATA wsadata;
    int err = WSAStartup(w_req, &wsadata);
    if (err != 0) {
        std::cout << "初始化套接字库失败!" << std::endl;
        // BUG FIX: the original did `return -1;` here, which converts to true
        // in a bool function and made callers believe startup succeeded.
        return false;
    }
    else {
        std::cout << "初始化套接字库成功!" << std::endl;
    }
    // Verify the negotiated Winsock version.
    if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2) {
        std::cout << "套接字库版本号不符!" << std::endl;
        WSACleanup();
        return false;
    }
    else {
        std::cout << "套接字库版本正确!" << std::endl;
    }
    return true;
}
bool Server::initialization(const int& port)
{
m_port = port;
// 初始化库(windows独有)
//初始化套接字库
WORD w_req = MAKEWORD(2, 2);//版本号
WSADATA wsadata;
int err;
err = WSAStartup(w_req, &wsadata);
if (err != 0)
{
std::cout << "初始化套接字库失败!" << std::endl;
return false;
}
else
{
std::cout << "初始化套接字库成功!" << std::endl;
}
//检测版本号
if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2)
{
std::cout << "套接字库版本号不符!" << std::endl;
WSACleanup();
return false;
}
else
{
std::cout << "套接字库版本正确!" << std::endl;
}
return true;
}
bool Server::build_connect()
{
//服务端地址客户端地址
SOCKADDR_IN server_addr;
SOCKADDR_IN accept_addr;
//填充服务端信息
server_addr.sin_family = AF_INET; // 用来定义那种地址族,AF_INET:IPV4
server_addr.sin_addr.S_un.S_addr = htonl(INADDR_ANY); // 保存ip地址,htonl将一个无符号长整型转换为TCP/IP协议网络的大端
// INADDR_ANY表示一个服务器上的所有网卡
server_addr.sin_port = htons(m_port); // 端口号
//创建套接字
m_server = socket(AF_INET, SOCK_STREAM, 0); // 使用tcp进行连接
if (::bind(m_server, (SOCKADDR*)&server_addr, sizeof(SOCKADDR)) == SOCKET_ERROR)
{
std::cout << "套接字绑定失败!" << std::endl;
WSACleanup();
return false;
}
else
{
std::cout << "套接字绑定成功!" << std::endl;
}
//设置套接字为监听状态
if (listen(m_server, SOMAXCONN) < 0)
{
std::cout << "设置监听状态失败!" << std::endl;
WSACleanup();
return false;
}
else
{
std::cout << "设置监听状态成功!" << std::endl;
}
std::cout << "服务端正在监听连接,请稍候...." << std::endl;
//接受连接请求
int len = sizeof(SOCKADDR);
m_accept = accept(m_server, (SOCKADDR*)&accept_addr, &len);
if (m_accept == SOCKET_ERROR)
{
std::cout << "连接失败!" << std::endl;
WSACleanup();
return false;
}
std::cout << "连接建立,准备接受数据" << std::endl;
return true;
}
// Streams every frame of m_cap to the connected client.
// Protocol: first a 10-byte ASCII total-frame-count header; then, per frame,
// a 10-byte ASCII payload-length header, a 10-byte ASCII chunk-size header
// (SIZE), and the JPEG payload in fixed SIZE-byte chunks.
// Returns true when the capture source is exhausted.
bool Server::send_data()
{
cv::Mat frame;
std::vector<uchar> data_encode;
std::vector<int> params; // JPEG compression parameters
params.resize(3, 0);
params[0] = cv::IMWRITE_JPEG_QUALITY; // NOTE(review): quality 30 is strongly lossy, not lossless as the original comment claimed
params[1] = 30;
char frames_cnt[10] = { 0, };
// Send the total frame count first so the receiver knows how much to expect.
_itoa_s(int(m_cap.get(cv::CAP_PROP_FRAME_COUNT)), frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
std::cout << "开始发送" << std::endl;
int j = 0;
while (m_cap.read(frame)) {
m_file_in.push_back(frame.clone()); // keep a local copy of every sent frame
imencode(".jpg", frame, data_encode, params); // JPEG-compress the frame
int len_encoder = data_encode.size();
// Header 1: payload byte length.
_itoa_s(len_encoder, frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
// Header 2: chunk size used below.
_itoa_s(SIZE, frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
// Send the payload in SIZE-byte chunks; the final chunk is padded with
// leftover bytes from the previous one — the receiver relies on the
// length header to ignore the padding.
char send_char[SIZE] = { 0, };
int index = 0;
bool flag = false;
for (int i = 0; i < len_encoder / SIZE + 1; ++i) {
for (int k = 0; k < SIZE; ++k) {
if (index >= data_encode.size()) {
flag = true;
break;
}
send_char[k] = data_encode[index++];
}
send(m_accept, send_char, SIZE, 0);
}
data_encode.clear();
++j;
std::cout << j << std::endl; // progress: frames sent so far
}
std::cout << "发送完成";
return true;
}
// Splits a C string on `delimiter` and returns the pieces in order.
// An empty input yields an empty vector; consecutive delimiters yield
// empty tokens.
std::vector<std::string> split_string(const char str[], char delimiter)
{
    std::vector<std::string> parts;
    std::istringstream input(str);
    for (std::string piece; std::getline(input, piece, delimiter);)
    {
        parts.push_back(piece);
    }
    return parts;
}
// Parses a float from `str` via std::strtof.  On a completely unparseable
// input it logs to stderr and returns 0.0f (strtof's failure value).
float str_to_float(const std::string& str)
{
    const char* begin = str.c_str();
    char* parse_end = nullptr;
    const float value = std::strtof(begin, &parse_end);
    // strtof leaves the end pointer at the start when nothing was consumed.
    if (parse_end == begin)
    {
        std::cerr << "StringToFloat failed to convert string to float." << std::endl;
    }
    return value;
}
// Parses the five face landmarks from a split header
// ("len#x1#y1#...#x5#y5") into point2f and returns how many of them equal
// (0,0).  The client sends five (0,0) points when no face was detected, so a
// high return value means "no usable landmarks".
// FIX: takes the token vector by const reference (the original copied it on
// every frame) and guards against headers with fewer than 11 tokens instead
// of indexing out of bounds.
int str_to_point2f(const std::vector<std::string>& str, std::vector<cv::Point2f>& point2f)
{
    if (str.size() < 11)
    {
        // Malformed header: report all five landmarks as invalid.
        return 5;
    }
    // Tokens 1..10 are x/y pairs; token 0 is the payload length.
    for (int i = 1; i + 1 <= 10; i += 2)
    {
        point2f.push_back(cv::Point2f(str_to_float(str[i]), str_to_float(str[i + 1])));
    }
    const cv::Point2f zero(0, 0);
    int invalid_count = 0;
    for (const cv::Point2f& p : point2f)
    {
        if (p == zero)
        {
            invalid_count++;
        }
    }
    return invalid_count;
}
bool Server::receive_data()
{
cv::Mat cv_frame;
std::vector<uchar> data_decode;
std::vector<int> params; // 压缩参数
params.resize(3, 0);
params[0] = cv::IMWRITE_JPEG_QUALITY; // 无损压缩
params[1] = image_quality;
cv::namedWindow("ServerData", cv::WINDOW_NORMAL);
char frams_cnt[100] = { 0, };
// 解析总帧数
int count = atoi(frams_cnt);
//std::cout << count << std::endl;
int idx = 0;
while (1)
{
// 解析图片字节长度
int irecv = recv(m_accept, frams_cnt, 100, 0);
char s = '#';
std::vector<std::string> strs = split_string(frams_cnt, s);
if (strs.size() != 11)
{
break;
}
int cnt = std::stoi(strs[0]);
std::vector<cv::Point2f> face_points;
int point_index = str_to_point2f(strs,face_points);
data_decode.resize(cnt);//将队列大小重置为图片字节长度
int index = 0;//表示接收数据长度计量
count = cnt;//表示的是要从接收缓冲区接收字节的数量
char *recv_char = new char[cnt];//新建一个字节数组 数组长度为图片字节长度
while (count > 0)//这里只能写count > 0 如果写count >= 0 那么while循环会陷入一个死循环
{
//在网络通信中 recv 函数一次性接收到的字节数可能小于等于设定的SIZE大小,这时可能需要多次recv
int iRet = recv(m_accept, recv_char, count, 0);
int tmp = 0;
for (int k = 0; k < iRet; k++)
{
tmp = k+1;
index++;
if (index >= cnt) { break; }
}
memcpy(&data_decode[index - tmp ], recv_char , tmp);
if (!iRet) { return -1; }
count -= iRet;//更新余下需要从接收缓冲区接收的字节数量
}
delete[]recv_char;
try
{
cv_frame = cv::imdecode(data_decode, cv::IMREAD_COLOR);
if (!cv_frame.empty() && point_index < 2)
{
for (int i = 0; i < face_points.size(); ++i)
{
cv::circle(cv_frame, face_points[i], 4, cv::Scalar(0, 0, 255), -1, cv::LINE_AA);
}
cv::namedWindow("ServerData", 0);
cv::imshow("ServerData", cv_frame);
cv::waitKey(1);
data_decode.clear();
}
else
{
data_decode.clear();
continue;
}
}
catch (const char *msg)
{
data_decode.clear();
continue;
}
}
std::cout << "接受完成";
return true;
}
// Sends a single frame to the connected client.
// Protocol: a 10-byte ASCII payload-length header, a 10-byte ASCII
// chunk-size header (SIZE), then the JPEG payload in SIZE-byte chunks.
// Always returns true (send() results are not checked).
bool Server::send_data_frame(cv::Mat input)
{
cv::Mat frame = input;
std::vector<uchar> data_encode;
std::vector<int> params; // JPEG compression parameters
params.resize(3, 0);
params[0] = cv::IMWRITE_JPEG_QUALITY; // quality 100 = best quality (still lossy JPEG)
params[1] = 100;
char frames_cnt[10] = { 0, };
std::cout << "开始发送" << std::endl;
m_file_in.push_back(frame.clone()); // keep a local copy of the sent frame
imencode(".jpg", frame, data_encode, params); // JPEG-compress the frame
int len_encoder = data_encode.size();
// Header 1: payload byte length as ASCII.
_itoa_s(len_encoder, frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
// Header 2: chunk size used below.
_itoa_s(SIZE, frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
// Send the payload in fixed SIZE-byte chunks; the receiver uses the length
// header to ignore any padding bytes in the final chunk.
char send_char[SIZE] = { 0, };
int index = 0;
bool flag = false;
for (int i = 0; i < len_encoder / SIZE + 1; ++i) {
for (int k = 0; k < SIZE; ++k) {
if (index >= data_encode.size()) {
flag = true;
break;
}
send_char[k] = data_encode[index++];
}
send(m_accept, send_char, SIZE, 0);
}
data_encode.clear();
std::cout << "发送完成";
return true;
}
// Receives one frame sent by send_data_frame / a compatible sender:
// a 10-byte ASCII payload-length header, a 10-byte ASCII chunk-size header
// (ignored here), then the JPEG payload, which is decoded into `output`.
// Returns false on any receive failure or a non-positive length header.
bool Server::receive_data_frame(cv::Mat& output)
{
    std::vector<uchar> data_decode;
    char frams_cnt[10] = { 0, };
    // Header 1: payload length.
    if (recv(m_accept, frams_cnt, 10, 0) <= 0)
    {
        return false;
    }
    int cnt = atoi(frams_cnt);
    std::cout << "frams_cnt " << frams_cnt <<" "<< cnt<< std::endl;
    // Header 2: the sender's chunk size (read to stay in sync, not used).
    if (recv(m_accept, frams_cnt, 10, 0) <= 0)
    {
        return false;
    }
    int size = atoi(frams_cnt);
    std::cout << "size " << size << std::endl;
    if (cnt <= 0)
    {
        return false; // malformed length header
    }
    data_decode.resize(cnt);
    std::cout << " cnt= " << cnt << std::endl;
    // BUG FIX: a single recv may return fewer than cnt bytes on a TCP
    // stream; loop until the whole JPEG payload has been received.  Also
    // receives directly into data_decode instead of a raw new[] buffer.
    int received = 0;
    while (received < cnt)
    {
        int iRecv = recv(m_accept, reinterpret_cast<char*>(&data_decode[received]), cnt - received, 0);
        if (iRecv <= 0)
        {
            return false;
        }
        received += iRecv;
    }
    std::cout << "data_decode" << data_decode.size() << std::endl;
    output = cv::imdecode(data_decode, cv::IMREAD_COLOR);
    std::cout << " output.size " << output.size().width << " "<< output.size().height << std::endl;
    data_decode.clear();
    std::cout << "接受完成";
    return true;
}
// Releases the capture device, closes both sockets, and unloads the
// Winsock DLL.  Always returns true.
bool Server::free_connect()
{
m_cap.release();
// Close the listening socket and the accepted client socket.
closesocket(m_server);
closesocket(m_accept);
// Release the Winsock DLL resources (Windows-specific).
WSACleanup();
return true;
}
1.2 类调用
在main函数进行调用,这里使用8080这个端口,端口可以按自己的要求更改,只要不占用就行:
#include "server.h"
int recv_online_video(int port = 8080)
{
Server ser;
ser.initialization(port);
ser.build_connect();
ser.receive_data();
ser.free_connect();
return 0;
}
// Server entry point: receive and display the client's video stream on
// port 8080.
int main()
{
recv_online_video(8080);
return 0;
}
2. 客户端
2.1 客户端类实现
客户端实现了对视频里面的图像进行人脸检测,人脸检测使用的yolov5s face,并把检测到人脸的关键点打包发送给服务器,然后实现数据序列化发送给服务器:
#include "CameraClient.h"
// Builds the face detector: CUDA on GPU 0 with a 4 GB memory limit,
// 4 threads, then loads the ONNX model from detect_model_path.
// NOTE(review): yolov5_face is allocated with raw `new`; prefer
// std::unique_ptr<YOLOV5Face> so ownership and cleanup are explicit.
CameraClient::CameraClient(std::string detect_model_path)
{
yolov5_face = new YOLOV5Face();
yolov5_face->set_gpu(0, 4);
yolov5_face->set_num_thread(4);
yolov5_face->read_model(detect_model_path);
}
// Releases the detector allocated in the constructor.
// NOTE(review): with a raw owning pointer the class is not copy-safe
// (Rule of Five); consider std::unique_ptr<YOLOV5Face> instead.
CameraClient::~CameraClient()
{
    // BUG FIX: the constructor allocates yolov5_face with `new`; the
    // original empty destructor leaked it.
    delete yolov5_face;
}
// Formats `value` in fixed notation with exactly `precision` digits after
// the decimal point.
std::string float_to_string(float value, int precision)
{
    std::ostringstream formatted;
    formatted << std::fixed << std::setprecision(precision) << value;
    return formatted.str();
}
// Serializes one landmark as "#x#y" with the requested number of decimal
// places.
// BUG FIX: the original ignored the `precision` parameter and hard-coded 3.
std::string point_to_str(cv::Point2f point, int precision)
{
    return ("#" + float_to_string(point.x, precision) + "#" + float_to_string(point.y, precision));
}
// Serializes all landmarks as a run of "#x#y" pairs,
// e.g. "#12.000#34.000#...".
std::string points_to_str(std::vector<cv::Point2f> points, int precision)
{
    std::string serialized;
    for (const cv::Point2f& pt : points)
    {
        serialized += point_to_str(pt, precision);
    }
    return serialized;
}
void CameraClient::get_camera_frame(std::string camera_path,int width,int height,int fps,bool is_show)
{
bool found = false;
cv::Size size(width, height);
for (const cv::Size& size : this->camera_size)
{
if (size == size)
{
found = true;
break;
}
}
if (!found)
{
std::cerr << " Camera resolution setting error." << std::endl;
return;
}
int mask = 0;
if (size == cv::Size(540, 960))
{
size = cv::Size(720, 1280);
mask = 1;
}
/*int index = camera_index < 0 ? 0 : camera_index;
cv::VideoCapture cap(index);*/
cv::VideoCapture cap(camera_path);
if (!cap.isOpened())
{
std::cerr << " Camera is no opened." << std::endl;
return;
}
cap.set(cv::CAP_PROP_FRAME_WIDTH, size.width);//宽度
cap.set(cv::CAP_PROP_FRAME_HEIGHT, size.height);
cap.set(cv::CAP_PROP_FPS, fps);
//保存抽帧的图像矩阵
cv::Mat cv_frame;
while (true)
{
cap >> cv_frame;
if (cv_frame.empty())
{
break;
}
if (mask == 1)
{
cv::resize(cv_frame, cv_frame, cv::Size(540, 960));
}
mutex_lock.lock();
//保证队列里只有五帧,以保证内存的开销
if (mat_info_input.size() > max_frame)
{
mat_info_input.pop();
}
else
{
std::vector<FaceInfo> src_face_infos;
MatInfo mat_info;
if (yolov5_face->detect(cv_frame, src_face_infos))
{
mat_info.points = src_face_infos[0].points;
mat_info.cv_mat = cv_frame.clone();
}
else
{
std::vector<cv::Point2f> p(5, cv::Point2f(0.0f, 0.0f));
mat_info.cv_mat = cv_frame.clone();
mat_info.points = p;
}
mat_info_input.push(mat_info);
}
if (is_show)
{
cv::namedWindow("cam src", 0);
cv::imshow("cam src", cv_frame);
if (cv::waitKey(1) == 27)
{
break;
}
}
mutex_lock.unlock();
}
}
// Consumer thread: pops detected frames from the shared queue and streams
// them to the server over TCP.
// Per-frame protocol: a 100-byte header "<jpegLen>#x1#y1#...#x5#y5" (JPEG
// length plus five serialized landmarks), then jpegLen bytes of JPEG payload.
// Returns -1 on any Winsock setup/connect failure; the send loop never exits
// on its own, so the function otherwise runs until the process stops.
int CameraClient::send_camera_frame(std::string server_ip, int port)
{
// Request Winsock version 2.2.
WORD w_req = MAKEWORD(2, 2);
// WSADATA receives the library's initialization details.
WSADATA wsadata;
int err;
// Start up the Winsock library.
err = WSAStartup(w_req, &wsadata);
if (err != 0)
{
// Initialization failed.
std::cout << "初始化套接字库失败!" << std::endl;
return -1;
}
else
{
// Initialization succeeded.
std::cout << "初始化套接字库成功!" << std::endl;
}
// Verify the negotiated Winsock version.
if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2)
{
std::cout << "套接字库版本号不符!" << std::endl;
WSACleanup(); // unload Winsock
return -1;
}
else
{
std::cout << "套接字库版本正确!" << std::endl;
}
// Server address structure (IPv4).
SOCKADDR_IN server_addr;
server_addr.sin_family = AF_INET;
// Convert the dotted-decimal IP string to its binary form.
struct in_addr addr;
int result = inet_pton(AF_INET, server_ip.c_str(), &addr);
if (result <= 0)
{
if (result == 0)
{
std::cerr << "inet_pton failed: address is invalid." << std::endl;
}
else
{
std::cerr << "inet_pton failed: error occurred." << std::endl;
}
return -1;
}
server_addr.sin_addr = addr;
// Port number in network byte order.
server_addr.sin_port = htons(port);
// Create the TCP socket and connect to the server.
SOCKET m_server = socket(AF_INET, SOCK_STREAM, 0);
if (connect(m_server, (SOCKADDR*)&server_addr, sizeof(SOCKADDR)) == SOCKET_ERROR)
{
std::cout << "服务器连接失败!" << std::endl;
WSACleanup();
return -1;
}
else
{
std::cout << "服务器连接成功!" << std::endl;
}
// Current frame and its detected landmarks.
cv::Mat cv_frame;
std::vector<cv::Point2f> face_points;
// JPEG-encoded bytes of the current frame.
std::vector<uchar> data_encode;
// JPEG encoding parameters.
std::vector<int> params;
params.resize(4, 0);
// Compression type: JPEG quality.
params[0] = cv::IMWRITE_JPEG_QUALITY;
// Quality level taken from the image_quality member.
params[1] = image_quality;
// Header buffer: "<jpegLen>" followed by the serialized landmarks.
char frames_cnt[100] = { 0, };
std::cout << "开始发送" << std::endl;
int j = 0;
while (true)
{
// Lock the queue for safe access shared with the capture thread.
mutex_lock.lock();
if (mat_info_input.empty()) // nothing queued yet: wait and retry
{
mutex_lock.unlock(); // release the lock
Sleep(3); // brief sleep
continue;
}
else
{
cv_frame = mat_info_input.front().cv_mat; // take the oldest frame
face_points = mat_info_input.front().points;
mat_info_input.pop();
mutex_lock.unlock(); // release the lock
Sleep(1); // brief sleep
}
// JPEG-compress the frame.
imencode(".jpg", cv_frame, data_encode, params);
std::string str = points_to_str(face_points, 3);
// Payload size after compression.
int len_encoder = data_encode.size();
char len[10] = {0,};
// Render the length as ASCII digits.
_itoa_s(len_encoder, len, 10);
// Header = length string ...
strcpy(frames_cnt, len);
// ... followed by the "#x#y" landmark string.
// NOTE(review): strcat does not bounds-check; the length digits plus five
// serialized landmarks could approach the 100-byte buffer — verify the
// combined header can never overflow it.
strcat(frames_cnt, str.c_str());
std::cout << str.size() << " len_encoder = " << frames_cnt << std::endl;
send(m_server, frames_cnt, 100, 0);
// Send the JPEG payload in a single call.
char* send_b = new char[data_encode.size()];
memcpy(send_b, &data_encode[0], data_encode.size());
int iSend = send(m_server, send_b, data_encode.size(), 0);
delete[]send_b;
// Reset for the next frame.
data_encode.clear();
++j;
}
// NOTE(review): unreachable — the loop above never breaks, so the socket
// is never closed here and no int value is ever returned from this path.
std::cout << "发送完成";
closesocket(m_server); // close the socket
WSACleanup(); // unload Winsock
}
2.2 类调用
在main里面使用多线程进行调用:
#include "CameraClient.h"
#include <thread>
// Client entry point: one thread captures frames and runs face detection,
// a second thread streams the queued frames (with landmarks) to the server.
int main()
{
std::string video_path = "A.mp4"; // camera/video source passed to get_camera_frame
std::string server_ip = "192.168.xx.xxx"; // replace with the server's LAN IP
int port = 8080; // must match the server's listen port
std::string detect_model_path = "models/yolov5face-s-640x640.onnx";
CameraClient video_client(detect_model_path);
int w = 720;
int h = 1280;
// Producer: grab frames, detect faces, push into the shared queue.
std::thread Get(&CameraClient::get_camera_frame, &video_client, video_path, w,h, 25, true);
// Consumer: pop frames and send them to the server.
std::thread Send(&CameraClient::send_camera_frame, &video_client, server_ip, port);
Get.join();
Send.join();
return 0;
}
2.3 服务器IP地址
在Windows系统中获取IP地址可以通过多种方法实现,包括使用命令行工具、编程接口或图形用户界面。以下是一些常用的方法:
使用命令行工具
在命令提示符(cmd)或PowerShell中,可以使用ipconfig
命令来查看IP地址:
ipconfig
这将列出所有网络适配器的详细信息,包括IPv4和IPv6地址。如果你只想获取特定的适配器的IP地址,可以使用:
ipconfig | findstr "适配器名称"
替换“适配器名称”为你想要查询的网络适配器的名称。
使用Windows API
如果你在编程中需要获取Windows系统的IP地址,可以使用Windows API函数。例如,使用以下C++代码:
#include <winsock2.h>
#include <iphlpapi.h>
#include <iostream>
#pragma comment(lib, "iphlpapi.lib")
#pragma comment(lib, "ws2_32.lib")
// Example: enumerate local network adapters and print their IPv4 addresses
// using GetAdaptersInfo (Windows IP Helper API).
int main() {
    WSADATA d;
    if (WSAStartup(MAKEWORD(2, 2), &d) != 0) {
        return -1;
    }
    ULONG outBufLen = sizeof(IP_ADAPTER_INFO);
    PIP_ADAPTER_INFO pAdapterInfo = (IP_ADAPTER_INFO *)malloc(outBufLen);
    if (pAdapterInfo == NULL) {
        // BUG FIX: the original never checked malloc and would dereference
        // NULL on allocation failure.
        WSACleanup();
        return -1;
    }
    DWORD dwRetVal = GetAdaptersInfo(pAdapterInfo, &outBufLen);
    if (dwRetVal == ERROR_BUFFER_OVERFLOW) {
        // The initial buffer was too small; outBufLen now holds the
        // required size, so reallocate and retry once.
        free(pAdapterInfo);
        pAdapterInfo = (IP_ADAPTER_INFO *)malloc(outBufLen);
        if (pAdapterInfo == NULL) {
            WSACleanup();
            return -1;
        }
        dwRetVal = GetAdaptersInfo(pAdapterInfo, &outBufLen);
    }
    if (dwRetVal == NO_ERROR) {
        for (PIP_ADAPTER_INFO pAdapter = pAdapterInfo; pAdapter != NULL; pAdapter = pAdapter->Next) {
            std::cout << "适配器名称: " << pAdapter->AdapterName << std::endl;
            std::cout << "IPv4 地址: " << pAdapter->IpAddressList.IpAddress.String << std::endl;
            // Additional addresses/adapters can be walked here.
        }
    }
    free(pAdapterInfo);
    WSACleanup();
    return 0;
}
3.人脸检测
YOLOv5 是一个流行的实时对象检测系统,它使用单个卷积神经网络同时预测对象的类别和位置。虽然YOLOv5主要用于对象检测任务,但也可以被训练或微调以检测特定类型的物体,比如人脸。
#include "YOLOV5Face.h"
// Loads the YOLOv5-face ONNX model and caches input/output names and shapes.
// @param onnx_path    path to the .onnx file (the string->wstring conversion
//                     widens byte-by-byte, so ASCII paths only)
// @param _num_threads intra-op thread count for onnxruntime
YOLOV5Face::YOLOV5Face(std::string& onnx_path, unsigned int _num_threads)
{
    std::wstring widestr = std::wstring(onnx_path.begin(), onnx_path.end());
    Ort::SessionOptions session_options;
    // BUG FIX: honor the requested thread count (the original hard-coded 4
    // and ignored _num_threads).
    session_options.SetIntraOpNumThreads(_num_threads);
    OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    // BUG FIX: the original created a local Ort::Env here; it was destroyed
    // when the constructor returned while the session still referenced it.
    // Use the class member `env` (the one read_model() also uses) instead.
    ort_session = new Ort::Session(env, widestr.c_str(), session_options);
    Ort::AllocatorWithDefaultOptions allocator;
    input_name = ort_session->GetInputName(0, allocator);
    input_node_names.resize(1);
    input_node_names[0] = input_name;
    // Flat input tensor size = product of the input dims.
    Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    input_tensor_size = 1;
    input_node_dims = tensor_info.GetShape();
    for (unsigned int i = 0; i < input_node_dims.size(); ++i)
        input_tensor_size *= input_node_dims.at(i);
    input_values_handler.resize(input_tensor_size);
    // Cache output names & dims.
    num_outputs = ort_session->GetOutputCount();
    output_node_names.resize(num_outputs);
    for (unsigned int i = 0; i < num_outputs; ++i)
    {
        output_node_names[i] = ort_session->GetOutputName(i, allocator);
        Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
        auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
        auto output_dims = output_tensor_info.GetShape();
        output_node_dims.push_back(output_dims);
    }
}
// Default constructor: members keep their in-class defaults.  Callers are
// expected to call read_model() (and optionally set_gpu()/set_num_thread())
// before detect() — see CameraClient's constructor for the usage pattern.
YOLOV5Face::YOLOV5Face()
{
}
// Enables the CUDA execution provider on device `gpu_index` with a GPU
// memory limit of `gpu_ram` GiB (-1 means unlimited).  Does nothing when
// gpu_index is negative or this onnxruntime build has no CUDA provider.
void YOLOV5Face::set_gpu(int gpu_index, int gpu_ram)
{
    std::vector<std::string> available_providers = Ort::GetAvailableProviders();
    auto cuda_available = std::find(available_providers.begin(),
        available_providers.end(), "CUDAExecutionProvider");
    if (gpu_index >= 0 && (cuda_available != available_providers.end()))
    {
        OrtCUDAProviderOptions cuda_options;
        cuda_options.device_id = gpu_index;
        cuda_options.arena_extend_strategy = 0;
        if (gpu_ram == -1)
        {
            // No limit.
            cuda_options.gpu_mem_limit = ~0ULL;
        }
        else
        {
            // BUG FIX: `gpu_ram * 1024 * 1024 * 1024` was evaluated in
            // 32-bit int and overflowed for gpu_ram >= 2; widen first.
            cuda_options.gpu_mem_limit = static_cast<size_t>(gpu_ram) * 1024ULL * 1024ULL * 1024ULL;
        }
        cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearch::OrtCudnnConvAlgoSearchExhaustive;
        cuda_options.do_copy_in_default_stream = 1;
        session_options.AppendExecutionProvider_CUDA(cuda_options);
    }
}
// Applies the same thread budget to both inter- and intra-op parallelism
// and enables full graph optimization on the session options.
void YOLOV5Face::set_num_thread(int num_thread)
{
    _num_thread = num_thread;
    session_options.SetIntraOpNumThreads(num_thread);
    session_options.SetInterOpNumThreads(num_thread);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
}
// Loads the ONNX model at model_path into a new session and caches input /
// output names and shapes for inference.
// @return true on success, false if onnxruntime throws while loading.
// NOTE(review): the string->wstring conversion widens byte-by-byte and is
// only correct for ASCII paths.
bool YOLOV5Face::read_model(const std::string model_path)
{
    try
    {
        std::wstring widestr = std::wstring(model_path.begin(), model_path.end());
        ort_session = new Ort::Session(env, widestr.c_str(), session_options);
        Ort::AllocatorWithDefaultOptions allocator;
        input_name = ort_session->GetInputName(0, allocator);
        input_node_names.resize(1);
        input_node_names[0] = input_name;
        // Flat input tensor size = product of the input dims.
        Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        input_tensor_size = 1;
        input_node_dims = tensor_info.GetShape();
        for (unsigned int i = 0; i < input_node_dims.size(); ++i)
            input_tensor_size *= input_node_dims.at(i);
        input_values_handler.resize(input_tensor_size);
        // Cache output names & dims.
        num_outputs = ort_session->GetOutputCount();
        output_node_names.resize(num_outputs);
        for (unsigned int i = 0; i < num_outputs; ++i)
        {
            output_node_names[i] = ort_session->GetOutputName(i, allocator);
            Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
            auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
            auto output_dims = output_tensor_info.GetShape();
            output_node_dims.push_back(output_dims);
        }
        return true;
    }
    catch (const std::exception&)
    {
        return false;
    }
    // FIX: removed the unreachable trailing `return true;` — both paths
    // above already return.
}
// Returns (mat - mean) * scale as a CV_32FC3 matrix; the input is converted
// to float first when needed (otherwise it is referenced, not copied).
cv::Mat normalize(const cv::Mat& mat, float mean, float scale)
{
    cv::Mat as_float;
    if (mat.type() == CV_32FC3)
    {
        as_float = mat; // already float: shallow reference
    }
    else
    {
        mat.convertTo(as_float, CV_32FC3);
    }
    return (as_float - mean) * scale;
}
// Out-of-place wrapper: writes (inmat - mean) * scale into outmat.
void normalize(const cv::Mat& inmat, cv::Mat& outmat,
float mean, float scale)
{
outmat = normalize(inmat, mean, scale);
}
// In-place variant: converts mat_inplace to CV_32FC3 first, then applies
// (x - mean) * scale.
void normalize_inplace(cv::Mat& mat_inplace, float mean, float scale)
{
if (mat_inplace.type() != CV_32FC3) mat_inplace.convertTo(mat_inplace, CV_32FC3);
normalize(mat_inplace, mat_inplace, mean, scale);
}
// Copies `mat` into an onnxruntime float tensor shaped tensor_dims (batch
// must be 1).  data_format selects CHW (channels split and packed planar)
// or HWC (raw interleaved memcpy); pixels are resized to the target HxW when
// needed and staged in tensor_value_handler, which must outlive the returned
// Ort::Value (CreateTensor does not copy the buffer).
// Throws std::runtime_error on dims/batch/channel mismatch.
// FIX: dropped the dynamic exception specification `throw(std::runtime_error)`
// — deprecated in C++11 and removed from the language in C++17.
Ort::Value create_tensor(const cv::Mat& mat,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler,
    unsigned int data_format)
{
    const unsigned int rows = mat.rows;
    const unsigned int cols = mat.cols;
    const unsigned int channels = mat.channels();
    // Ensure float data; reference (no copy) when already float.
    cv::Mat mat_ref;
    if (mat.type() != CV_32FC(channels)) mat.convertTo(mat_ref, CV_32FC(channels));
    else mat_ref = mat; // reference only. zero-time cost. support 1/2/3/... channels
    if (tensor_dims.size() != 4) throw std::runtime_error("dims mismatch.");
    if (tensor_dims.at(0) != 1) throw std::runtime_error("batch != 1");
    // CXHXW layout.
    if (data_format == CHW)
    {
        const unsigned int target_height = tensor_dims.at(2);
        const unsigned int target_width = tensor_dims.at(3);
        const unsigned int target_channel = tensor_dims.at(1);
        const unsigned int target_tensor_size = target_channel * target_height * target_width;
        if (target_channel != channels) throw std::runtime_error("channel mismatch.");
        tensor_value_handler.resize(target_tensor_size);
        cv::Mat resize_mat_ref;
        if (target_height != rows || target_width != cols)
            cv::resize(mat_ref, resize_mat_ref, cv::Size(target_width, target_height));
        else resize_mat_ref = mat_ref; // reference only. zero-time cost.
        // Split interleaved HWC pixels into per-channel planes, then pack
        // the planes back to back (planar CHW).
        std::vector<cv::Mat> mat_channels;
        cv::split(resize_mat_ref, mat_channels);
        for (unsigned int i = 0; i < channels; ++i)
            std::memcpy(tensor_value_handler.data() + i * (target_height * target_width),
                mat_channels.at(i).data, target_height * target_width * sizeof(float));
        return Ort::Value::CreateTensor<float>(memory_info_handler, tensor_value_handler.data(),
            target_tensor_size, tensor_dims.data(),
            tensor_dims.size());
    }
    // HXWXC layout: the cv::Mat memory is already interleaved, one memcpy.
    const unsigned int target_height = tensor_dims.at(1);
    const unsigned int target_width = tensor_dims.at(2);
    const unsigned int target_channel = tensor_dims.at(3);
    const unsigned int target_tensor_size = target_channel * target_height * target_width;
    if (target_channel != channels) throw std::runtime_error("channel mismatch!");
    tensor_value_handler.resize(target_tensor_size);
    cv::Mat resize_mat_ref;
    if (target_height != rows || target_width != cols)
        cv::resize(mat_ref, resize_mat_ref, cv::Size(target_width, target_height));
    else resize_mat_ref = mat_ref; // reference only. zero-time cost.
    std::memcpy(tensor_value_handler.data(), resize_mat_ref.data, target_tensor_size * sizeof(float));
    return Ort::Value::CreateTensor<float>(memory_info_handler, tensor_value_handler.data(),
        target_tensor_size, tensor_dims.data(),
        tensor_dims.size());
}
// Converts a letterboxed BGR frame into the model's input tensor:
// BGR -> RGB, normalize with mean_val/scale_val, then pack as CHW float32.
Ort::Value YOLOV5Face::transform(const cv::Mat& mat_rs)
{
cv::Mat canvas;
cv::cvtColor(mat_rs, canvas, cv::COLOR_BGR2RGB);
normalize_inplace(canvas, mean_val, scale_val); // float32
return create_tensor(
canvas, input_node_dims, memory_info_handler,
input_values_handler, CHW);
}
// Letterbox resize: scales `mat` to fit target WxH while keeping its aspect
// ratio and pads the borders with black.  The scale ratio and the padding
// offsets are recorded in scale_params so detections can later be mapped
// back to the original image coordinates.
void YOLOV5Face::resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
    int target_height, int target_width,ScaleParams& scale_params)
{
    if (mat.empty()) return;
    const int src_h = static_cast<int>(mat.rows);
    const int src_w = static_cast<int>(mat.cols);
    // Start from a black canvas of the target size.
    mat_rs = cv::Mat(target_height, target_width, CV_8UC3,
        cv::Scalar(0, 0, 0));
    // Uniform scale factor: the tighter of the two per-axis ratios.
    const float ratio = std::min((float)target_width / (float)src_w,
        (float)target_height / (float)src_h);
    // Scaled (un-padded) size, floored.
    const int scaled_w = static_cast<int>((float)src_w * ratio);
    const int scaled_h = static_cast<int>((float)src_h * ratio);
    // Center the scaled image: split the padding evenly on both sides.
    const int offset_x = (target_width - scaled_w) / 2;
    const int offset_y = (target_height - scaled_h) / 2;
    cv::Mat scaled;
    cv::resize(mat, scaled, cv::Size(scaled_w, scaled_h));
    scaled.copyTo(mat_rs(cv::Rect(offset_x, offset_y, scaled_w, scaled_h)));
    // Record how to undo this transform.
    scale_params.ratio = ratio;
    scale_params.dw = offset_x;
    scale_params.dh = offset_y;
    scale_params.flag = true;
}
// Decodes the raw YOLOv5-face output tensor (1, n, 16) into boxes and five
// landmarks in ORIGINAL-image coordinates.
// Row layout: [cx, cy, w, h, obj_conf, x1,y1..x5,y5 (10 landmark coords),
// cls_conf].  scale_params holds the letterbox ratio/offsets to undo
// resize_unscale(); results are clamped to the image bounds and collection
// stops after max_nms candidates.
void YOLOV5Face::generate_bboxes_kps(const ScaleParams& scale_params,
std::vector<lite::types::BoxfWithLandmarks>& bbox_kps_collection,
std::vector<Ort::Value>& output_tensors, float score_threshold,
float img_height, float img_width)
{
Ort::Value& output = output_tensors.at(0); // (1,n,16=4+1+10+1)
auto output_dims = output_node_dims.at(0); // (1,n,16)
const unsigned int num_anchors = output_dims.at(1); // n = ?
const float* output_ptr = output.GetTensorMutableData<float>();
// Letterbox parameters used to map back to the original image.
float r_ = scale_params.ratio;
int dw_ = scale_params.dw;
int dh_ = scale_params.dh;
bbox_kps_collection.clear();
unsigned int count = 0;
for (unsigned int i = 0; i < num_anchors; ++i)
{
const float* row_ptr = output_ptr + i * 16;
float obj_conf = row_ptr[4];
if (obj_conf < score_threshold) continue; // filter on objectness first.
float cls_conf = row_ptr[15];
if (cls_conf < score_threshold) continue; // then on the face-class score.
// Bounding box: center/size -> corners, then undo padding and scaling.
const float* offsets = row_ptr;
float cx = offsets[0];
float cy = offsets[1];
float w = offsets[2];
float h = offsets[3];
lite::types::BoxfWithLandmarks box_kps;
float x1 = ((cx - w / 2.f) - (float)dw_) / r_;
float y1 = ((cy - h / 2.f) - (float)dh_) / r_;
float x2 = ((cx + w / 2.f) - (float)dw_) / r_;
float y2 = ((cy + h / 2.f) - (float)dh_) / r_;
// Clamp to the original image bounds.
box_kps.box.x1 = std::max(0.f, x1);
box_kps.box.y1 = std::max(0.f, y1);
box_kps.box.x2 = std::min(img_width - 1.f, x2);
box_kps.box.y2 = std::min(img_height - 1.f, y2);
box_kps.box.score = cls_conf;
box_kps.box.label = 1;
box_kps.box.label_text = "face";
box_kps.box.flag = true;
// Five landmarks: same un-letterboxing, clamped to image bounds.
const float* kps_offsets = row_ptr + 5;
for (unsigned int j = 0; j < 10; j += 2)
{
cv::Point2f kps;
float kps_x = (kps_offsets[j] - (float)dw_) / r_;
float kps_y = (kps_offsets[j + 1] - (float)dh_) / r_;
kps.x = std::min(std::max(0.f, kps_x), img_width - 1.f);
kps.y = std::min(std::max(0.f, kps_y), img_height - 1.f);
box_kps.landmarks.points.push_back(kps);
}
box_kps.landmarks.flag = true;
box_kps.flag = true;
bbox_kps_collection.push_back(box_kps);
count += 1; // limit the number of candidates handed to NMS.
if (count > max_nms)
break;
}
}
// Greedy hard-NMS: sorts candidates by descending score, keeps each
// highest-scoring box and suppresses every remaining box whose IoU with it
// exceeds iou_threshold.  At most `topk` boxes are appended to `output`.
// Note: `input` is re-sorted in place.
void YOLOV5Face::nms_bboxes_kps(std::vector<lite::types::BoxfWithLandmarks>& input,
    std::vector<lite::types::BoxfWithLandmarks>& output,
    float iou_threshold, unsigned int topk)
{
    if (input.empty()) return;
    std::sort(
        input.begin(), input.end(),
        [](const lite::types::BoxfWithLandmarks& lhs, const lite::types::BoxfWithLandmarks& rhs)
        { return lhs.box.score > rhs.box.score; }
    );
    const unsigned int total = input.size();
    std::vector<int> suppressed(total, 0);
    unsigned int kept = 0;
    for (unsigned int i = 0; i < total; ++i)
    {
        if (suppressed[i]) continue;
        suppressed[i] = 1;
        // Suppress every later (lower-scoring) box that overlaps too much.
        for (unsigned int j = i + 1; j < total; ++j)
        {
            if (suppressed[j]) continue;
            const float overlap = static_cast<float>(input[i].box.iou_of(input[j].box));
            if (overlap > iou_threshold)
            {
                suppressed[j] = 1;
            }
        }
        output.push_back(input[i]);
        kept += 1;
        if (kept >= topk)
            break;
    }
}
// Full detection pipeline on one BGR frame: letterbox resize -> input
// tensor -> ONNX inference -> decode boxes/landmarks -> hard NMS.
// Results (in original-image coordinates) are written to detected_boxes_kps.
void YOLOV5Face::detect(const cv::Mat& mat, std::vector<lite::types::BoxfWithLandmarks>& detected_boxes_kps,
float score_threshold, float iou_threshold, unsigned int topk)
{
if (mat.empty()) return;
auto img_height = static_cast<float>(mat.rows);
auto img_width = static_cast<float>(mat.cols);
// Model input size from the cached input shape.
const int target_height = (int)input_node_dims.at(2);
const int target_width = (int)input_node_dims.at(3);
// resize & unscale (letterbox)
cv::Mat mat_rs;
ScaleParams scale_params;
this->resize_unscale(mat, mat_rs, target_height, target_width, scale_params);
// 1. make input tensor
Ort::Value input_tensor = this->transform(mat_rs);
// 2. inference scores & boxes.
auto output_tensors = ort_session->Run(
Ort::RunOptions{ nullptr }, input_node_names.data(),
&input_tensor, 1, output_node_names.data(), num_outputs
);
// 3. rescale & exclude.
std::vector<lite::types::BoxfWithLandmarks> bbox_kps_collection;
this->generate_bboxes_kps(scale_params, bbox_kps_collection, output_tensors,
score_threshold, img_height, img_width);
// 4. hard nms with topk.
this->nms_bboxes_kps(bbox_kps_collection, detected_boxes_kps, iou_threshold, topk);
}
// Convenience overload used by the streaming client: runs the full pipeline
// and reduces the result to at most one FaceInfo.
// Returns true and appends to face_infos when:
//  - exactly one face was detected, or
//  - several faces were detected and position == 1, in which case the
//    largest box with score >= 0.7 is chosen (index stays 0 when none pass
//    — NOTE(review): confirm that fallback is intended).
// Returns false otherwise (no detection, or multiple faces with
// position != 1).
bool YOLOV5Face::detect(const cv::Mat& cv_src, std::vector<FaceInfo>& face_infos, int position,
float score_threshold, float iou_threshold, unsigned int topk)
{
auto img_height = static_cast<float>(cv_src.rows);
auto img_width = static_cast<float>(cv_src.cols);
// Model input size from the cached input shape.
const int target_height = (int)input_node_dims.at(2);
const int target_width = (int)input_node_dims.at(3);
// resize & unscale
cv::Mat mat_rs;
ScaleParams scale_params;
this->resize_unscale(cv_src, mat_rs, target_height, target_width, scale_params);
// 1. make input tensor
Ort::Value input_tensor = this->transform(mat_rs);
// 2. inference scores & boxes.
auto output_tensors = ort_session->Run(
Ort::RunOptions{ nullptr }, input_node_names.data(),
&input_tensor, 1, output_node_names.data(), num_outputs
);
// 3. rescale & exclude.
std::vector<lite::types::BoxfWithLandmarks> bbox_kps_collection;
this->generate_bboxes_kps(scale_params, bbox_kps_collection, output_tensors,
score_threshold, img_height, img_width);
std::vector<lite::types::BoxfWithLandmarks> detected_boxes_kps;
// 4. hard nms with topk.
this->nms_bboxes_kps(bbox_kps_collection, detected_boxes_kps, iou_threshold, topk);
if (detected_boxes_kps.size() == 1)
{
FaceInfo info;
info.bbox.xmin = detected_boxes_kps[0].box.rect().x;
info.bbox.ymin = detected_boxes_kps[0].box.rect().y;
info.bbox.xmax = detected_boxes_kps[0].box.rect().br().x;
info.bbox.ymax = detected_boxes_kps[0].box.rect().br().y;
info.points = detected_boxes_kps[0].landmarks.points;
face_infos.push_back(info);
return true;
}
else if (detected_boxes_kps.size() > 1 && position == 1)
{
// Pick the largest sufficiently-confident face.
int arec = 0;
int index = 0;
for (int i = 0; i < detected_boxes_kps.size(); ++i)
{
if (detected_boxes_kps[i].box.score >= 0.7)
{
if (arec <= detected_boxes_kps[i].box.rect().area())
{
arec = detected_boxes_kps[i].box.rect().area();
index = i;
}
}
}
FaceInfo info;
info.bbox.xmin = detected_boxes_kps[index].box.rect().x;
info.bbox.ymin = detected_boxes_kps[index].box.rect().y;
info.bbox.xmax = detected_boxes_kps[index].box.rect().br().x;
info.bbox.ymax = detected_boxes_kps[index].box.rect().br().y;
info.points = detected_boxes_kps[index].landmarks.points;
face_infos.push_back(info);
return true;
}
return false;
}
4. 实现效果
整个源码下载地址:https://download.csdn.net/download/matt45m/89595413