前言
在Windows下使用TCP协议,基于OpenCV C++与Yolov5实现了一个完整的实时推流的深度学习图像处理客户端与服务器端。为了达到实时传输的效果,客户端使用了多线程的方式实现;深度学习模型是基于onnxruntime的GPU推理,实现效果如下:
基于OpenCV C++的网络实时视频流传输
1. 服务器端
1. 1服务器类的实现
首先定义一个服务器类,类实现,接收发送过来的图像并进行解码,并接收客户端检测到的人脸关键点,并画到最终要显示的图像上:
#include "Server.h"
#define SIZE 100
// Default constructor: members are configured later via initialization().
Server::Server()
{
}
// Destructor: sockets and the Winsock library are expected to be released
// explicitly via free_connect(), not here.
Server::~Server()
{
}
// Initializes the server for streaming: stores the listen port and the video
// source, then starts up the Winsock library (Windows-specific) and verifies
// that version 2.2 is available.
// Returns true on success, false if Winsock could not be started or reports
// the wrong version.
bool Server::initialization(const int& port, const cv::VideoCapture& cap)
{
    m_port = port;
    m_cap = cap;
    // Request Winsock version 2.2.
    WORD w_req = MAKEWORD(2, 2);
    WSADATA wsadata;
    int err = WSAStartup(w_req, &wsadata);
    if (err != 0) {
        std::cout << "初始化套接字库失败!" << std::endl;
        // BUG FIX: the original did `return -1;` here, which converts to true
        // in a bool function and made callers believe startup succeeded.
        return false;
    }
    else {
        std::cout << "初始化套接字库成功!" << std::endl;
    }
    // Verify the negotiated Winsock version.
    if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2) {
        std::cout << "套接字库版本号不符!" << std::endl;
        WSACleanup();
        return false;
    }
    else {
        std::cout << "套接字库版本正确!" << std::endl;
    }
    return true;
}
bool Server::initialization(const int& port)
{
m_port = port;
// 初始化库(windows独有)
//初始化套接字库
WORD w_req = MAKEWORD(2, 2);//版本号
WSADATA wsadata;
int err;
err = WSAStartup(w_req, &wsadata);
if (err != 0)
{
std::cout << "初始化套接字库失败!" << std::endl;
return false;
}
else
{
std::cout << "初始化套接字库成功!" << std::endl;
}
//检测版本号
if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2)
{
std::cout << "套接字库版本号不符!" << std::endl;
WSACleanup();
return false;
}
else
{
std::cout << "套接字库版本正确!" << std::endl;
}
return true;
}
bool Server::build_connect()
{
//服务端地址客户端地址
SOCKADDR_IN server_addr;
SOCKADDR_IN accept_addr;
//填充服务端信息
server_addr.sin_family = AF_INET; // 用来定义那种地址族,AF_INET:IPV4
server_addr.sin_addr.S_un.S_addr = htonl(INADDR_ANY); // 保存ip地址,htonl将一个无符号长整型转换为TCP/IP协议网络的大端
// INADDR_ANY表示一个服务器上的所有网卡
server_addr.sin_port = htons(m_port); // 端口号
//创建套接字
m_server = socket(AF_INET, SOCK_STREAM, 0); // 使用tcp进行连接
if (::bind(m_server, (SOCKADDR*)&server_addr, sizeof(SOCKADDR)) == SOCKET_ERROR)
{
std::cout << "套接字绑定失败!" << std::endl;
WSACleanup();
return false;
}
else
{
std::cout << "套接字绑定成功!" << std::endl;
}
//设置套接字为监听状态
if (listen(m_server, SOMAXCONN) < 0)
{
std::cout << "设置监听状态失败!" << std::endl;
WSACleanup();
return false;
}
else
{
std::cout << "设置监听状态成功!" << std::endl;
}
std::cout << "服务端正在监听连接,请稍候...." << std::endl;
//接受连接请求
int len = sizeof(SOCKADDR);
m_accept = accept(m_server, (SOCKADDR*)&accept_addr, &len);
if (m_accept == SOCKET_ERROR)
{
std::cout << "连接失败!" << std::endl;
WSACleanup();
return false;
}
std::cout << "连接建立,准备接受数据" << std::endl;
return true;
}
// Streams every frame of m_cap to the connected client.
// Protocol: first a 10-byte ASCII total-frame-count header; then, per frame,
// a 10-byte ASCII payload-length header, a 10-byte ASCII chunk-size header
// (SIZE), and the JPEG payload in fixed SIZE-byte chunks.
// Returns true when the capture source is exhausted.
bool Server::send_data()
{
cv::Mat frame;
std::vector<uchar> data_encode;
std::vector<int> params; // JPEG compression parameters
params.resize(3, 0);
params[0] = cv::IMWRITE_JPEG_QUALITY; // NOTE(review): quality 30 is strongly lossy, not lossless as the original comment claimed
params[1] = 30;
char frames_cnt[10] = { 0, };
// Send the total frame count first so the receiver knows how much to expect.
_itoa_s(int(m_cap.get(cv::CAP_PROP_FRAME_COUNT)), frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
std::cout << "开始发送" << std::endl;
int j = 0;
while (m_cap.read(frame)) {
m_file_in.push_back(frame.clone()); // keep a local copy of every sent frame
imencode(".jpg", frame, data_encode, params); // JPEG-compress the frame
int len_encoder = data_encode.size();
// Header 1: payload byte length.
_itoa_s(len_encoder, frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
// Header 2: chunk size used below.
_itoa_s(SIZE, frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
// Send the payload in SIZE-byte chunks; the final chunk is padded with
// leftover bytes from the previous one — the receiver relies on the
// length header to ignore the padding.
char send_char[SIZE] = { 0, };
int index = 0;
bool flag = false;
for (int i = 0; i < len_encoder / SIZE + 1; ++i) {
for (int k = 0; k < SIZE; ++k) {
if (index >= data_encode.size()) {
flag = true;
break;
}
send_char[k] = data_encode[index++];
}
send(m_accept, send_char, SIZE, 0);
}
data_encode.clear();
++j;
std::cout << j << std::endl; // progress: frames sent so far
}
std::cout << "发送完成";
return true;
}
// Splits a C string on `delimiter` and returns the pieces in order.
// An empty input yields an empty vector; consecutive delimiters yield
// empty tokens.
std::vector<std::string> split_string(const char str[], char delimiter)
{
    std::vector<std::string> parts;
    std::istringstream input(str);
    for (std::string piece; std::getline(input, piece, delimiter);)
    {
        parts.push_back(piece);
    }
    return parts;
}
// Parses a float from `str` via std::strtof.  On a completely unparseable
// input it logs to stderr and returns 0.0f (strtof's failure value).
float str_to_float(const std::string& str)
{
    const char* begin = str.c_str();
    char* parse_end = nullptr;
    const float value = std::strtof(begin, &parse_end);
    // strtof leaves the end pointer at the start when nothing was consumed.
    if (parse_end == begin)
    {
        std::cerr << "StringToFloat failed to convert string to float." << std::endl;
    }
    return value;
}
// Parses the five face landmarks from a split header
// ("len#x1#y1#...#x5#y5") into point2f and returns how many of them equal
// (0,0).  The client sends five (0,0) points when no face was detected, so a
// high return value means "no usable landmarks".
// FIX: takes the token vector by const reference (the original copied it on
// every frame) and guards against headers with fewer than 11 tokens instead
// of indexing out of bounds.
int str_to_point2f(const std::vector<std::string>& str, std::vector<cv::Point2f>& point2f)
{
    if (str.size() < 11)
    {
        // Malformed header: report all five landmarks as invalid.
        return 5;
    }
    // Tokens 1..10 are x/y pairs; token 0 is the payload length.
    for (int i = 1; i + 1 <= 10; i += 2)
    {
        point2f.push_back(cv::Point2f(str_to_float(str[i]), str_to_float(str[i + 1])));
    }
    const cv::Point2f zero(0, 0);
    int invalid_count = 0;
    for (const cv::Point2f& p : point2f)
    {
        if (p == zero)
        {
            invalid_count++;
        }
    }
    return invalid_count;
}
bool Server::receive_data()
{
cv::Mat cv_frame;
std::vector<uchar> data_decode;
std::vector<int> params; // 压缩参数
params.resize(3, 0);
params[0] = cv::IMWRITE_JPEG_QUALITY; // 无损压缩
params[1] = image_quality;
cv::namedWindow("ServerData", cv::WINDOW_NORMAL);
char frams_cnt[100] = { 0, };
// 解析总帧数
int count = atoi(frams_cnt);
//std::cout << count << std::endl;
int idx = 0;
while (1)
{
// 解析图片字节长度
int irecv = recv(m_accept, frams_cnt, 100, 0);
char s = '#';
std::vector<std::string> strs = split_string(frams_cnt, s);
if (strs.size() != 11)
{
break;
}
int cnt = std::stoi(strs[0]);
std::vector<cv::Point2f> face_points;
int point_index = str_to_point2f(strs,face_points);
data_decode.resize(cnt);//将队列大小重置为图片字节长度
int index = 0;//表示接收数据长度计量
count = cnt;//表示的是要从接收缓冲区接收字节的数量
char *recv_char = new char[cnt];//新建一个字节数组 数组长度为图片字节长度
while (count > 0)//这里只能写count > 0 如果写count >= 0 那么while循环会陷入一个死循环
{
//在网络通信中 recv 函数一次性接收到的字节数可能小于等于设定的SIZE大小,这时可能需要多次recv
int iRet = recv(m_accept, recv_char, count, 0);
int tmp = 0;
for (int k = 0; k < iRet; k++)
{
tmp = k+1;
index++;
if (index >= cnt) { break; }
}
memcpy(&data_decode[index - tmp ], recv_char , tmp);
if (!iRet) { return -1; }
count -= iRet;//更新余下需要从接收缓冲区接收的字节数量
}
delete[]recv_char;
try
{
cv_frame = cv::imdecode(data_decode, cv::IMREAD_COLOR);
if (!cv_frame.empty() && point_index < 2)
{
for (int i = 0; i < face_points.size(); ++i)
{
cv::circle(cv_frame, face_points[i], 4, cv::Scalar(0, 0, 255), -1, cv::LINE_AA);
}
cv::namedWindow("ServerData", 0);
cv::imshow("ServerData", cv_frame);
cv::waitKey(1);
data_decode.clear();
}
else
{
data_decode.clear();
continue;
}
}
catch (const char *msg)
{
data_decode.clear();
continue;
}
}
std::cout << "接受完成";
return true;
}
// Sends a single frame to the connected client.
// Protocol: a 10-byte ASCII payload-length header, a 10-byte ASCII
// chunk-size header (SIZE), then the JPEG payload in SIZE-byte chunks.
// Always returns true (send() results are not checked).
bool Server::send_data_frame(cv::Mat input)
{
cv::Mat frame = input;
std::vector<uchar> data_encode;
std::vector<int> params; // JPEG compression parameters
params.resize(3, 0);
params[0] = cv::IMWRITE_JPEG_QUALITY; // quality 100 = best quality (still lossy JPEG)
params[1] = 100;
char frames_cnt[10] = { 0, };
std::cout << "开始发送" << std::endl;
m_file_in.push_back(frame.clone()); // keep a local copy of the sent frame
imencode(".jpg", frame, data_encode, params); // JPEG-compress the frame
int len_encoder = data_encode.size();
// Header 1: payload byte length as ASCII.
_itoa_s(len_encoder, frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
// Header 2: chunk size used below.
_itoa_s(SIZE, frames_cnt, 10);
send(m_accept, frames_cnt, 10, 0);
// Send the payload in fixed SIZE-byte chunks; the receiver uses the length
// header to ignore any padding bytes in the final chunk.
char send_char[SIZE] = { 0, };
int index = 0;
bool flag = false;
for (int i = 0; i < len_encoder / SIZE + 1; ++i) {
for (int k = 0; k < SIZE; ++k) {
if (index >= data_encode.size()) {
flag = true;
break;
}
send_char[k] = data_encode[index++];
}
send(m_accept, send_char, SIZE, 0);
}
data_encode.clear();
std::cout << "发送完成";
return true;
}
// Receives one frame sent by send_data_frame / a compatible sender:
// a 10-byte ASCII payload-length header, a 10-byte ASCII chunk-size header
// (ignored here), then the JPEG payload, which is decoded into `output`.
// Returns false on any receive failure or a non-positive length header.
bool Server::receive_data_frame(cv::Mat& output)
{
    std::vector<uchar> data_decode;
    char frams_cnt[10] = { 0, };
    // Header 1: payload length.
    if (recv(m_accept, frams_cnt, 10, 0) <= 0)
    {
        return false;
    }
    int cnt = atoi(frams_cnt);
    std::cout << "frams_cnt " << frams_cnt <<" "<< cnt<< std::endl;
    // Header 2: the sender's chunk size (read to stay in sync, not used).
    if (recv(m_accept, frams_cnt, 10, 0) <= 0)
    {
        return false;
    }
    int size = atoi(frams_cnt);
    std::cout << "size " << size << std::endl;
    if (cnt <= 0)
    {
        return false; // malformed length header
    }
    data_decode.resize(cnt);
    std::cout << " cnt= " << cnt << std::endl;
    // BUG FIX: a single recv may return fewer than cnt bytes on a TCP
    // stream; loop until the whole JPEG payload has been received.  Also
    // receives directly into data_decode instead of a raw new[] buffer.
    int received = 0;
    while (received < cnt)
    {
        int iRecv = recv(m_accept, reinterpret_cast<char*>(&data_decode[received]), cnt - received, 0);
        if (iRecv <= 0)
        {
            return false;
        }
        received += iRecv;
    }
    std::cout << "data_decode" << data_decode.size() << std::endl;
    output = cv::imdecode(data_decode, cv::IMREAD_COLOR);
    std::cout << " output.size " << output.size().width << " "<< output.size().height << std::endl;
    data_decode.clear();
    std::cout << "接受完成";
    return true;
}
// Releases the capture device, closes both sockets, and unloads the
// Winsock DLL.  Always returns true.
bool Server::free_connect()
{
m_cap.release();
// Close the listening socket and the accepted client socket.
closesocket(m_server);
closesocket(m_accept);
// Release the Winsock DLL resources (Windows-specific).
WSACleanup();
return true;
}
1.2 类调用
在main函数进行调用,这里使用8080这个端口,端口可以按自己的要求更改,只要不占用就行:
#include "server.h"
int recv_online_video(int port = 8080)
{
Server ser;
ser.initialization(port);
ser.build_connect();
ser.receive_data();
ser.free_connect();
return 0;
}
// Server entry point: receive and display the client's video stream on
// port 8080.
int main()
{
recv_online_video(8080);
return 0;
}
2. 客户端
2.1 客户端类实现
客户端实现了对视频里面的图像进行人脸检测,人脸检测使用的yolov5s face,并把检测到人脸的关键点打包发送给服务器,然后实现数据序列化发送给服务器:
#include "CameraClient.h"
// Builds the face detector: CUDA on GPU 0 with a 4 GB memory limit,
// 4 threads, then loads the ONNX model from detect_model_path.
// NOTE(review): yolov5_face is allocated with raw `new`; prefer
// std::unique_ptr<YOLOV5Face> so ownership and cleanup are explicit.
CameraClient::CameraClient(std::string detect_model_path)
{
yolov5_face = new YOLOV5Face();
yolov5_face->set_gpu(0, 4);
yolov5_face->set_num_thread(4);
yolov5_face->read_model(detect_model_path);
}
// Releases the detector allocated in the constructor.
// NOTE(review): with a raw owning pointer the class is not copy-safe
// (Rule of Five); consider std::unique_ptr<YOLOV5Face> instead.
CameraClient::~CameraClient()
{
    // BUG FIX: the constructor allocates yolov5_face with `new`; the
    // original empty destructor leaked it.
    delete yolov5_face;
}
// Formats `value` in fixed notation with exactly `precision` digits after
// the decimal point.
std::string float_to_string(float value, int precision)
{
    std::ostringstream formatted;
    formatted << std::fixed << std::setprecision(precision) << value;
    return formatted.str();
}
// Serializes one landmark as "#x#y" with the requested number of decimal
// places.
// BUG FIX: the original ignored the `precision` parameter and hard-coded 3.
std::string point_to_str(cv::Point2f point, int precision)
{
    return ("#" + float_to_string(point.x, precision) + "#" + float_to_string(point.y, precision));
}
// Serializes all landmarks as a run of "#x#y" pairs,
// e.g. "#12.000#34.000#...".
std::string points_to_str(std::vector<cv::Point2f> points, int precision)
{
    std::string serialized;
    for (const cv::Point2f& pt : points)
    {
        serialized += point_to_str(pt, precision);
    }
    return serialized;
}
void CameraClient::get_camera_frame(std::string camera_path,int width,int height,int fps,bool is_show)
{
bool found = false;
cv::Size size(width, height);
for (const cv::Size& size : this->camera_size)
{
if (size == size)
{
found = true;
break;
}
}
if (!found)
{
std::cerr << " Camera resolution setting error." << std::endl;
return;
}
int mask = 0;
if (size == cv::Size(540, 960))
{
size = cv::Size(720, 1280);
mask = 1;
}
/*int index = camera_index < 0 ? 0 : camera_index;
cv::VideoCapture cap(index);*/
cv::VideoCapture cap(camera_path);
if (!cap.isOpened())
{
std::cerr << " Camera is no opened." << std::endl;
return;
}
cap.set(cv::CAP_PROP_FRAME_WIDTH, size.width);//宽度
cap.set(cv::CAP_PROP_FRAME_HEIGHT, size.height);
cap.set(cv::CAP_PROP_FPS, fps);
//保存抽帧的图像矩阵
cv::Mat cv_frame;
while (true)
{
cap >> cv_frame;
if (cv_frame.empty())
{
break;
}
if (mask == 1)
{
cv::resize(cv_frame, cv_frame, cv::Size(540, 960));
}
mutex_lock.lock();
//保证队列里只有五帧,以保证内存的开销
if (mat_info_input.size() > max_frame)
{
mat_info_input.pop();
}
else
{
std::vector<FaceInfo> src_face_infos;
MatInfo mat_info;
if (yolov5_face->detect(cv_frame, src_face_infos))
{
mat_info.points = src_face_infos[0].points;
mat_info.cv_mat = cv_frame.clone();
}
else
{
std::vector<cv::Point2f> p(5, cv::Point2f(0.0f, 0.0f));
mat_info.cv_mat = cv_frame.clone();
mat_info.points = p;
}
mat_info_input.push(mat_info);
}
if (is_show)
{
cv::namedWindow("cam src", 0);
cv::imshow("cam src", cv_frame);
if (cv::waitKey(1) == 27)
{
break;
}
}
mutex_lock.unlock();
}
}
// Consumer thread: pops detected frames from the shared queue and streams
// them to the server over TCP.
// Per-frame protocol: a 100-byte header "<jpegLen>#x1#y1#...#x5#y5" (JPEG
// length plus five serialized landmarks), then jpegLen bytes of JPEG payload.
// Returns -1 on any Winsock setup/connect failure; the send loop never exits
// on its own, so the function otherwise runs until the process stops.
int CameraClient::send_camera_frame(std::string server_ip, int port)
{
// Request Winsock version 2.2.
WORD w_req = MAKEWORD(2, 2);
// WSADATA receives the library's initialization details.
WSADATA wsadata;
int err;
// Start up the Winsock library.
err = WSAStartup(w_req, &wsadata);
if (err != 0)
{
// Initialization failed.
std::cout << "初始化套接字库失败!" << std::endl;
return -1;
}
else
{
// Initialization succeeded.
std::cout << "初始化套接字库成功!" << std::endl;
}
// Verify the negotiated Winsock version.
if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2)
{
std::cout << "套接字库版本号不符!" << std::endl;
WSACleanup(); // unload Winsock
return -1;
}
else
{
std::cout << "套接字库版本正确!" << std::endl;
}
// Server address structure (IPv4).
SOCKADDR_IN server_addr;
server_addr.sin_family = AF_INET;
// Convert the dotted-decimal IP string to its binary form.
struct in_addr addr;
int result = inet_pton(AF_INET, server_ip.c_str(), &addr);
if (result <= 0)
{
if (result == 0)
{
std::cerr << "inet_pton failed: address is invalid." << std::endl;
}
else
{
std::cerr << "inet_pton failed: error occurred." << std::endl;
}
return -1;
}
server_addr.sin_addr = addr;
// Port number in network byte order.
server_addr.sin_port = htons(port);
// Create the TCP socket and connect to the server.
SOCKET m_server = socket(AF_INET, SOCK_STREAM, 0);
if (connect(m_server, (SOCKADDR*)&server_addr, sizeof(SOCKADDR)) == SOCKET_ERROR)
{
std::cout << "服务器连接失败!" << std::endl;
WSACleanup();
return -1;
}
else
{
std::cout << "服务器连接成功!" << std::endl;
}
// Current frame and its detected landmarks.
cv::Mat cv_frame;
std::vector<cv::Point2f> face_points;
// JPEG-encoded bytes of the current frame.
std::vector<uchar> data_encode;
// JPEG encoding parameters.
std::vector<int> params;
params.resize(4, 0);
// Compression type: JPEG quality.
params[0] = cv::IMWRITE_JPEG_QUALITY;
// Quality level taken from the image_quality member.
params[1] = image_quality;
// Header buffer: "<jpegLen>" followed by the serialized landmarks.
char frames_cnt[100] = { 0, };
std::cout << "开始发送" << std::endl;
int j = 0;
while (true)
{
// Lock the queue for safe access shared with the capture thread.
mutex_lock.lock();
if (mat_info_input.empty()) // nothing queued yet: wait and retry
{
mutex_lock.unlock(); // release the lock
Sleep(3); // brief sleep
continue;
}
else
{
cv_frame = mat_info_input.front().cv_mat; // take the oldest frame
face_points = mat_info_input.front().points;
mat_info_input.pop();
mutex_lock.unlock(); // release the lock
Sleep(1); // brief sleep
}
// JPEG-compress the frame.
imencode(".jpg", cv_frame, data_encode, params);
std::string str = points_to_str(face_points, 3);
// Payload size after compression.
int len_encoder = data_encode.size();
char len[10] = {0,};
// Render the length as ASCII digits.
_itoa_s(len_encoder, len, 10);
// Header = length string ...
strcpy(frames_cnt, len);
// ... followed by the "#x#y" landmark string.
// NOTE(review): strcat does not bounds-check; the length digits plus five
// serialized landmarks could approach the 100-byte buffer — verify the
// combined header can never overflow it.
strcat(frames_cnt, str.c_str());
std::cout << str.size() << " len_encoder = " << frames_cnt << std::endl;
send(m_server, frames_cnt, 100, 0);
// Send the JPEG payload in a single call.
char* send_b = new char[data_encode.size()];
memcpy(send_b, &data_encode[0], data_encode.size());
int iSend = send(m_server, send_b, data_encode.size(), 0);
delete[]send_b;
// Reset for the next frame.
data_encode.clear();
++j;
}
// NOTE(review): unreachable — the loop above never breaks, so the socket
// is never closed here and no int value is ever returned from this path.
std::cout << "发送完成";
closesocket(m_server); // close the socket
WSACleanup(); // unload Winsock
}
2.2 类调用
在main里面使用多线程进行调用:
#include "CameraClient.h"
#include <thread>
// Client entry point: one thread captures frames and runs face detection,
// a second thread streams the queued frames (with landmarks) to the server.
int main()
{
std::string video_path = "A.mp4"; // camera/video source passed to get_camera_frame
std::string server_ip = "192.168.xx.xxx"; // replace with the server's LAN IP
int port = 8080; // must match the server's listen port
std::string detect_model_path = "models/yolov5face-s-640x640.onnx";
CameraClient video_client(detect_model_path);
int w = 720;
int h = 1280;
// Producer: grab frames, detect faces, push into the shared queue.
std::thread Get(&CameraClient::get_camera_frame, &video_client, video_path, w,h, 25, true);
// Consumer: pop frames and send them to the server.
std::thread Send(&CameraClient::send_camera_frame, &video_client, server_ip, port);
Get.join();
Send.join();
return 0;
}
2.3 服务器IP地址
在Windows系统中获取IP地址可以通过多种方法实现,包括使用命令行工具、编程接口或图形用户界面。以下是一些常用的方法:
使用命令行工具
在命令提示符(cmd)或PowerShell中,可以使用ipconfig
命令来查看IP地址:
ipconfig
这将列出所有网络适配器的详细信息,包括IPv4和IPv6地址。如果你只想获取特定的适配器的IP地址,可以使用:
ipconfig | findstr "适配器名称"
替换“适配器名称”为你想要查询的网络适配器的名称。
使用Windows API
如果你在编程中需要获取Windows系统的IP地址,可以使用Windows API函数。例如,使用以下C++代码:
#include <winsock2.h>
#include <iphlpapi.h>
#include <iostream>
#pragma comment(lib, "iphlpapi.lib")
#pragma comment(lib, "ws2_32.lib")
// Example: enumerate local network adapters and print their IPv4 addresses
// using GetAdaptersInfo (Windows IP Helper API).
int main() {
    WSADATA d;
    if (WSAStartup(MAKEWORD(2, 2), &d) != 0) {
        return -1;
    }
    ULONG outBufLen = sizeof(IP_ADAPTER_INFO);
    PIP_ADAPTER_INFO pAdapterInfo = (IP_ADAPTER_INFO *)malloc(outBufLen);
    if (pAdapterInfo == NULL) {
        // BUG FIX: the original never checked malloc and would dereference
        // NULL on allocation failure.
        WSACleanup();
        return -1;
    }
    DWORD dwRetVal = GetAdaptersInfo(pAdapterInfo, &outBufLen);
    if (dwRetVal == ERROR_BUFFER_OVERFLOW) {
        // The initial buffer was too small; outBufLen now holds the
        // required size, so reallocate and retry once.
        free(pAdapterInfo);
        pAdapterInfo = (IP_ADAPTER_INFO *)malloc(outBufLen);
        if (pAdapterInfo == NULL) {
            WSACleanup();
            return -1;
        }
        dwRetVal = GetAdaptersInfo(pAdapterInfo, &outBufLen);
    }
    if (dwRetVal == NO_ERROR) {
        for (PIP_ADAPTER_INFO pAdapter = pAdapterInfo; pAdapter != NULL; pAdapter = pAdapter->Next) {
            std::cout << "适配器名称: " << pAdapter->AdapterName << std::endl;
            std::cout << "IPv4 地址: " << pAdapter->IpAddressList.IpAddress.String << std::endl;
            // Additional addresses/adapters can be walked here.
        }
    }
    free(pAdapterInfo);
    WSACleanup();
    return 0;
}
3.人脸检测
YOLOv5 是一个流行的实时对象检测系统,它使用单个卷积神经网络同时预测对象的类别和位置。虽然YOLOv5主要用于对象检测任务,但也可以被训练或微调以检测特定类型的物体,比如人脸。
#include "YOLOV5Face.h"
// Loads the YOLOv5-face ONNX model and caches input/output names and shapes.
// @param onnx_path    path to the .onnx file (the string->wstring conversion
//                     widens byte-by-byte, so ASCII paths only)
// @param _num_threads intra-op thread count for onnxruntime
YOLOV5Face::YOLOV5Face(std::string& onnx_path, unsigned int _num_threads)
{
    std::wstring widestr = std::wstring(onnx_path.begin(), onnx_path.end());
    Ort::SessionOptions session_options;
    // BUG FIX: honor the requested thread count (the original hard-coded 4
    // and ignored _num_threads).
    session_options.SetIntraOpNumThreads(_num_threads);
    OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    // BUG FIX: the original created a local Ort::Env here; it was destroyed
    // when the constructor returned while the session still referenced it.
    // Use the class member `env` (the one read_model() also uses) instead.
    ort_session = new Ort::Session(env, widestr.c_str(), session_options);
    Ort::AllocatorWithDefaultOptions allocator;
    input_name = ort_session->GetInputName(0, allocator);
    input_node_names.resize(1);
    input_node_names[0] = input_name;
    // Flat input tensor size = product of the input dims.
    Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    input_tensor_size = 1;
    input_node_dims = tensor_info.GetShape();
    for (unsigned int i = 0; i < input_node_dims.size(); ++i)
        input_tensor_size *= input_node_dims.at(i);
    input_values_handler.resize(input_tensor_size);
    // Cache output names & dims.
    num_outputs = ort_session->GetOutputCount();
    output_node_names.resize(num_outputs);
    for (unsigned int i = 0; i < num_outputs; ++i)
    {
        output_node_names[i] = ort_session->GetOutputName(i, allocator);
        Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
        auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
        auto output_dims = output_tensor_info.GetShape();
        output_node_dims.push_back(output_dims);
    }
}
// Default constructor: members keep their in-class defaults.  Callers are
// expected to call read_model() (and optionally set_gpu()/set_num_thread())
// before detect() — see CameraClient's constructor for the usage pattern.
YOLOV5Face::YOLOV5Face()
{
}
// Enables the CUDA execution provider on device `gpu_index` with a GPU
// memory limit of `gpu_ram` GiB (-1 means unlimited).  Does nothing when
// gpu_index is negative or this onnxruntime build has no CUDA provider.
void YOLOV5Face::set_gpu(int gpu_index, int gpu_ram)
{
    std::vector<std::string> available_providers = Ort::GetAvailableProviders();
    auto cuda_available = std::find(available_providers.begin(),
        available_providers.end(), "CUDAExecutionProvider");
    if (gpu_index >= 0 && (cuda_available != available_providers.end()))
    {
        OrtCUDAProviderOptions cuda_options;
        cuda_options.device_id = gpu_index;
        cuda_options.arena_extend_strategy = 0;
        if (gpu_ram == -1)
        {
            // No limit.
            cuda_options.gpu_mem_limit = ~0ULL;
        }
        else
        {
            // BUG FIX: `gpu_ram * 1024 * 1024 * 1024` was evaluated in
            // 32-bit int and overflowed for gpu_ram >= 2; widen first.
            cuda_options.gpu_mem_limit = static_cast<size_t>(gpu_ram) * 1024ULL * 1024ULL * 1024ULL;
        }
        cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearch::OrtCudnnConvAlgoSearchExhaustive;
        cuda_options.do_copy_in_default_stream = 1;
        session_options.AppendExecutionProvider_CUDA(cuda_options);
    }
}
// Applies the same thread budget to both inter- and intra-op parallelism
// and enables full graph optimization on the session options.
void YOLOV5Face::set_num_thread(int num_thread)
{
    _num_thread = num_thread;
    session_options.SetIntraOpNumThreads(num_thread);
    session_options.SetInterOpNumThreads(num_thread);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
}
// Loads the ONNX model at model_path into a new session and caches input /
// output names and shapes for inference.
// @return true on success, false if onnxruntime throws while loading.
// NOTE(review): the string->wstring conversion widens byte-by-byte and is
// only correct for ASCII paths.
bool YOLOV5Face::read_model(const std::string model_path)
{
    try
    {
        std::wstring widestr = std::wstring(model_path.begin(), model_path.end());
        ort_session = new Ort::Session(env, widestr.c_str(), session_options);
        Ort::AllocatorWithDefaultOptions allocator;
        input_name = ort_session->GetInputName(0, allocator);
        input_node_names.resize(1);
        input_node_names[0] = input_name;
        // Flat input tensor size = product of the input dims.
        Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        input_tensor_size = 1;
        input_node_dims = tensor_info.GetShape();
        for (unsigned int i = 0; i < input_node_dims.size(); ++i)
            input_tensor_size *= input_node_dims.at(i);
        input_values_handler.resize(input_tensor_size);
        // Cache output names & dims.
        num_outputs = ort_session->GetOutputCount();
        output_node_names.resize(num_outputs);
        for (unsigned int i = 0; i < num_outputs; ++i)
        {
            output_node_names[i] = ort_session->GetOutputName(i, allocator);
            Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
            auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
            auto output_dims = output_tensor_info.GetShape();
            output_node_dims.push_back(output_dims);
        }
        return true;
    }
    catch (const std::exception&)
    {
        return false;
    }
    // FIX: removed the unreachable trailing `return true;` — both paths
    // above already return.
}
// Returns (mat - mean) * scale as a CV_32FC3 matrix; the input is converted
// to float first when needed (otherwise it is referenced, not copied).
cv::Mat normalize(const cv::Mat& mat, float mean, float scale)
{
    cv::Mat as_float;
    if (mat.type() == CV_32FC3)
    {
        as_float = mat; // already float: shallow reference
    }
    else
    {
        mat.convertTo(as_float, CV_32FC3);
    }
    return (as_float - mean) * scale;
}
// Out-of-place wrapper: writes (inmat - mean) * scale into outmat.
void normalize(const cv::Mat& inmat, cv::Mat& outmat,
float mean, float scale)
{
outmat = normalize(inmat, mean, scale);
}
// In-place variant: converts mat_inplace to CV_32FC3 first, then applies
// (x - mean) * scale.
void normalize_inplace(cv::Mat& mat_inplace, float mean, float scale)
{
if (mat_inplace.type() != CV_32FC3) mat_inplace.convertTo(mat_inplace, CV_32FC3);
normalize(mat_inplace, mat_inplace, mean, scale);
}
// Copies `mat` into an onnxruntime float tensor shaped tensor_dims (batch
// must be 1).  data_format selects CHW (channels split and packed planar)
// or HWC (raw interleaved memcpy); pixels are resized to the target HxW when
// needed and staged in tensor_value_handler, which must outlive the returned
// Ort::Value (CreateTensor does not copy the buffer).
// Throws std::runtime_error on dims/batch/channel mismatch.
// FIX: dropped the dynamic exception specification `throw(std::runtime_error)`
// — deprecated in C++11 and removed from the language in C++17.
Ort::Value create_tensor(const cv::Mat& mat,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler,
    unsigned int data_format)
{
    const unsigned int rows = mat.rows;
    const unsigned int cols = mat.cols;
    const unsigned int channels = mat.channels();
    // Ensure float data; reference (no copy) when already float.
    cv::Mat mat_ref;
    if (mat.type() != CV_32FC(channels)) mat.convertTo(mat_ref, CV_32FC(channels));
    else mat_ref = mat; // reference only. zero-time cost. support 1/2/3/... channels
    if (tensor_dims.size() != 4) throw std::runtime_error("dims mismatch.");
    if (tensor_dims.at(0) != 1) throw std::runtime_error("batch != 1");
    // CXHXW layout.
    if (data_format == CHW)
    {
        const unsigned int target_height = tensor_dims.at(2);
        const unsigned int target_width = tensor_dims.at(3);
        const unsigned int target_channel = tensor_dims.at(1);
        const unsigned int target_tensor_size = target_channel * target_height * target_width;
        if (target_channel != channels) throw std::runtime_error("channel mismatch.");
        tensor_value_handler.resize(target_tensor_size);
        cv::Mat resize_mat_ref;
        if (target_height != rows || target_width != cols)
            cv::resize(mat_ref, resize_mat_ref, cv::Size(target_width, target_height));
        else resize_mat_ref = mat_ref; // reference only. zero-time cost.
        // Split interleaved HWC pixels into per-channel planes, then pack
        // the planes back to back (planar CHW).
        std::vector<cv::Mat> mat_channels;
        cv::split(resize_mat_ref, mat_channels);
        for (unsigned int i = 0; i < channels; ++i)
            std::memcpy(tensor_value_handler.data() + i * (target_height * target_width),
                mat_channels.at(i).data, target_height * target_width * sizeof(float));
        return Ort::Value::CreateTensor<float>(memory_info_handler, tensor_value_handler.data(),
            target_tensor_size, tensor_dims.data(),
            tensor_dims.size());
    }
    // HXWXC layout: the cv::Mat memory is already interleaved, one memcpy.
    const unsigned int target_height = tensor_dims.at(1);
    const unsigned int target_width = tensor_dims.at(2);
    const unsigned int target_channel = tensor_dims.at(3);
    const unsigned int target_tensor_size = target_channel * target_height * target_width;
    if (target_channel != channels) throw std::runtime_error("channel mismatch!");
    tensor_value_handler.resize(target_tensor_size);
    cv::Mat resize_mat_ref;
    if (target_height != rows || target_width != cols)
        cv::resize(mat_ref, resize_mat_ref, cv::Size(target_width, target_height));
    else resize_mat_ref = mat_ref; // reference only. zero-time cost.
    std::memcpy(tensor_value_handler.data(), resize_mat_ref.data, target_tensor_size * sizeof(float));
    return Ort::Value::CreateTensor<float>(memory_info_handler, tensor_value_handler.data(),
        target_tensor_size, tensor_dims.data(),
        tensor_dims.size());
}
// Converts a letterboxed BGR frame into the model's input tensor:
// BGR -> RGB, normalize with mean_val/scale_val, then pack as CHW float32.
Ort::Value YOLOV5Face::transform(const cv::Mat& mat_rs)
{
cv::Mat canvas;
cv::cvtColor(mat_rs, canvas, cv::COLOR_BGR2RGB);
normalize_inplace(canvas, mean_val, scale_val); // float32
return create_tensor(
canvas, input_node_dims, memory_info_handler,
input_values_handler, CHW);
}
// Letterbox resize: scales `mat` to fit target WxH while keeping its aspect
// ratio and pads the borders with black.  The scale ratio and the padding
// offsets are recorded in scale_params so detections can later be mapped
// back to the original image coordinates.
void YOLOV5Face::resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
    int target_height, int target_width,ScaleParams& scale_params)
{
    if (mat.empty()) return;
    const int src_h = static_cast<int>(mat.rows);
    const int src_w = static_cast<int>(mat.cols);
    // Start from a black canvas of the target size.
    mat_rs = cv::Mat(target_height, target_width, CV_8UC3,
        cv::Scalar(0, 0, 0));
    // Uniform scale factor: the tighter of the two per-axis ratios.
    const float ratio = std::min((float)target_width / (float)src_w,
        (float)target_height / (float)src_h);
    // Scaled (un-padded) size, floored.
    const int scaled_w = static_cast<int>((float)src_w * ratio);
    const int scaled_h = static_cast<int>((float)src_h * ratio);
    // Center the scaled image: split the padding evenly on both sides.
    const int offset_x = (target_width - scaled_w) / 2;
    const int offset_y = (target_height - scaled_h) / 2;
    cv::Mat scaled;
    cv::resize(mat, scaled, cv::Size(scaled_w, scaled_h));
    scaled.copyTo(mat_rs(cv::Rect(offset_x, offset_y, scaled_w, scaled_h)));
    // Record how to undo this transform.
    scale_params.ratio = ratio;
    scale_params.dw = offset_x;
    scale_params.dh = offset_y;
    scale_params.flag = true;
}
// Decodes the raw YOLOv5-face output tensor (1, n, 16) into boxes and five
// landmarks in ORIGINAL-image coordinates.
// Row layout: [cx, cy, w, h, obj_conf, x1,y1..x5,y5 (10 landmark coords),
// cls_conf].  scale_params holds the letterbox ratio/offsets to undo
// resize_unscale(); results are clamped to the image bounds and collection
// stops after max_nms candidates.
void YOLOV5Face::generate_bboxes_kps(const ScaleParams& scale_params,
std::vector<lite::types::BoxfWithLandmarks>& bbox_kps_collection,
std::vector<Ort::Value>& output_tensors, float score_threshold,
float img_height, float img_width)
{
Ort::Value& output = output_tensors.at(0); // (1,n,16=4+1+10+1)
auto output_dims = output_node_dims.at(0); // (1,n,16)
const unsigned int num_anchors = output_dims.at(1); // n = ?
const float* output_ptr = output.GetTensorMutableData<float>();
// Letterbox parameters used to map back to the original image.
float r_ = scale_params.ratio;
int dw_ = scale_params.dw;
int dh_ = scale_params.dh;
bbox_kps_collection.clear();
unsigned int count = 0;
for (unsigned int i = 0; i < num_anchors; ++i)
{
const float* row_ptr = output_ptr + i * 16;
float obj_conf = row_ptr[4];
if (obj_conf < score_threshold) continue; // filter on objectness first.
float cls_conf = row_ptr[15];
if (cls_conf < score_threshold) continue; // then on the face-class score.
// Bounding box: center/size -> corners, then undo padding and scaling.
const float* offsets = row_ptr;
float cx = offsets[0];
float cy = offsets[1];
float w = offsets[2];
float h = offsets[3];
lite::types::BoxfWithLandmarks box_kps;
float x1 = ((cx - w / 2.f) - (float)dw_) / r_;
float y1 = ((cy - h / 2.f) - (float)dh_) / r_;
float x2 = ((cx + w / 2.f) - (float)dw_) / r_;
float y2 = ((cy + h / 2.f) - (float)dh_) / r_;
// Clamp to the original image bounds.
box_kps.box.x1 = std::max(0.f, x1);
box_kps.box.y1 = std::max(0.f, y1);
box_kps.box.x2 = std::min(img_width - 1.f, x2);
box_kps.box.y2 = std::min(img_height - 1.f, y2);
box_kps.box.score = cls_conf;
box_kps.box.label = 1;
box_kps.box.label_text = "face";
box_kps.box.flag = true;
// Five landmarks: same un-letterboxing, clamped to image bounds.
const float* kps_offsets = row_ptr + 5;
for (unsigned int j = 0; j < 10; j += 2)
{
cv::Point2f kps;
float kps_x = (kps_offsets[j] - (float)dw_) / r_;
float kps_y = (kps_offsets[j + 1] - (float)dh_) / r_;
kps.x = std::min(std::max(0.f, kps_x), img_width - 1.f);
kps.y = std::min(std::max(0.f, kps_y), img_height - 1.f);
box_kps.landmarks.points.push_back(kps);
}
box_kps.landmarks.flag = true;
box_kps.flag = true;
bbox_kps_collection.push_back(box_kps);
count += 1; // limit the number of candidates handed to NMS.
if (count > max_nms)
break;
}
}
// Greedy hard-NMS: sorts candidates by descending score, keeps each
// highest-scoring box and suppresses every remaining box whose IoU with it
// exceeds iou_threshold.  At most `topk` boxes are appended to `output`.
// Note: `input` is re-sorted in place.
void YOLOV5Face::nms_bboxes_kps(std::vector<lite::types::BoxfWithLandmarks>& input,
    std::vector<lite::types::BoxfWithLandmarks>& output,
    float iou_threshold, unsigned int topk)
{
    if (input.empty()) return;
    std::sort(
        input.begin(), input.end(),
        [](const lite::types::BoxfWithLandmarks& lhs, const lite::types::BoxfWithLandmarks& rhs)
        { return lhs.box.score > rhs.box.score; }
    );
    const unsigned int total = input.size();
    std::vector<int> suppressed(total, 0);
    unsigned int kept = 0;
    for (unsigned int i = 0; i < total; ++i)
    {
        if (suppressed[i]) continue;
        suppressed[i] = 1;
        // Suppress every later (lower-scoring) box that overlaps too much.
        for (unsigned int j = i + 1; j < total; ++j)
        {
            if (suppressed[j]) continue;
            const float overlap = static_cast<float>(input[i].box.iou_of(input[j].box));
            if (overlap > iou_threshold)
            {
                suppressed[j] = 1;
            }
        }
        output.push_back(input[i]);
        kept += 1;
        if (kept >= topk)
            break;
    }
}
// Full detection pipeline on one BGR frame: letterbox resize -> input
// tensor -> ONNX inference -> decode boxes/landmarks -> hard NMS.
// Results (in original-image coordinates) are written to detected_boxes_kps.
void YOLOV5Face::detect(const cv::Mat& mat, std::vector<lite::types::BoxfWithLandmarks>& detected_boxes_kps,
float score_threshold, float iou_threshold, unsigned int topk)
{
if (mat.empty()) return;
auto img_height = static_cast<float>(mat.rows);
auto img_width = static_cast<float>(mat.cols);
// Model input size from the cached input shape.
const int target_height = (int)input_node_dims.at(2);
const int target_width = (int)input_node_dims.at(3);
// resize & unscale (letterbox)
cv::Mat mat_rs;
ScaleParams scale_params;
this->resize_unscale(mat, mat_rs, target_height, target_width, scale_params);
// 1. make input tensor
Ort::Value input_tensor = this->transform(mat_rs);
// 2. inference scores & boxes.
auto output_tensors = ort_session->Run(
Ort::RunOptions{ nullptr }, input_node_names.data(),
&input_tensor, 1, output_node_names.data(), num_outputs
);
// 3. rescale & exclude.
std::vector<lite::types::BoxfWithLandmarks> bbox_kps_collection;
this->generate_bboxes_kps(scale_params, bbox_kps_collection, output_tensors,
score_threshold, img_height, img_width);
// 4. hard nms with topk.
this->nms_bboxes_kps(bbox_kps_collection, detected_boxes_kps, iou_threshold, topk);
}
// Convenience overload used by the streaming client: runs the full pipeline
// and reduces the result to at most one FaceInfo.
// Returns true and appends to face_infos when:
//  - exactly one face was detected, or
//  - several faces were detected and position == 1, in which case the
//    largest box with score >= 0.7 is chosen (index stays 0 when none pass
//    — NOTE(review): confirm that fallback is intended).
// Returns false otherwise (no detection, or multiple faces with
// position != 1).
bool YOLOV5Face::detect(const cv::Mat& cv_src, std::vector<FaceInfo>& face_infos, int position,
float score_threshold, float iou_threshold, unsigned int topk)
{
auto img_height = static_cast<float>(cv_src.rows);
auto img_width = static_cast<float>(cv_src.cols);
// Model input size from the cached input shape.
const int target_height = (int)input_node_dims.at(2);
const int target_width = (int)input_node_dims.at(3);
// resize & unscale
cv::Mat mat_rs;
ScaleParams scale_params;
this->resize_unscale(cv_src, mat_rs, target_height, target_width, scale_params);
// 1. make input tensor
Ort::Value input_tensor = this->transform(mat_rs);
// 2. inference scores & boxes.
auto output_tensors = ort_session->Run(
Ort::RunOptions{ nullptr }, input_node_names.data(),
&input_tensor, 1, output_node_names.data(), num_outputs
);
// 3. rescale & exclude.
std::vector<lite::types::BoxfWithLandmarks> bbox_kps_collection;
this->generate_bboxes_kps(scale_params, bbox_kps_collection, output_tensors,
score_threshold, img_height, img_width);
std::vector<lite::types::BoxfWithLandmarks> detected_boxes_kps;
// 4. hard nms with topk.
this->nms_bboxes_kps(bbox_kps_collection, detected_boxes_kps, iou_threshold, topk);
if (detected_boxes_kps.size() == 1)
{
FaceInfo info;
info.bbox.xmin = detected_boxes_kps[0].box.rect().x;
info.bbox.ymin = detected_boxes_kps[0].box.rect().y;
info.bbox.xmax = detected_boxes_kps[0].box.rect().br().x;
info.bbox.ymax = detected_boxes_kps[0].box.rect().br().y;
info.points = detected_boxes_kps[0].landmarks.points;
face_infos.push_back(info);
return true;
}
else if (detected_boxes_kps.size() > 1 && position == 1)
{
// Pick the largest sufficiently-confident face.
int arec = 0;
int index = 0;
for (int i = 0; i < detected_boxes_kps.size(); ++i)
{
if (detected_boxes_kps[i].box.score >= 0.7)
{
if (arec <= detected_boxes_kps[i].box.rect().area())
{
arec = detected_boxes_kps[i].box.rect().area();
index = i;
}
}
}
FaceInfo info;
info.bbox.xmin = detected_boxes_kps[index].box.rect().x;
info.bbox.ymin = detected_boxes_kps[index].box.rect().y;
info.bbox.xmax = detected_boxes_kps[index].box.rect().br().x;
info.bbox.ymax = detected_boxes_kps[index].box.rect().br().y;
info.points = detected_boxes_kps[index].landmarks.points;
face_infos.push_back(info);
return true;
}
return false;
}
4. 实现效果
整个源码下载地址:https://download.csdn.net/download/matt45m/89595413