记录在 RV1126 上实现 BiSeNet 网络推理的过程.
https://github.com/CoinCheung/BiSeNet
ONNX
生成 onnx
模型
python tools/export_onnx.py --config configs/bisenetv2_city.py --weight-path ./checkpoints/model_final_v2_city.pth --outpath ./checkpoints/model_final_v2_city.onnx --no-onnxsim
转换 RKNN 模型
- 如果是自定义数据集, mean 和 std 参考自己数据集的统计值.
- datasets.txt 用于量化校准, datasets_ans.txt 用于精度分析.
- 量化类型使用:
asymmetric_affine-u8
from rknn.api import RKNN ONNX_MODEL = './model/model_final_v2_city.onnx' RKNN_MODEL = './model/model_final_v2_city_u8.rknn' QUANTIZE_ON = True _force_builtin_perm = False _acc_analysis_output_dir = './output_dir' _acc_analysis_dataset = './images/city/datasets_ans.txt' _qua_dataset = './images/city/datasets.txt' if __name__ == '__main__': # Create RKNN object rknn = RKNN(verbose=True) # pre-process config # asymmetric_affine-u8, dynamic_fixed_point-i8, dynamic_fixed_point-i16 print('--> config model') rknn.config( reorder_channel='0 1 2', mean_values=[[83.0535, 94.095, 82.1865]], std_values=[[53.856, 54.774, 75.786]], optimization_level=3, target_platform = 'rv1126', quantize_input_node= QUANTIZE_ON, quantized_dtype='asymmetric_affine-u8', batch_size=32, output_optimize=1, force_builtin_perm=_force_builtin_perm) print('done') print('--> Loading model') # ret = rknn.load_onnx(model=ONNX_MODEL, outputs=['output0', 'output1']) ret = rknn.load_onnx(model=ONNX_MODEL, outputs=['preds']) if ret != 0: print('Load model failed!') exit(ret) print('done') # Build model print('--> Building model') ret = rknn.build(do_quantization=QUANTIZE_ON, dataset=_qua_dataset,pre_compile=True) if ret != 0: print('Build pp_liteseg_stdc1_camvid_960x720_10k_model failed!') exit(ret) print('done') # Export rknn model print('--> Export RKNN model') ret = rknn.export_rknn(RKNN_MODEL) if ret != 0: print('Export failed!') exit(ret) print('done') print('--> Accuracy analysis') ret = rknn.accuracy_analysis(inputs=_acc_analysis_dataset,output_dir=_acc_analysis_output_dir) if ret != 0: print('accuracy_analysis failed!') exit(ret) print('done') rknn.release()
NPU 推理
- 主要参考作者给的
ncnn
和tensorrt
demo 实现.#include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <sys/time.h> #include <dirent.h> #include <iostream> #include <fstream> #include <sstream> #include <queue> #include "rknn_api.h" #include "opencv2/opencv.hpp" #include "opencv2/core/core.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/highgui/highgui.hpp" #include <chrono> #include <iostream> #include <random> #include <algorithm> #include <stdio.h> #include <string> #include <vector> using namespace std; using namespace cv; void printRKNNTensor(rknn_tensor_attr *attr) { printf("index=%d name=%s n_dims=%d dims=[%d %d %d %d] n_elems=%d size=%d " "fmt=%d type=%d qnt_type=%d fl=%d zp=%d scale=%f\n", attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], attr->n_elems, attr->size, 0, attr->type, attr->qnt_type, attr->fl, attr->zp, attr->scale); } vector<vector<uint8_t>> get_color_map() { vector<vector<uint8_t>> color_map(256, vector<uint8_t>(3)); std::minstd_rand rand_eng(123); std::uniform_int_distribution<uint8_t> u(0, 255); for (int i{0}; i < 256; ++i) { for (int j{0}; j < 3; ++j) { color_map[i][j] = u(rand_eng); } } return color_map; } cv::Mat static_resize(cv::Mat &img, int INPUT_W, int INPUT_H) { float r = std::min(INPUT_W / (img.cols * 1.0), INPUT_H / (img.rows * 1.0)); // r = std::min(r, 1.0f); int unpad_w = r * img.cols; int unpad_h = r * img.rows; cv::Mat re(unpad_h, unpad_w, CV_8UC3); cv::resize(img, re, re.size()); cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114)); re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); return out; } int main(int argc, char *argv[]) { std::string model_path = std::string(argv[1]); // std::string imagepath = std::string(argv[2]); std::string folder_path = std::string(argv[2]); int input_width_ = std::atoi(argv[3]); int input_height_ = std::atoi(argv[4]); std::vector<cv::String> file_names; cv::glob(folder_path, file_names); int oH{input_height_}, oW{input_width_}, n_classes{2}; // 
Load model FILE *fp = fopen(model_path.c_str(), "rb"); if (fp == NULL) { printf("fopen %s fail!\n", model_path); return -1; } fseek(fp, 0, SEEK_END); int model_len = ftell(fp); void *model = malloc(model_len); fseek(fp, 0, SEEK_SET); if (model_len != fread(model, 1, model_len, fp)) { printf("fread %s fail!\n", model_path); free(model); return -1; } rknn_context ctx = 0; int ret = rknn_init(&ctx, model, model_len, 0); if (ret < 0) { printf("rknn_init fail! ret=%d\n", ret); return -1; } /* Query sdk version */ rknn_sdk_version version; ret = rknn_query(ctx, RKNN_QUERY_SDK_VERSION, &version, sizeof(rknn_sdk_version)); if (ret < 0) { printf("rknn_init error ret=%d\n", ret); return -1; } printf("sdk version: %s driver version: %s\n", version.api_version, version.drv_version); /* Get input,output attr */ rknn_input_output_num io_num; ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); if (ret < 0) { printf("rknn_init error ret=%d\n", ret); return -1; } printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); rknn_tensor_attr input_attrs[io_num.n_input]; memset(input_attrs, 0, sizeof(input_attrs)); for (int i = 0; i < io_num.n_input; i++) { input_attrs[i].index = i; ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); if (ret < 0) { printf("rknn_init error ret=%d\n", ret); return -1; } printRKNNTensor(&(input_attrs[i])); } rknn_tensor_attr output_attrs[io_num.n_output]; memset(output_attrs, 0, sizeof(output_attrs)); for (int i = 0; i < io_num.n_output; i++) { output_attrs[i].index = i; ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); printRKNNTensor(&(output_attrs[i])); } int input_channel = 3; int input_width = 0; int input_height = 0; if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { printf("model is NCHW input fmt\n"); input_width = input_attrs[0].dims[0]; input_height = input_attrs[0].dims[1]; printf("input_width=%d input_height=%d\n", input_width, 
input_height); } else { printf("model is NHWC input fmt\n"); input_width = input_attrs[0].dims[1]; input_height = input_attrs[0].dims[2]; printf("input_width=%d input_height=%d\n", input_width, input_height); } printf("model input height=%d, width=%d, channel=%d\n", input_height, input_width, input_channel); for (size_t i = 0; i < file_names.size(); i++) { cv::Mat im = cv::imread(file_names[i]); auto t1 = std::chrono::steady_clock::now(); Mat pr_img; cv::resize(im, pr_img, cv::Size(oW, oH)); cv::cvtColor(pr_img, pr_img, cv::COLOR_BGR2RGB); /* Init input tensor */ rknn_input inputs[1]; memset(inputs, 0, sizeof(inputs)); inputs[0].index = 0; inputs[0].buf = pr_img.data; inputs[0].type = RKNN_TENSOR_UINT8; inputs[0].size = input_width * input_height * input_channel; inputs[0].fmt = RKNN_TENSOR_NHWC; inputs[0].pass_through = 0; /* Init output tensor */ rknn_output outputs[io_num.n_output]; memset(outputs, 0, sizeof(outputs)); for (int i = 0; i < io_num.n_output; i++) { outputs[i].want_float = 1; } rknn_inputs_set(ctx, io_num.n_input, inputs); ret = rknn_run(ctx, NULL); if (ret < 0) { printf("ctx error ret=%d\n", ret); return -1; } ret = rknn_outputs_get(ctx, io_num.n_output, outputs, NULL); if (ret < 0) { printf("outputs error ret=%d\n", ret); return -1; } vector<vector<uint8_t>> color_map = get_color_map(); cv::Mat pred(cv::Size(oW, oH), CV_8UC3); int o_size = input_width * input_height * 4; float *prob = new float[o_size]; memcpy(prob, (float *)outputs[0].buf, o_size); int idx{0}; for (int i{0}; i < oH; ++i) { uint8_t *ptr = pred.ptr<uint8_t>(i); for (int j{0}; j < oW; ++j) { ptr[0] = color_map[prob[idx]][0]; ptr[1] = color_map[prob[idx]][1]; ptr[2] = color_map[prob[idx]][2]; ptr += 3; ++idx; } } // resize back and save cv::resize(pred, pred, im.size(), cv::INTER_CUBIC); cv::imwrite(cv::format("./out/%d.jpg", i), pred); ret = rknn_outputs_release(ctx, io_num.n_output, outputs); if (ret < 0) { printf("rknn_query fail! 
ret=%d\n", ret); goto Error; } } Error: if (ctx > 0) rknn_destroy(ctx); if (model) free(model); if (fp) fclose(fp); return 0; }
- 使用
adb
将程序拷贝到板载上调用
./bisenet_seg_npu_sample ./model/model_final_v2_city_u8.rknn ./images 1024 512
混合量化
-
如果感觉模型识别效果不是那么好, 可以尝试使用混合量化, 找到一个速度与精度兼顾的平衡点.
-
hybrid_quantization_step1.py
from rknn.api import RKNN ONNX_MODEL = './model/model_final_v2_city.onnx' RKNN_MODEL = './model/model_final_v2_city_u8.rknn' QUANTIZE_ON = True _qua_dataset = './images/city/datasets.txt' _force_builtin_perm = False if __name__ == '__main__': # Create RKNN object rknn = RKNN() # model config print('--> Config model') rknn.config(reorder_channel='0 1 2', mean_values=[[83.0535, 94.095, 82.1865]], std_values=[[53.856, 54.774, 75.786]], optimization_level=3, target_platform='rk1126', output_optimize=1, quantized_dtype='asymmetric_affine-u8', quantize_input_node= QUANTIZE_ON, batch_size=32, force_builtin_perm=False ) print('done') # Load onnx model print('--> Loading model') ret = rknn.load_onnx(model=ONNX_MODEL) if ret != 0: print('Load model failed!') exit(ret) print('done') # Hybrid quantization step1 print('--> hybrid_quantization_step1') ret = rknn.hybrid_quantization_step1(dataset=_qua_dataset) if ret != 0: print('hybrid_quantization_step1 failed!') exit(ret) print('done') print('==================================================================================================') rknn.release()
-
hybrid_quantization_step2.py
, 根据精度分析结果,在torchjitexport.quantization.cfg
中 ,将误差较大的层,换成float
或dynamic_fixed_point-i16
等精度高的量化类型.from rknn.api import RKNN ONNX_MODEL = './model/model_final_v2_city.onnx' RKNN_MODEL = './model/model_final_v2_city_u8_hyqua.rknn' QUANTIZE_ON = True _force_builtin_perm = False _qua_dataset = './images/city/datasets.txt' if __name__ == '__main__': # Create RKNN object rknn = RKNN() # Set model config print('--> config model') rknn.config(reorder_channel='0 1 2', mean_values=[[83.0535, 94.095, 82.1865]], std_values=[[53.856, 54.774, 75.786]], optimization_level=3, target_platform='rk1126', output_optimize=1, quantized_dtype='asymmetric_affine-u8', quantize_input_node= QUANTIZE_ON, batch_size=32, force_builtin_perm=False ) print('done') # Hybrid quantization step2 print('--> hybrid_quantization_step2') ret = rknn.hybrid_quantization_step2(model_input='./torchjitexport.json', data_input='./torchjitexport.data', model_quantization_cfg='./torchjitexport.quantization.cfg', dataset=_qua_dataset, pre_compile=True) if ret != 0: print('hybrid_quantization_step2 failed!') exit(ret) print('done') # Export RKNN model print('--> Export RKNN model') ret = rknn.export_rknn(RKNN_MODEL) if ret != 0: print('Export RKNN model failed!') exit(ret) print('done') rknn.release()
END
- 以上差不多就是实现推理全部过程,有不对地方欢迎大佬们指正.