遥感图像变化检测实践上手（TensorRT+UNet）

简介

分析PyTorch示例

onnx模型转engine

编写TensorRT推理代码

main.cpp测试代码

小结

简介

这里通过TensorRT+UNet，在Linux下实现对遥感图像的变化检测，示例如下：

可以先拉取代码：RemoteChangeDetection

分析PyTorch示例

在目录PyFiles中，unet.py存放UNet网络定义，可以使用test_infer.py脚本进行推理并导出onnx模型，可以简单分析一下test_infer.py中的关键代码。

（1）加载处理图像

import torch
import numpy as np
from PIL import Image
from torchvision import transforms
from unet import UNet
import onnx
import onnxsim

# File name the predicted change mask is written to
output_image_path = "result.png"

# Load the image pair: A is the "before" image, B is the "after" image
img1 = Image.open("./A/val_20.png")
img2 = Image.open("./B/val_20.png")

# Converts a PIL image / ndarray into a torch tensor
transform = transforms.Compose([transforms.ToTensor()])

# Keep only the first channel of each image. PIL reads colour images as RGB,
# so index 0 is R here (unlike OpenCV, which stores BGR).
img1_data = np.array(img1)[:, :, 0]
img2_data = np.array(img2)[:, :, 0]

# Pair the two single-channel images: shape ==> [height, width, 2]
input_image = np.stack((img1_data, img2_data), axis=2)

# Build the model input:
# 1. transform: maps [0, 255] to [0, 1] and reorders [h, w, 2] => [2, h, w]
# 2. unsqueeze(0): adds the batch dimension, [2, h, w] => [1, 2, h, w]
# 3. cast to float32 and move the tensor to the target device
input_image_tensor = (
    transform(input_image)
    .unsqueeze(0)
    .type(torch.float32)
    .to(device)
)

（2）推理并导出为onnx

def export_norm_onnx(model, file, input):
    """Export ``model`` to ONNX, validate the file, then simplify it in place.

    Args:
        model: The PyTorch model to export.
        file: Path of the ``.onnx`` file to write; it is overwritten with the
            simplified graph at the end.
        input: Example input tensor used to trace the model.

    Raises:
        AssertionError: If onnx-simplifier reports that its check failed.
    """
    export_options = {
        "model": model,
        "args": (input,),
        "f": file,
        # These tensor names are the ones the TensorRT side looks up later.
        "input_names": ["input0"],
        "output_names": ["output0"],
        "opset_version": 9,
    }
    torch.onnx.export(**export_options)
    print("Finished normal onnx export")

    # Reload what was just written and run the ONNX checker over it.
    exported = onnx.load(file)
    onnx.checker.check_model(exported)

    # Simplify the graph with onnx-simplifier and save it over the same path.
    print(f"Simplifying with onnx-simplifier {onnxsim.__version__}...")
    simplified, passed = onnxsim.simplify(exported)
    assert passed, "assert check failed"
    onnx.save(simplified, file)

这里定义了一个导出onnx函数，model为PyTorch模型，file是输出文件路径，input是模型的输入。

with torch.no_grad():
    # Build the network on the target device and switch it to eval mode.
    net = UNet(2).to(device)
    net.eval()

    # Deserialize the checkpoint once and reuse it; reading the same file a
    # second time only costs extra I/O and memory.
    load_models = torch.load(weights)
    net.load_state_dict(load_models)

    out_image = net(input_image_tensor)

    # Post-processing: take out_image[0][0], round it, then scale by 255 and
    # cast to uint8. The C++ port has to reproduce exactly these steps.
    _out_image = out_image[0][0].round().detach().cpu().numpy()
    _out_image = (_out_image * 255).astype(np.uint8)

    result_image = Image.fromarray(_out_image)
    result_image.save(output_image_path)

    # Export the same network, traced with the same input, to ONNX.
    export_norm_onnx(net, "./unet_simple.onnx", input_image_tensor)

这里是推理（为了测试.pth模型）并导出onnx。这里注意对输出图像的后处理过程，在编写c++接口时要留意。

使用onnx可视化工具查看导出的onnx模型：

onnx模型转engine

如果你已经安装了TensorRT，并且配置好了环境变量，可以直接使用bin下的trtexec命令将onnx模型进行转换，假如你的TensorRT安装路径如下：

环境变量的配置：

使用如下命令进行转换：

trtexec --onnx=dncnn_color_blind.onnx --saveEngine=dncnn_color_engine_intro.engine  --explicitBatch
// *.onnx是输入的模型，*.engine是保存的模型

上边只是举个例子，把文件名换成自己的就可以了。

编写TensorRT推理代码

（1）运行环境搭建

我的运行环境目录大致如下：

RemoteChangeDetection
3rdparty
    |------- opencv-3.4.10
                    |-------- include
                    |-------- lib
    |------- TensorRT-8.5.2.2
                    |-------- include
                    |-------- lib
    ...

首先修改CMakeLists.txt中的三方库路径：

那么你应该修改CUDA，CUDNN，OpenCV以及TensorRT的路径。

在src/路径下是核心代码，trt_logger包含了TensorRT推理时依赖的logger，以及CUDA函数运行时的检查宏：

#ifndef __LOGGER_H__
#define __LOGGER_H__

#include <string>
#include <stdarg.h>
#include <memory>
#include <cuda_runtime.h>
#include <system_error>
#include "NvInfer.h"


// CUDA_CHECK(call): evaluates `call` and hands its result to __cudaCheck
// together with the call site's file name and line number.
#define CUDA_CHECK(call)             __cudaCheck(call, __FILE__, __LINE__)
// LAST_KERNEL_CHECK(call): forwards only the call site's file and line to
// __kernelCheck. NOTE: the `call` argument is not used by the expansion.
#define LAST_KERNEL_CHECK(call)      __kernelCheck(__FILE__, __LINE__)

// Terminates the process with a diagnostic when a CUDA call did not succeed.
//   err  - status value returned by the CUDA call
//   file - source file of the call site
//   line - source line of the call site
// Returns normally only when err == cudaSuccess; otherwise prints the location,
// the error name and the error string to stdout and calls exit(1).
static void __cudaCheck(cudaError_t err, const char* file, const int line) {
    if (err == cudaSuccess)
        return;

    printf("ERROR: %s:%d, code:%s, reason:%s\n",
           file, line, cudaGetErrorName(err), cudaGetErrorString(err));
    exit(1);
}

static void __kernelCheck(const char* file, const int line) {
    cudaError_t err = cudaPeekAtLastError();
    if (err != cudaSuccess) {
        printf("ERROR: %s:%d, ", file, line);
        printf("code:%s, reason:%s\n", cudaGetErrorName(err), cudaGetErrorString(err));
        exit(1);
    }
}

// Logging shorthands: each macro forwards its arguments to
// logger::Logger::__log_info with a fixed logger::Level
// (FATAL, ERROR, WARN, INFO, VERB, DEBUG respectively).
#define LOGF(...) logger::Logger::__log_info(logger::Level::FATAL, __VA_ARGS__)
#define LOGE(...) logger::Logger::__log_info(logger::Level::ERROR, __VA_ARGS__)
#define LOGW(...) logger::Logger::__log_info(logger::Level::WARN,  __VA_ARGS__)
#define LOG(...)  logger::Logger::__log_info(logger::Level::INFO,  __VA_ARGS__)
#define LOGV(...) logger::Logger::__log_info(logger::Level::VERB,  __VA_ARGS__)
#define LOGD(...) logger::Logger::__log_info(logger::Level::DEBUG, __VA_ARGS__)

最重要的是UNetTrt部分，在UNetTrt.h：

#ifndef UNET_TRT_H_
#define UNET_TRT_H_

#include <iostream>
#include <memory>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>

// Forward declarations of the TensorRT types referenced by the UNet members
// below, so this header does not need to include NvInfer.h itself.
namespace nvinfer1
{
    class IRuntime;
    class ICudaEngine;
    class IExecutionContext;
}

// TensorRT-backed UNet wrapper for two-image change detection.
//
// Typical use: construct, call loadTrtModel() once, then call trt_infer() for
// each image pair.
//
// NOTE(review): the class holds raw host buffers and device pointers; the
// destructor is not visible here, so whether copying an instance is safe
// cannot be confirmed — prefer holding it behind a smart pointer.
class UNet
{
public:
    UNet() {}
    ~UNet();

    // Loads a serialized TensorRT engine file from model_path.
    // Returns true on success, false otherwise.
    bool loadTrtModel(const std::string model_path);

    // Runs inference on an image pair.
    //   input_mat1 - image before the change
    //   input_mat2 - image after the change
    //   output     - receives the resulting mask (written through the reference)
    // Returns true on success, false otherwise.
    bool trt_infer(cv::Mat &input_mat1, cv::Mat &input_mat2, cv::Mat &output);

private:
    // runtime_, engine_ and context_ are the core TensorRT objects used for
    // inference; they are held in smart pointers to avoid leaking them.
    std::shared_ptr<nvinfer1::IRuntime>               runtime_;
    std::shared_ptr<nvinfer1::ICudaEngine>            engine_;
    std::shared_ptr<nvinfer1::IExecutionContext>      context_;

    // Every member below has a default initializer so that an object whose
    // loadTrtModel() failed (or was never called) holds well-defined values
    // rather than indeterminate ones.
    cudaStream_t                                      stream_ = nullptr;

    int input_index_  = -1;      // binding index of the input tensor (-1: not resolved yet)
    int output_index_ = -1;      // binding index of the output tensor (-1: not resolved yet)

    const char                  *INPUT_NAME         = "input0";            // input tensor name; must match the name used at ONNX export
    const char                  *OUTPUT_NAME        = "output0";           // output tensor name; must match the name used at ONNX export
    const int                    BATCH_SIZE         = 1;                   // normally kept at 1
    void                        *buffers_[2]        = {nullptr, nullptr};  // device pointers for the TensorRT input/output bindings
    float                       *input_float_       = nullptr;             // host-side input buffer
    float                       *output_float_      = nullptr;             // host-side output buffer
};

#endif

在.cpp中，给出一些核心实现：

#include "UNetTrt.h"
#include <fstream>
#include <cmath>
#include "trt_logger.h"
#include "NvInfer.h"
#include "NvOnnxParser.h"

#define INPUT_WIDTH         1024
#define INPUT_HEIGHT        1024

bool UNet::loadTrtModel(const std::string model_path)
{
    char *trt_stream = nullptr;
    size_t size = 0;

    // load trt model
    std::ifstream file(model_path, std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trt_stream = new char[size];

        if(!trt_stream)
            return false;
        
        file.read(trt_stream, size);
        file.close();
    } else {
        return false;
    }

    logger::Logger trt_logger(logger::Level::INFO);
    runtime_.reset(nvinfer1::createInferRuntime(trt_logger));

    if(!runtime_)
        return false;

    engine_.reset(runtime_->deserializeCudaEngine(trt_stream, size, nullptr));
    if(!engine_)
        return false;

    context_.reset(engine_->createExecutionContext());
    if(!context_)
        return false;

    const nvinfer1::ICudaEngine& trtEngine = context_->getEngine();

    input_index_ = trtEngine.getBindingIndex(INPUT_NAME);
    output_index_ = trtEngine.getBindingIndex(OUTPUT_NAME);

    CUDA_CHECK(cudaMalloc(&buffers_[input_index_], BATCH_SIZE * 2 * INPUT_WIDTH * INPUT_HEIGHT * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers_[output_index_], BATCH_SIZE * 1 * INPUT_WIDTH * INPUT_HEIGHT * sizeof(float)));

    input_float_ = new float[BATCH_SIZE * 2 * INPUT_WIDTH * INPUT_HEIGHT];
    output_float_ = new float[BATCH_SIZE * 1 * INPUT_WIDTH * INPUT_HEIGHT];

    delete []trt_stream;
    return true;
}

首先，输入大小是固定的，所以在宏里写死了输入大小1024x1024；loadTrtModel根据路径加载engine文件，并对一些推理时用到的成员变量依次初始化，同时分配好输入输出空间。

推理代码如下：

// Runs change detection on one image pair.
//
//   input_mat1 - image before the change; must be 8-bit with at least 3 channels
//   input_mat2 - image after the change; same size and requirements as input_mat1
//   output     - receives an 8-bit single-channel mask with the same width and
//                height as the inputs
//
// Returns false when the model has not been loaded, when either input is
// empty, when the inputs differ in size, when either is not 8-bit or has fewer
// than 3 channels, or when TensorRT fails to enqueue the inference. Returns
// true otherwise. A failing CUDA call terminates the process through CUDA_CHECK.
//
// The inputs are not modified: resizing is done on local copies, so the
// function can be called repeatedly with the same Mats and the output keeps
// the original image size every time.
bool UNet::trt_infer(cv::Mat &input_mat1, cv::Mat &input_mat2, cv::Mat &output)
{
    // The model must have been loaded successfully first.
    if(!context_ || !input_float_ || !output_float_)
        return false;

    if(input_mat1.empty() || input_mat2.empty())
        return false;

    if(input_mat1.rows != input_mat2.rows || input_mat1.cols != input_mat2.cols)
        return false;

    // Channel index 2 is read as raw bytes below, so BOTH images need at
    // least three channels of 8-bit data.
    if(input_mat1.channels() < 3 || input_mat2.channels() < 3)
        return false;

    if(input_mat1.depth() != CV_8U || input_mat2.depth() != CV_8U)
        return false;

    const int pre_width = input_mat1.cols;
    const int pre_height = input_mat1.rows;

    // Resize into locals. The interpolation flag is the 6th argument of
    // cv::resize; the 4th and 5th are the fx/fy scale factors.
    const cv::Size net_size(INPUT_WIDTH, INPUT_HEIGHT);
    cv::Mat resized1, resized2;
    cv::resize(input_mat1, resized1, net_size, 0, 0, cv::INTER_CUBIC);
    cv::resize(input_mat2, resized2, net_size, 0, 0, cv::INTER_CUBIC);

    std::vector<cv::Mat> input_mat1_channels;
    cv::split(resized1, input_mat1_channels);

    std::vector<cv::Mat> input_mat2_channels;
    cv::split(resized2, input_mat2_channels);

    // [H, W, C] => [C, H, W] and [0, 255] => [0.0, 1.0].
    // Plane 0 of the input is channel 2 of the first image, plane 1 is
    // channel 2 of the second image. The planes produced by cv::split are
    // read as contiguous row-major buffers, so one linear pass is enough.
    const int plane_size = INPUT_WIDTH * INPUT_HEIGHT;
    const unsigned char *plane1 = input_mat1_channels[2].data;
    const unsigned char *plane2 = input_mat2_channels[2].data;
    for(int idx = 0; idx < plane_size; idx++) {
        input_float_[idx] = (float)plane1[idx] / 255.0f;
        input_float_[idx + plane_size] = (float)plane2[idx] / 255.0f;
    }

    // Clear the whole host output buffer (the size is in bytes).
    memset(output_float_, 0, BATCH_SIZE * 1 * INPUT_WIDTH * INPUT_HEIGHT * sizeof(float));

    // NOTE(review): a new stream is created on every call and the previous
    // handle in stream_ is overwritten without being destroyed here. Creating
    // the stream once at load time would avoid that, but it needs a matching
    // change in the constructor/destructor, which are not visible here.
    CUDA_CHECK(cudaStreamCreate(&stream_));
    CUDA_CHECK(cudaMemcpyAsync(buffers_[input_index_], input_float_,
                BATCH_SIZE * 2 * INPUT_WIDTH * INPUT_HEIGHT * sizeof(float), cudaMemcpyHostToDevice, stream_));

    if(!context_->enqueueV2(buffers_, stream_, nullptr))
        return false;

    CUDA_CHECK(cudaMemcpyAsync(output_float_, buffers_[output_index_],
                BATCH_SIZE * 1 * INPUT_WIDTH * INPUT_HEIGHT * sizeof(float), cudaMemcpyDeviceToHost, stream_));
    CUDA_CHECK(cudaStreamSynchronize(stream_));

    // Round each output value, mirroring the `.round()` step of the Python
    // post-processing.
    for(int idx = 0; idx < plane_size; idx++) {
        output_float_[idx] = std::round(output_float_[idx]);
    }

    // Wrap the host buffer without copying, scale by 255 while converting to
    // 8-bit, then resize the mask back to the original image size.
    const cv::Mat mask_f32(INPUT_HEIGHT, INPUT_WIDTH, CV_32F, output_float_);
    cv::Mat mask_u8;
    mask_f32.convertTo(mask_u8, CV_8U, 255.0);
    cv::resize(mask_u8, output, cv::Size(pre_width, pre_height), 0, 0, cv::INTER_CUBIC);
    return true;
}

这里依次讲解一下，首先你可能要把代码放入工程，那么应该尽量做好判断，比如图像是否为空；图像大小、通道是否一致，以防万一可以同时进行Resize；

cv::split对3通道图像进行剥离，放入vector中，然后开始进行通道转换与归一化。这里可以稍微理解一下不同图像在内存中的存放方式，一般的RGB图像或者BGR图像（height, width, channel）应该是这样：

B G R B G R B G R B G R B G R B G R
B G R B G R B G R B G R B G R B G R
B G R B G R B G R B G R B G R B G R
B G R B G R B G R B G R B G R B G R

互相交错存放，但是网络输入一般是（channel, height, width），那么存放方式是如下这样：

R R R R R R R R
R R R R R R R R
R R R R R R R R

G G G G G G G G
G G G G G G G G
G G G G G G G G

B B B B B B B B
B B B B B B B B
B B B B B B B B

那么就可以很容易写出通道转换与归一化代码：

// [H, W, C] => [C, H, W] && [0.0, 0.1]
    for(int i = 0; i < INPUT_WIDTH; i++) {
        for(int j = 0; j < INPUT_HEIGHT; j++) {
            int idx_c1 = j * INPUT_WIDTH + i;
            int idx_c2 = idx_c1 + INPUT_WIDTH * INPUT_HEIGHT;
            input_float_[idx_c1] = (float)input_mat1_channels[2].data[idx_c1] / 255.0f;
            input_float_[idx_c2] = (float)input_mat2_channels[2].data[idx_c1] / 255.0f;
        }
    }

每次推理前把输出结果清空置为0：

// memset takes a size in bytes, so the element count is multiplied by sizeof(float).
memset(output_float_, 0, BATCH_SIZE * 1 * INPUT_WIDTH * INPUT_HEIGHT * sizeof(float));

创建cudaStream_t，cudaMemcpyAsync将输入数据从Host端拷贝到显存（显存已在loadTrtModel中通过cudaMalloc分配好），context_->enqueueV2推理，cudaMemcpyAsync再将结果从显存拷贝到Host端。

// Create the stream that the copies and the inference are queued on.
CUDA_CHECK(cudaStreamCreate(&stream_));

// Host -> device: upload the 2-plane float input.
CUDA_CHECK(cudaMemcpyAsync(buffers_[input_index_], input_float_,
                BATCH_SIZE * 2 * INPUT_WIDTH * INPUT_HEIGHT * sizeof(float), cudaMemcpyHostToDevice, stream_));

// Queue the inference on the same stream.
context_->enqueueV2(buffers_, stream_, nullptr);

// Device -> host: download the 1-plane float output.
CUDA_CHECK(cudaMemcpyAsync(output_float_, buffers_[output_index_],
                BATCH_SIZE * 1 * INPUT_WIDTH * INPUT_HEIGHT * sizeof(float), cudaMemcpyDeviceToHost, stream_));

// Wait for everything queued on the stream; the status is checked like every
// other CUDA call here.
CUDA_CHECK(cudaStreamSynchronize(stream_));

后处理过程中，也遇到了一些坑，总体而言，还是要一一对照python那部分后处理代码仔细分析：

# Take element [0][0] of the network output, round it, and bring it to the
# host as a NumPy array.
_out_image = out_image[0][0].round().detach().cpu().numpy()
# Scale by 255 and cast to uint8.
_out_image = (_out_image * 255).astype(np.uint8)
# Wrap the array in a PIL image and save it to output_image_path.
result_image = Image.fromarray(_out_image)
result_image.save(output_image_path)

    // Round each output value, mirroring `.round()` in the Python version.
    for(int i = 0; i < INPUT_WIDTH; i++) {
        for(int j = 0; j < INPUT_HEIGHT; j++) {
            int index = j * INPUT_WIDTH + i;
            output_float_[index] = std::round(output_float_[index]);
        }
    }

    // Wrap the host buffer as a float image, scale by 255 and convert to 8-bit.
    output = cv::Mat(INPUT_HEIGHT, INPUT_WIDTH, CV_32F, output_float_);
    output *= 255.0;
    output.convertTo(output, CV_8U);
    // Resize back to the original size. The interpolation flag is the 6th
    // argument of cv::resize (the 4th and 5th are fx/fy), so it is passed there.
    cv::resize(output, output, cv::Size(pre_width, pre_height), 0, 0, cv::INTER_CUBIC);

因为图像是缩放过一次的，最后给缩放回去。

main.cpp测试代码

在main.cpp编写测试示例，一般是建议将类用智能指针管理：

// Own the detector through a shared_ptr.
std::shared_ptr<UNet> unet_infer = std::make_shared<UNet>();

// Serialized TensorRT engine to load.
std::string model_path = "./weights/unet_simple_trt.engine";

// Load the engine and report the outcome.
if(unet_infer) {
    const bool loaded = unet_infer->loadTrtModel(model_path);
    std::cout << (loaded ? "UNet Init Successful! \n" : "UNet Init Failed! \n");
}

推理：

cv::Mat img1 = cv::imread("./test_images/val_20_A.png");
cv::Mat img2 = cv::imread("./test_images/val_20_B.png");
cv::Mat result;

if(unet_infer->trt_infer(img1, img2, result)) {
    std::cout << "UNet Infer Successfully! \n";
} else {
    std::cout << "UNet Infer Failed! \n";
}

当然，最后可以测试一下推理速度以及输出是不是一致：

// Time `count` inferences and print the mean latency.
int count = 100;
double cost = 0.0;      // accumulated wall time in milliseconds, fractional part kept
int failures = 0;       // number of runs for which trt_infer returned false

for(int i = 0; i < count; i++) {
    // steady_clock is monotonic, which is what interval measurement needs.
    const auto start = std::chrono::steady_clock::now();
    const bool success = unet_infer->trt_infer(img1, img2, result);
    const auto end = std::chrono::steady_clock::now();

    // Accumulate as floating-point milliseconds; truncating every iteration
    // to whole milliseconds would drop up to 1 ms per run from the average.
    cost += std::chrono::duration<double, std::milli>(end - start).count();

    if(!success)
        failures++;
}

std::cout << "duration: " << cost / count << " ms" << std::endl;

if(failures > 0) {
    std::cout << "failed runs: " << failures << std::endl;
}

if(!result.empty()) {
    cv::imwrite("./result.png", result);
}