【CUDA编程】动态体素化实现

动态体素化实现

动态体素化DV克服了硬体素化HV的一些缺点。动态体素化DV保留了分组grouping阶段，相反，它没有采样固定的点数或体素容量，它保留了点和体素之间的完全映射。因此，体素数和每个体素中的点数都是动态的，依赖于具体的映射函数。这消除了对固定大小缓冲区的需求，也消除了对随机点和体素的丢弃过程。

因为所有原始点和体素信息都被保留了，动态体素化DV没有引入信息丢失，并产生了确定的体素嵌入，使得检测结果更稳定。

另外，动态体素化建立了每一个点和体素对之间的双向关系，为从不同视图融合点级上下文特征提供了自然基础。

动态体素化DV动态高效分配资源来管理所有体素和点。

简单的动态体素特征编码，也是简单地对体素中所有点的特征向量进行平均值运算作为整个体素的特征，只不过每个体素中的点个数是动态的，每个体素都不同。

1，核心函数

torch::Tensor generate_voxel(torch::Tensor &points,
                             const std::vector<float> &voxel_size,
                             const std::vector<float> &coors_range,
                             const int NDim){
    torch::Tensor coors = points.new_zeros({points.size(0), 3},
                                           torch::TensorOptions().dtype(torch::kInt).device(torch::kCUDA, 0));
    dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, 3);
    return coors;
}
std::vector<at::Tensor> DynamicSimpleVFE(torch::Tensor &voxels,
                                         torch::Tensor &coors){
    assert(coors.size(1) == 3 && voxels.size(0) == coors.size(0));
    std::vector<at::Tensor> res = dynamic_point_to_voxel_forward_gpu(voxels, coors, MEAN);
    assert(res.size() == 2);
    return res;
}

2.动态体素化 dynamic_voxelize_gpu


void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
                          const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const int NDim) {
    // check device
    CHECK_INPUT(points);

    at::cuda::CUDAGuard device_guard(points.device());

    const int num_points = points.size(0);
    const int num_features = points.size(1);

    const float voxel_x = voxel_size[0];
    const float voxel_y = voxel_size[1];
    const float voxel_z = voxel_size[2];
    const float coors_x_min = coors_range[0];
    const float coors_y_min = coors_range[1];
    const float coors_z_min = coors_range[2];
    const float coors_x_max = coors_range[3];
    const float coors_y_max = coors_range[4];
    const float coors_z_max = coors_range[5];

    const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
    const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
    const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

//    const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
    const int col_blocks = ATenCeilDiv(num_points, threadsPerBlock);
    dim3 blocks(col_blocks);
    dim3 threads(threadsPerBlock);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

//    dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
//            points.contiguous().data_ptr<scalar_t>(),
//            coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
//            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
//            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
    AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
        dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
                points.contiguous().data_ptr<scalar_t>(),
                coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
                coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
                coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
    });
    cudaDeviceSynchronize();
    AT_CUDA_CHECK(cudaGetLastError());

    return;
}

以上代码实现了获取了cuda streams 然后调用kernel函数，实现如下过程比较简单，其实就是一个将原始点的坐标转换为体素格子的坐标。voxel size为0.1时，原来范围是-100，100精确到米，现在就是-1000，1000精确到分米。

__global__ void dynamic_voxelize_kernel(
        const T* points, T_int* coors, const float voxel_x, const float voxel_y,
        const float voxel_z, const float coors_x_min, const float coors_y_min,
        const float coors_z_min, const float coors_x_max, const float coors_y_max,
        const float coors_z_max, const int grid_x, const int grid_y,
        const int grid_z, const int num_points, const int num_features,
        const int NDim) {
    //   const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
   // CUDA_1D_KERNEL_LOOP(index, num_points) 
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_points; 
       index += blockDim.x * gridDim.x)

     {
        // To save some computation
        auto points_offset = points + index * num_features;
        auto coors_offset = coors + index * NDim;
        int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
        if (c_x < 0 || c_x >= grid_x) {
            coors_offset[0] = -1;
            return;
        }

        int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
        if (c_y < 0 || c_y >= grid_y) {
            coors_offset[0] = -1;
            coors_offset[1] = -1;
            return;
        }

        int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
        if (c_z < 0 || c_z >= grid_z) {
            coors_offset[0] = -1;
            coors_offset[1] = -1;
            coors_offset[2] = -1;
        } else {
            coors_offset[0] = c_z;
            coors_offset[1] = c_y;
            coors_offset[2] = c_x;
        }
    }
}

3.动态体素特征提取 dynamic_point_to_voxel_forward_gpu

coors.masked_fill(coors.lt(0).any(-1, true), -1);
.lt 小于0的返回true .any表示有true就赋值为ture torch.any(input, dim, keepdim=False, *, out=None) → Tensor
masked_fill作用：在tensor对应mask对应的位置填充value，其中mask的shape必须是可以广播到该张量的。
因此此句含义为将坐标值（x，y,z）中有一个小于0的点都赋值为（-1,-1,-1）

std::tie(out_coors, coors_map, reduce_count) =at::unique_dim(coors_clean, 0, true, true, true);
unique_dim这里返回了体素对应的坐标，以及每个体素的点数

std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
        const at::Tensor &feats, const at::Tensor &coors,
        const reduce_t reduce_type) {
    CHECK_INPUT(feats);
    CHECK_INPUT(coors);

    const int num_input = feats.size(0);
    const int num_feats = feats.size(1);

    if (num_input == 0)
        return {feats.clone().detach(),
                coors.clone().detach(),
                coors.new_empty({0}, torch::kInt32),
                coors.new_empty({0}, torch::kInt32)};

    at::Tensor out_coors;
    at::Tensor coors_map;
    at::Tensor reduce_count;

    auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);  
    //.lt 小于0的返回true  .any表示有true就赋值为ture  torch.any(input, dim, keepdim=False, *, out=None) → Tensor
    //masked_fill作用：在tensor对应mask对应的位置填充value，其中mask的shape必须是可以广播到该张量的。
    //因此此句含义为将坐标值（x，y,z）中有一个小于0的点都赋值为（-1,-1,-1）

    std::tie(out_coors, coors_map, reduce_count) =
            at::unique_dim(coors_clean, 0, true, true, true);
    //这里返回了体素对应的坐标，以及每个体素的点数
    

    if (out_coors.index({0, 0}).lt(0).item<bool>()) {
        // the first element of out_coors (-1,-1,-1) and should be removed
        out_coors = out_coors.slice(0, 1);
        reduce_count = reduce_count.slice(0, 1);
        coors_map = coors_map - 1;
    }

    coors_map = coors_map.to(torch::kInt32);
    reduce_count = reduce_count.to(torch::kInt32);

    auto reduced_feats =
            at::empty({out_coors.size(0), num_feats}, feats.options());

    AT_DISPATCH_FLOATING_TYPES(
            feats.scalar_type(), "feats_reduce_kernel", ([&] {
        if (reduce_type == reduce_t::MAX)
            reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
        else
            reduced_feats.fill_(static_cast<scalar_t>(0));

        dim3 blocks(std::min(ATenCeilDiv(num_input, threadsPerBlock),
                             maxGridDim));
        dim3 threads(threadsPerBlock);
        feats_reduce_kernel<<<blocks, threads>>>(
                feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
                reduced_feats.data_ptr<scalar_t>(), num_input, num_feats, reduce_type);
        if (reduce_type == reduce_t::MEAN)
            reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
    }));
    AT_CUDA_CHECK(cudaGetLastError());

    return {reduced_feats, out_coors};
}

4.如何编译

4.1 使用find_package

如果CMake的版本小于3.10，可以在CMakeLists.txt文件中使用find_package来导入CUDA包，然后就可以使用cuda_add_executable()或者cuda_add_library()来编译CUDA可执行文件或者库文件了。

cmake_minimum_required(VERSION 3.8)
project(CUDA_TEST)
 
find_package(CUDA REQUIRED)
 
message(STATUS "cuda version: " ${CUDA_VERSION_STRING})
include_directories(${CUDA_INCLUDE_DIRS})
 
cuda_add_executable(cuda_test cuda_test.cu)
target_link_libraries(cuda_test ${CUDA_LIBRARIES})

其中变量CUDA_VERSION_STRING表示CUDA的版本号，CUDA_INCLUDE_DIRS表示CUDA头文件存放的目录，CUDA_LIBRARIES表示CUDA的库文件。更多说明可以参考CMake的官方文档：

https://cmake.org/cmake/help/latest/module/FindCUDA.html

CMakeLists.txt写好后，执行下面的命令就可以编译出可执行文件：

mkdir build && cd build

cmake ..

make

4.2 添加CUDA编程语言支持

在3.10及以上版本的CMake中，find_package的方式已经被弃用（可以用但不推荐），要编译CUDA代码可以CMakeLists.txt文件中添加对CUDA编程语言的支持。如果程序中CUDA代码是可选的，那么可以在CMakeLists.txt文件中使用下面的语句进行使能：

enable_language(CUDA)

如果CUDA代码是必须的，那么就需要像下面这样进行设置，表示在项目CUDA_TEST中要用到CUDA和C++两种编程语言：

project(CUDA_TEST LANGUAGES CUDA CXX)

可以通过CheckLanuage判断CUDA是否可用

include(CheckLanguage)

check_language(CUDA)

然后就可以跟编译普通C++代码一样用add_executable编译可执行文件了：

cmake_minimum_required(VERSION 3.10)
project(CUDA_TEST LANGUAGES CUDA CXX)
 
include(CheckLanguage)
check_language(CUDA)
 
add_executable(cuda_test cuda_test.cu)