初步认识torch自定义算子

此篇为PyTorch 自定义算子：复现CPU和CUDA版的二维卷积的代码详解
这篇是为了展示setup在构建简单的cpp算子的使用

1.环境配置

整体结构如下图所示
在这里插入图片描述

pytorch_cpp_helper.hpp中准备了CPU版卷积需要的头文件
pytorch_cuda_helper.hpp和common_cuda_helper.hpp是cuda的头文件
my_conv.cpp中是对卷积的实现
my_conv.py是使用卷积搭建的网络
setup.py对cpp文件编译使用
test.py是对整体的测试

2.pytorch_cpp_helper.hpp

#ifndef PYTORCH_CPP_HELPER
#define PYTORCH_CPP_HELPER
// torch/extension.h 提供了如下的几个重要功能：
// Tensor 操作：可以创建、读取、修改和销毁张量。
// 错误处理：通过 TORCH_CHECK 和 TORCH_INTERNAL_ASSERT 等宏来检查条件并抛出异常。
// 自动梯度计算：通过 autograd::Function 和 autograd::Variable 类来实现自定义的自动微分操作。
// CUDA 支持：如果扩展需要使用 GPU 加速，可以通过 THC 和 ATen/CUDA API 来实现。
#include <torch/extension.h>

#include <vector>

using namespace at;

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) \
  TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) \
  CHECK_CUDA(x);            \
  CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) \
  CHECK_CPU(x);            \
  CHECK_CONTIGUOUS(x)

#endif  // PYTORCH_CPP_HELPER

3. pytorch_cuda_helper.hpp

#ifndef PYTORCH_CUDA_HELPER
#define PYTORCH_CUDA_HELPER

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <THC/THCAtomics.cuh>

#include "common_cuda_helper.hpp"

using at::Half;
using at::Tensor;
using phalf = at::Half;

#define __PHALF(x) (x)

#endif  // PYTORCH_CUDA_HELPER

4. common_cuda_helper.hpp

#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER

#include <cuda.h>

#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
       i += blockDim.x * gridDim.x)                                 \
    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
         j += blockDim.y * gridDim.y)

#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)

#define THREADS_PER_BLOCK 512

inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
  int optimal_block_num = (N + num_threads - 1) / num_threads;
  int max_block_num = 4096;
  return min(optimal_block_num, max_block_num);
}

template <typename T>
__device__ T bilinear_interpolate(const T* input, const int height,
                                  const int width, T y, T x,
                                  const int index /* index for debug only*/) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;
  // do bilinear interpolation
  T v1 = input[y_low * width + x_low];
  T v2 = input[y_low * width + x_high];
  T v3 = input[y_high * width + x_low];
  T v4 = input[y_high * width + x_high];
  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

  return val;
}

template <typename T>
__device__ void bilinear_interpolate_gradient(
    const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
    int& x_low, int& x_high, int& y_low, int& y_high,
    const int index /* index for debug only*/) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;

  // reference in forward
  // T v1 = input[y_low * width + x_low];
  // T v2 = input[y_low * width + x_high];
  // T v3 = input[y_high * width + x_low];
  // T v4 = input[y_high * width + x_high];
  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  return;
}
#endif  // COMMON_CUDA_HELPER

5.my_conv.cpp

#include "pytorch_cpp_helper.hpp"

void my_conv_im2col_cpu(Tensor data_im,
                        const int channels, const int height,
                        const int width, const int ksize_h,
                        const int ksize_w, const int pad_h, const int pad_w,
                        const int stride_h, const int stride_w,
                        const int dilation_h, const int dilation_w,
                        const int parallel_imgs, Tensor data_col);
// cuda实现
void my_conv_im2col_cuda(Tensor data_im,
                        const int channels, const int height,
                        const int width, const int ksize_h,
                        const int ksize_w, const int pad_h, const int pad_w,
                        const int stride_h, const int stride_w,
                        const int dilation_h, const int dilation_w,
                        const int parallel_imgs, Tensor data_col);
// shape检查
void my_conv_shape_check(at::Tensor input,
                         at::Tensor weight, int kH,
                         int kW, int dH, int dW, int padH, int padW,
                         int dilationH, int dilationW, int group)
{
    TORCH_CHECK(
        weight.ndimension() == 4,
        "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s",
        weight.ndimension());

    TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");

    TORCH_CHECK(kW > 0 && kH > 0,
                "kernel size should be greater than zero, but got kH: %d kW: %d",
                kH, kW);

    TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
                "kernel size should be consistent with weight, ",
                "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d",
                kH, kW, weight.size(2), weight.size(3));

    TORCH_CHECK(dW > 0 && dH > 0,
                "stride should be greater than zero, but got dH: %d dW: %d", dH,
                dW);

    TORCH_CHECK(
        dilationW > 0 && dilationH > 0,
        "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
        dilationH, dilationW);

    int ndim = input.ndimension();
    int dimf = 0;
    int dimh = 1;
    int dimw = 2;

    if (ndim == 4)
    {
        dimf++;
        dimh++;
        dimw++;
    }

    TORCH_CHECK(ndim == 3 || ndim == 4,
                "3D or 4D input tensor expected but got: %s", ndim);

    long nInputPlane = weight.size(1) * group;
    long inputHeight = input.size(dimh);
    long inputWidth = input.size(dimw);
    long nOutputPlane = weight.size(0);
    long outputHeight =
        (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
    long outputWidth =
        (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;

    if (outputWidth < 1 || outputHeight < 1)
        AT_ERROR(
            "Given input size: (%ld x %ld x %ld). "
            "Calculated output size: (%ld x %ld x %ld). Output size is too small",
            nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
            outputWidth);

    TORCH_CHECK(input.size(1) == nInputPlane,
                "invalid number of input planes, expected: %d, but got: %d",
                nInputPlane, input.size(1));

    TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
                "input image is smaller than kernel");
}
// my_conv_forward就是卷积的主函数。它的参数除了PyTorch的Conv2d传入的参数外，
// 还多了两个参数output, columus。这两个张量是保存中间结果的，在PyTorch侧是看不到的。
// output用于保存卷积输出，columns用于保存卷积时的列矩阵。
// 底层实现卷积时，会先把图像转换成一个用列表示的矩阵，
// 再把卷积操作当成一个矩阵乘法来完成。其中，第一步操作叫做"im2col"。
// 自定义卷积前向传播函数
void my_conv_forward(Tensor input, Tensor weight, Tensor bias,
                     Tensor output, Tensor columns, int kW,
                     int kH, int dW, int dH, int padW, int padH,
                     int dilationW, int dilationH, int group,
                     int im2col_step)
{
    // 检查输入是否为CUDA设备
    bool isCuda = false;
    if (input.device().is_cuda())
    {
        CHECK_CUDA_INPUT(input);
        CHECK_CUDA_INPUT(weight);
        CHECK_CUDA_INPUT(bias);
        CHECK_CUDA_INPUT(output);
        CHECK_CUDA_INPUT(columns);
        isCuda = true;
    }
    else
    {
        CHECK_CPU_INPUT(input);
        CHECK_CPU_INPUT(weight);
        CHECK_CPU_INPUT(bias);
        CHECK_CPU_INPUT(output);
        CHECK_CPU_INPUT(columns);
    }
    // 检查shape
    my_conv_shape_check(input, weight, kH, kW, dH, dW, padH,
                        padW, dilationH, dilationW, group);
    // 管理PyTorch中的设备上下文
    at::DeviceGuard guard(input.device());

    int batch = 1;
    // CHW->NCHW
    if (input.ndimension() == 3)
    {
        // 强制批量处理
        batch = 0;
        input.unsqueeze_(0);
    }

    long batchSize = input.size(0);//N
    long nInputPlane = input.size(1);//C
    long inputHeight = input.size(2);//H
    long inputWidth = input.size(3);//W

    long nOutputPlane = weight.size(0);

    // 计算输出尺寸
    // O=(W−K+2P)/S+1
    // inputWidth: 输入图像的宽度。
    // inputHeight: 输入图像的高度。
    // kW: 卷积核的宽度。
    // kH: 卷积核的高度。
    // dW: 在宽度方向上的步长。
    // dH: 在高度方向上的步长。
    // padW: 在宽度方向上的填充大小。
    // padH: 在高度方向上的填充大小。
    // dilationW: 在宽度方向上的膨胀率。
    // dilationH: 在高度方向上的膨胀率。
    long outputWidth =
        (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
    long outputHeight =
        (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

    // 调整输出和列的形状
    output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
                          outputHeight, outputWidth});
    columns = at::zeros(
        {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
        input.options());

    input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
                        inputHeight, inputWidth});

    // 准备输出缓冲区
    Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane,
                                      im2col_step * outputHeight, outputWidth},
                                     output.options());

    output_buffer = output_buffer.view(
        {output_buffer.size(0), group, output_buffer.size(1) / group,
         output_buffer.size(2), output_buffer.size(3)});

    // 主循环
    for (int elt = 0; elt < batchSize / im2col_step; elt++)
    {
        // 根据设备类型执行im2col
        if (isCuda)
        {
            my_conv_im2col_cuda(input[elt], nInputPlane, inputHeight,
                            inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                            dilationW, im2col_step, columns);
        }
        else
        {
            my_conv_im2col_cpu(input[elt], nInputPlane, inputHeight,
                            inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                            dilationW, im2col_step, columns);
        }
        
        // 调整列和权重的形状
        columns = columns.view({group, columns.size(0) / group, columns.size(1)});
        weight = weight.view({group, weight.size(0) / group, weight.size(1),
                              weight.size(2), weight.size(3)});

        // 分组卷积
        for (int g = 0; g < group; g++)
        {
            output_buffer[elt][g] = output_buffer[elt][g]
                                        .flatten(1)
                                        .addmm_(weight[g].flatten(1), columns[g])
                                        .view_as(output_buffer[elt][g]);
        }
        // 恢复形状
        columns =
            columns.view({columns.size(0) * columns.size(1), columns.size(2)});
        weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                              weight.size(3), weight.size(4)});
    }

    // 最终调整输出缓冲区形状
    output_buffer = output_buffer.view(
        {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
         output_buffer.size(3), output_buffer.size(4)});

    output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
                                        im2col_step, outputHeight, outputWidth});
    output_buffer.transpose_(1, 2);

    // 复制并调整输出
    output.copy_(output_buffer);
    output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});

    // 加上偏置
    bias = bias.view({1, bias.size(0), 1, 1});
    output.add_(bias);

    // 恢复输入形状
    input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});

    // 特殊情况处理
    if (batch == 0)
    {
        output = output.view({nOutputPlane, outputHeight, outputWidth});
        input = input.view({nInputPlane, inputHeight, inputWidth});
    }
}
template <typename T>
/**
 * 二维卷积的im2col操作的CPU内核函数。
 * 该函数将图像数据按照卷积的参数展开成列，以便进行高效的矩阵乘法操作。
 * 
 * @param n 输出矩阵的元素数量。
 * @param data_im 输入图像数据指针。
 * @param height 输入图像的高度。
 * @param width 输入图像的宽度。
 * @param kernel_h 卷积核的高度。
 * @param kernel_w 卷积核的宽度。
 * @param pad_h 垂直方向上的填充大小。
 * @param pad_w 水平方向上的填充大小。
 * @param stride_h 垂直方向上的步长。
 * @param stride_w 水平方向上的步长。
 * @param dilation_h 垂直方向上的扩张大小。
 * @param dilation_w 水平方向上的扩张大小。
 * @param batch_size 批处理大小。
 * @param num_channels 输入图像的通道数。
 * @param height_col 输出矩阵的高度。
 * @param width_col 输出矩阵的宽度。
 * @param data_col 输出数据指针，存储展开后的列数据。
 */
void my_conv_im2col_cpu_kernel(
    const int n, const T *data_im, const int height,
    const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int batch_size,
    const int num_channels, const int height_col,
    const int width_col, T *data_col)
{
    // 遍历输出矩阵的每个元素
    for (int index = 0; index < n; index++)
    {
        // 计算当前输出元素在宽度和高度上的位置，以及对应的批处理和通道索引
        const int w_col = index % width_col;
        const int h_col = (index / width_col) % height_col;
        const int b_col = (index / width_col / height_col) % batch_size;//batch
        const int c_im = (index / width_col / height_col) / batch_size;//channel,特征图
        const int c_col = c_im * kernel_h * kernel_w;//

        // 计算当前输出元素在输入图像上的起始位置
        const int h_in = h_col * stride_h - pad_h;
        const int w_in = w_col * stride_w - pad_w;

        // 计算当前输出元素在输出数据中的指针位置
        T *data_col_ptr =
            data_col +
            ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;

        // 计算当前输出元素在输入图像数据中的指针位置
        const T *data_im_ptr =
            data_im + (b_col * num_channels + c_im) * height * width;

        // 遍历卷积核的每个元素
        for (int i = 0; i < kernel_h; ++i)
        {
            for (int j = 0; j < kernel_w; ++j)
            {
                // 初始化当前卷积核元素的值
                T val = static_cast<T>(0);

                // 计算当前卷积核元素在输入图像上的位置
                const int h_im = h_in + i * dilation_h;
                const int w_im = w_in + j * dilation_w;

                // 如果当前位置在输入图像的有效范围内，则取出对应的像素值
                if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
                {
                    val = data_im_ptr[h_im * width + w_im];
                }

                // 将当前像素值写入到输出数据中，并更新指针位置
                *data_col_ptr = val;
                data_col_ptr += batch_size * height_col * width_col;
            }
        }
    }
}

void my_conv_im2col_cpu(Tensor data_im,
                        const int channels, const int height,
                        const int width, const int ksize_h,
                        const int ksize_w, const int pad_h, const int pad_w,
                        const int stride_h, const int stride_w,
                        const int dilation_h, const int dilation_w,
                        const int parallel_imgs, Tensor data_col)
{
    int height_col =
        (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
    int width_col =
        (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
    int num_kernels = channels * height_col * width_col * parallel_imgs;

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        data_im.scalar_type(), "", [&]
        { my_conv_im2col_cpu_kernel<scalar_t>(
              num_kernels, data_im.data_ptr<scalar_t>(),
              height, width, ksize_h, ksize_w,
              pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
              parallel_imgs, channels,
              height_col, width_col, data_col.data_ptr<scalar_t>()); });
}
// my_ops是生成的库名
// 代码块里m.def用于定义C++函数的Python接口。"my_conv_forward"是Python调用时的函数名称，
// my_conv_forward是被Python代码调用的这份代码里的C++函数名称。
// 也就是说，这份卷积实现的入口函数就是my_conv_forward。
PYBIND11_MODULE(my_ops, m)
{
      m.def("my_conv_forward", my_conv_forward, "my_conv_forward",
            py::arg("input"), py::arg("weight"), py::arg("bias"),
            py::arg("output"), py::arg("columns"), py::arg("kW"),
            py::arg("kH"), py::arg("dW"), py::arg("dH"), py::arg("padW"),
            py::arg("padH"), py::arg("dilationW"), py::arg("dilationH"),
            py::arg("group"), py::arg("im2col_step"));
}

6.my_conv_cuda.cu

// Modify from https://github.com/open-mmlab/mmcv/blob/my_conv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
// Copyright (c) OpenMMLab. All rights reserved.
#include <torch/types.h>

#include "pytorch_cuda_helper.hpp"

template <typename T>
__global__ void my_conv_im2col_gpu_kernel(
    const int n, const T *data_im, const int height,
    const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int batch_size,
    const int num_channels, const int height_col,
    const int width_col, T *data_col)
{
    CUDA_1D_KERNEL_LOOP(index, n)
    {
        // index index of output matrix
        const int w_col = index % width_col;
        const int h_col = (index / width_col) % height_col;
        const int b_col = (index / width_col / height_col) % batch_size;
        const int c_im = (index / width_col / height_col) / batch_size;
        const int c_col = c_im * kernel_h * kernel_w;

        const int h_in = h_col * stride_h - pad_h;
        const int w_in = w_col * stride_w - pad_w;
        T *data_col_ptr =
            data_col +
            ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
        const T *data_im_ptr =
            data_im + (b_col * num_channels + c_im) * height * width;

        for (int i = 0; i < kernel_h; ++i)
        {
            for (int j = 0; j < kernel_w; ++j)
            {
                T val = static_cast<T>(0);
                const int h_im = h_in + i * dilation_h;
                const int w_im = w_in + j * dilation_w;
                if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
                {
                    val = data_im_ptr[h_im * width + w_im];
                }
                *data_col_ptr = val;
                data_col_ptr += batch_size * height_col * width_col;
            }
        }
    }
}

void my_conv_im2col_cuda(Tensor data_im,
                         const int channels, const int height,
                         const int width, const int ksize_h,
                         const int ksize_w, const int pad_h, const int pad_w,
                         const int stride_h, const int stride_w,
                         const int dilation_h, const int dilation_w,
                         const int parallel_imgs, Tensor data_col)
{
    int height_col =
        (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
    int width_col =
        (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
    int num_kernels = channels * height_col * width_col * parallel_imgs;

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        data_im.scalar_type(), "my_conv_im2col_gpu", [&]
        { my_conv_im2col_gpu_kernel<scalar_t><<<GET_BLOCKS(num_kernels),
                                                THREADS_PER_BLOCK, 0,
                                                at::cuda::getCurrentCUDAStream()>>>(
              num_kernels, data_im.data_ptr<scalar_t>(),
              height, width, ksize_h, ksize_w,
              pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
              parallel_imgs, channels,
              height_col, width_col, data_col.data_ptr<scalar_t>()); });

    AT_CUDA_CHECK(cudaGetLastError());
}

7.my_conv.py

import torch
from torch.autograd import Function
import torch.nn as nn
from torch import Tensor
from torch.nn.modules.utils import _pair
from torch.nn.parameter import Parameter

import my_ops


class MyConvF(Function):

    @staticmethod
    def forward(ctx,
                input: torch.Tensor,
                weight,
                bias,
                stride=1,
                padding=0,
                dilation=1,
                groups=1,
                im2col_step=32):
        if input is not None and input.dim() != 4:
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor \
                  instead.')
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.im2col_step = im2col_step

        weight = weight.type_as(input)
        ctx.save_for_backward(input, weight)

        output = input.new_empty(MyConvF._output_size(ctx, input, weight))

        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones

        cur_im2col_step = min(ctx.im2col_step, input.size(0))
        assert (input.size(0) % cur_im2col_step
                ) == 0, 'batch size must be divisible by im2col_step'

        if bias is None:
            bias = torch.zeros(weight.shape[0]).to(weight.device)

        my_ops.my_conv_forward(
            input,
            weight,
            bias,
            output,
            ctx.bufs_[0],
            kW=weight.size(3),
            kH=weight.size(2),
            dW=ctx.stride[1],
            dH=ctx.stride[0],
            padW=ctx.padding[1],
            padH=ctx.padding[0],
            dilationW=ctx.dilation[1],
            dilationH=ctx.dilation[0],
            group=ctx.groups,
            im2col_step=cur_im2col_step)
        return output

    @staticmethod
    def _output_size(ctx, input, weight):
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = ctx.padding[d]
            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = ctx.stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if not all(map(lambda s: s > 0, output_size)):
            raise ValueError(
                'convolution input is too small (output would be ' +
                'x'.join(map(str, output_size)) + ')')
        return output_size


my_conv = MyConvF.apply


class MyConv2d(nn.Module):

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups: int = 1,
                 bias: bool = True):
        super().__init__()
        kernel_size_ = _pair(kernel_size)
        stride_ = _pair(stride)
        padding_ = _pair(padding)
        dilation_ = _pair(dilation)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size_
        self.stride = stride_
        self.padding = padding_
        self.dilation = dilation_
        self.groups = groups
        self.weight = Parameter(
            torch.Tensor(out_channels, in_channels // groups, *kernel_size_))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)

        # Useless attributes
        self.transposed = None
        self.output_padding = None
        self.padding_mode = None

    def forward(self, input: Tensor) -> Tensor:
        return my_conv(input, self.weight, self.bias, self.stride,
                       self.padding, self.dilation, self.groups)

8.setup.py

from setuptools import setup
from torch.utils import cpp_extension
import os

src_root = './'
cpp_src = ['my_conv.cpp','my_conv_cuda.cu']

if __name__ == '__main__':
    include_dirs = ['./']
    cpp_path = [os.path.join(src_root, src) for src in cpp_src]

    setup(
        name='panoflow',
        ext_modules=[
            cpp_extension.CUDAExtension(
                'my_ops', cpp_path, include_dirs=include_dirs)
        ],
        cmdclass={'build_ext': cpp_extension.BuildExtension})

9.test.py

import torch
import torch.nn as nn
from my_conv import MyConv2d

inc = 3
outc = 4
img_shaspe = (50, 50)

device_name = 'cuda:0'
# device_name = 'cpu'
open_bias = True


def test_one():
    ts = torch.ones([1, 1, 3, 3]).to(device_name)
    layer = nn.Conv2d(1, 1, 3, 1, 1, bias=open_bias).to(device_name)
    gt = layer(ts)
    my_layer = MyConv2d(1, 1, 3, 1, 1).to(device_name)
    my_layer.load_state_dict(layer.state_dict(), strict=False)
    res = my_layer(ts)
    res = res.to('cpu')
    gt = gt.to('cpu')
    assert torch.allclose(res, gt, 1e-3, 1e-5)


def test_two():
    ts = torch.rand([1, inc, *img_shaspe]).to(device_name)
    layer = nn.Conv2d(inc, outc, 3, 1, 1, bias=open_bias).to(device_name)
    gt = layer(ts)
    my_layer = MyConv2d(inc, outc, 3, 1, 1).to(device_name)
    my_layer.load_state_dict(layer.state_dict(), strict=False)
    res = my_layer(ts)
    res = res.to('cpu')
    gt = gt.to('cpu')
    assert torch.allclose(res, gt, 1e-3, 1e-5)


if __name__ == '__main__':
    test_one()
    test_two()