【CUDA】矩阵乘法 matMatMul

矩阵乘法 matMatMul

矩阵乘法是基本线性代数子程序（BLAS）的重要组成部分，而且线性代数中许多其他操作以此为基础。

图1是两个矩阵的乘法。

基础方法，正方形tile和长方形tile

基础方法

执行矩阵乘法的基础方法是使用单个线程执行输出矩阵的单个元素乘法。这意味着所需的线程数等于输出矩阵中的元素数。线程排列在二维网格中，每个线程分配一个唯一的索引。线程的索引用于访问输入矩阵的相应行和列。执行行和列的乘法，结果存储在输出矩阵的相应元素中。

这种方法的主要缺点是多个线程加载相同的行和列来计算它们的输出。图2是一个示例。

图2

正如上图所示，为了计算P0,0和P0,1，两个线程都需要加载整个M0。对于P0,0和P1,0也是如此。两个线程都需要加载整个N1列。这意味着线程将多次访问相同的内存位置。

正方形tile

为了避免这个问题，我们可以使用tile。tile是将输入矩阵的一个小部分加载到共享内存中。线程将把tile加载到共享内存中，然后执行乘法运算。图3描述了这种技术。

图3

kernel将计算分为几个阶段。每个阶段，线程将将M和N矩阵中的tile加载到共享内存中。然后，线程将执行tile的乘法，并将结果累积到输出矩阵的相应元素中。

通过这种技术，我们可以看到全局内存访问减少了TILE_WIDTH（图示）倍。

长方形tile

为了进一步减少全局内存访问，我们可以使用图4所示的矩形块。

图4

通过这种技术，我们增加了每个线程的工作量，以进一步减少全局内存访问的次数。kernel再次将计算分成多个阶段。在每个阶段中，线程将从中加载一个tile M 和两个tile N 到共享内存中。然后线程将对这些tile进行乘法运算，并将结果累加到输出矩阵的相应元素中。

Code

host代码使用随机值初始化输入矩阵，并调用kernel执行乘法运算。输入矩阵以线性格式存储。


#include <iostream>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>


#include "mat_mat_mul_kernels.cuh"

int main(int argc, char *argv[])
{

    int rows1, cols1rows2, cols2, t_x;

    if(argc != 5){
        std::cout << "Usage: " << argv[0] << " <rows1> <cols1rows2> <cols2> <t_x>" << std::endl;
        return 1;
    }

    rows1 = atoi(argv[1]);
    cols1rows2 = atoi(argv[2]);
    cols2 = atoi(argv[3]);
    t_x = atoi(argv[4]);

    thrust::host_vector<float> h_in_mat1(rows1 * cols1rows2);
    thrust::host_vector<float> h_in_mat2(cols1rows2 * cols2);
    thrust::host_vector<float> h_out_host(rows1 * cols2);

    srand(time(NULL));
    for(int i = 0; i < rows1 * cols1rows2; i++)
        h_in_mat1[i] = rand() / (float)RAND_MAX;

    for(int i = 0; i < cols1rows2 * cols2; i++)
        h_in_mat2[i] = rand() / (float)RAND_MAX;

    for(int i = 0; i < rows1; ++i)
        for(int j = 0; j < cols2; ++j)
            for(int k = 0; k < cols1rows2; ++k)
                h_out_host[i * cols2 + j] += h_in_mat1[i * cols1rows2 + k] * h_in_mat2[k * cols2 + j];

    thrust::device_vector<float> d_in_mat1 = h_in_mat1;
    thrust::device_vector<float> d_in_mat2 = h_in_mat2;

    thrust::device_vector<float> d_out(rows1 * cols2);

    dim3 blockDim(t_x, t_x);
    dim3 gridDim((cols2 + blockDim.x - 1) / blockDim.x, (rows1 + blockDim.y - 1) / blockDim.y);
    mat_mat_mul<float><<<gridDim, blockDim>>>(
        thrust::raw_pointer_cast(d_in_mat1.data()),
        thrust::raw_pointer_cast(d_in_mat2.data()),
        thrust::raw_pointer_cast(d_out.data()),
        rows1, cols1rows2, cols1rows2, cols2);

    bool success = true;
    for(int i = 0; i < rows1 * cols2; ++i)
        if(abs(h_out_host[i] - d_out[i]) >= 0.001){
            std::cout << "Error at " << i << ": " << h_out_host[i] << " != " << d_out[i] << " (Mat Mat Mul)" << std::endl;
            success = false;
            break;
        }
    if(success)
        std::cout << "Success (Mat Mat Mul)" << std::endl;


    blockDim = dim3(t_x, t_x);
    gridDim = dim3((cols2 + blockDim.x - 1) / blockDim.x, (rows1 + blockDim.y - 1) / blockDim.y);
    mat_mat_mul_tiles<float><<<gridDim, blockDim, 2 * blockDim.x * blockDim.x * sizeof(float)>>>(
        thrust::raw_pointer_cast(d_in_mat1.data()),
        thrust::raw_pointer_cast(d_in_mat2.data()),
        thrust::raw_pointer_cast(d_out.data()),
        rows1, cols1rows2, cols1rows2, cols2);

    success = true;
    for(int i = 0; i < rows1 * cols2; ++i)
        if(abs(h_out_host[i] - d_out[i]) >= 0.001){
            std::cout << "Error at " << i << ": " << h_out_host[i] << " != " << d_out[i] << " (Mat Mat Mul Tiles)" << std::endl;
            success = false;
            break;
        }
    if(success)
        std::cout << "Success (Mat Mat Mul Tiles)" << std::endl;

    
    blockDim = dim3(t_x, t_x);
    gridDim = dim3((cols2 + blockDim.x - 1) / blockDim.x / 2, (rows1 + blockDim.y - 1) / blockDim.y);
    mat_mat_mul_rec_tiles<float><<<gridDim, blockDim, 3 * blockDim.x * blockDim.x * sizeof(float)>>>(
        thrust::raw_pointer_cast(d_in_mat1.data()),
        thrust::raw_pointer_cast(d_in_mat2.data()),
        thrust::raw_pointer_cast(d_out.data()),
        rows1, cols1rows2, cols1rows2, cols2);

    success = true;
    for(int i = 0; i < rows1 * cols2; ++i)
        if(abs(h_out_host[i] - d_out[i]) >= 0.001){
            std::cout << "Error at " << i << ": " << h_out_host[i] << " != " << d_out[i] << " (Mat Mat Mul Rec Tiles)" << std::endl;
            success = false;
            break;
        }
    if(success)
        std::cout << "Success (Mat Mat Mul Rec Tiles)" << std::endl;

    return 0;

}

以下是kernel的展示。

基础方法

template<typename T> __global__
void mat_mat_mul(T *in_mat1, T *in_mat2, T *out_mat, 
                 int mat1_rows, int mat1_cols, 
                 int mat2_rows, int mat2_cols) {

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= mat1_rows || col >= mat2_cols) return;

    T sum = 0;
    for (int k = 0; k < mat1_cols; ++k)
        sum += in_mat1[row * mat1_cols + k] * in_mat2[k * mat2_cols + col];

    out_mat[row * mat2_cols + col] = sum;
}

在这种基本方法中，kernel非常简单。

线程首先计算它们的行和列索引。如果行或列索引大于输入矩阵的行数或列数，则线程返回。

int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;

if (row >= mat1_rows || col >= mat2_cols) return;

然后，这些线程会对行和列进行相乘，并将结果存储在输出矩阵的相应元素中。

T sum = 0;
for (int k = 0; k < mat1_cols; ++k)
    sum += in_mat1[row * mat1_cols + k] * in_mat2[k * mat2_cols + col];

out_mat[row * mat2_cols + col] = sum;

我们可以观察到，线程不仅会多次加载相同的行和列，而且它们还会以非合并的方式加载M的元素。这意味着线程将无法充分利用内存带宽。

正方形tile

template<typename T> __global__
void mat_mat_mul_tiles(T *in_mat1, T *in_mat2, T *out_mat,
                       int mat1_rows, int mat1_cols,
                       int mat2_rows, int mat2_cols) {
    
    // Initialize shared memory
    int TILE_WIDTH = blockDim.x;
    extern __shared__ uint8_t shared_mem[];
    T *ds_mat1 = reinterpret_cast<T*>(shared_mem);
    T *ds_mat2 = reinterpret_cast<T*>(shared_mem + TILE_WIDTH * TILE_WIDTH * sizeof(T));
    
    int bx = blockIdx.x,  by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    int row = by * TILE_WIDTH + ty;
    int col = bx * TILE_WIDTH + tx;

    T out_value = 0;
    // Loop over the in_mat1 and in_mat2 tiles required to compute the out_mat element
    for (int ph = 0; ph < (mat1_cols + TILE_WIDTH - 1) / TILE_WIDTH; ++ph) {

        // Collaborative loading of in_mat1 and in_mat2 tiles into shared memory
        ds_mat1[ty * TILE_WIDTH + tx] = row < mat1_rows && ph * TILE_WIDTH + tx < mat1_cols ?
                                        in_mat1[row * mat1_cols + ph * TILE_WIDTH + tx] : 0;
                                        
        ds_mat2[ty * TILE_WIDTH + tx] = ph * TILE_WIDTH + ty < mat2_rows && col < mat2_cols ?
                                        in_mat2[(ph * TILE_WIDTH + ty) * mat2_cols + col] : 0;

        // Synchronize to make sure the tiles are loaded
        __syncthreads();

        // Compute the out_mat element
        for (int k = 0; k < TILE_WIDTH; ++k)
            out_value += ds_mat1[ty * TILE_WIDTH + k] * ds_mat2[k * TILE_WIDTH + tx];
        
        // Synchronize to make sure the out_mat element is computed
        // before other threads load new tiles
        __syncthreads();
    }

    // Store the out_mat element in out_mat
    if (row < mat1_rows && col < mat2_cols)
        out_mat[row * mat2_cols + col] = out_value;
}

在这种方法中，我们使用共享内存来存储输入矩阵的块。线程首先计算它们的行和列索引。如果行或列索引大于输入矩阵的行数或列数，则线程返回。

int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;

if (row >= mat1_rows || col >= mat2_cols) return;

在每个阶段：

线程将tiles加载到共享内存中。这些tiles以合并访存的方式加载。

// Collaborative loading of in_mat1 and in_mat2 tiles into shared memory
ds_mat1[ty * TILE_WIDTH + tx] = row < mat1_rows && ph * TILE_WIDTH + tx < mat1_cols ?
                                in_mat1[row * mat1_cols + ph * TILE_WIDTH + tx] : 0;

ds_mat2[ty * TILE_WIDTH + tx] = ph * TILE_WIDTH + ty < mat2_rows && col < mat2_cols ?
                                in_mat2[(ph * TILE_WIDTH + ty) * mat2_cols + col] : 0;

// Synchronize to make sure the tiles are loaded
__syncthreads();

当线程加载完tile后，它们会计算输出矩阵元素。线程通过将tiles的相应行和列相乘，并将结果相加来计算输出矩阵元素。

// Compute the out_mat element
for (int k = 0; k < TILE_WIDTH; ++k)
    out_value += ds_mat1[ty * TILE_WIDTH + k] * ds_mat2[k * TILE_WIDTH + tx];

// Synchronize to make sure the out_mat element is computed
// before other threads load new tiles
__syncthreads();

最后，线程将输出矩阵元素存储在输出矩阵中。

// Store the out_mat element in out_mat
out_mat[row * mat2_cols + col] = out_value;

长方形tiles

template<typename T> __global__
void mat_mat_mul_rec_tiles(T* in_mat1, T* in_mat2, T* out_mat,
    int mat1_rows, int mat1_cols,
    int mat2_rows, int mat2_cols) {

    // Initialize shared memory
    int TILE_WIDTH = blockDim.x;
    extern __shared__ uint8_t shared_mem[];
    T* ds_mat1 = reinterpret_cast<T*>(shared_mem);
    T* ds_mat2 = reinterpret_cast<T*>(shared_mem + TILE_WIDTH * TILE_WIDTH * sizeof(T));
    T* ds_mat3 = reinterpret_cast<T*>(shared_mem + 2 * TILE_WIDTH * TILE_WIDTH * sizeof(T));

    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    int row = by * TILE_WIDTH + ty;
    int col = bx * TILE_WIDTH * 2 + tx;

    T out_value1 = 0;
    T out_value2 = 0;
    // Loop over the in_mat1 and in_mat2 tiles required to compute the out_mat element
    for (int ph = 0; ph < (mat1_cols + TILE_WIDTH - 1) / TILE_WIDTH; ++ph) {

        // Collaborative loading of in_mat1 and in_mat2 tiles into shared memory
        ds_mat1[ty * TILE_WIDTH + tx] = row < mat1_rows&& ph* TILE_WIDTH + tx < mat1_cols ?
            in_mat1[row * mat1_cols + ph * TILE_WIDTH + tx] : 0;

        ds_mat2[ty * TILE_WIDTH + tx] = ph * TILE_WIDTH + ty < mat2_rows&& col < mat2_cols ?
            in_mat2[(ph * TILE_WIDTH + ty) * mat2_cols + col] : 0;

        ds_mat3[ty * TILE_WIDTH + TILE_WIDTH + tx] = ph * TILE_WIDTH + ty < mat2_rows&& TILE_WIDTH + col < mat2_cols ?
            in_mat2[(ph * TILE_WIDTH + ty) * mat2_cols + TILE_WIDTH + col] : 0;

        // Synchronize to make sure the tiles are loaded
        __syncthreads();

        // Compute the out_mat element
        for (int k = 0; k < TILE_WIDTH; k++) {
            out_value1 += ds_mat1[ty * TILE_WIDTH + k] * ds_mat2[k * TILE_WIDTH + tx];
            out_value2 += ds_mat1[ty * TILE_WIDTH + k] * ds_mat3[k * TILE_WIDTH + TILE_WIDTH + tx];
        }

        // Synchronize to make sure the Pvalues are computed
        // before other threads load new tiles
        __syncthreads();
    }

    // Store the Pvalues in out_mat
    if (row < mat1_rows && col < mat2_cols)
        //out_mat[row][col];
        out_mat[row * mat2_cols + col] = out_value1;

    if (row < mat1_rows && TILE_WIDTH + col < mat2_cols)
        //out_mat[row][TILE_WIDTH + col];
        out_mat[row * mat2_cols + TILE_WIDTH + col] = out_value2;
}

这个kernel函数几乎与正方形tile函数相同。

首先，线程计算出他们将计算的输出矩阵元素的行和列。

int row = by * TILE_WIDTH     + ty;
int col = bx * TILE_WIDTH * 2 + tx;

然后线程将tile加载到共享内存中。唯一的区别是N加载2个tile。

// Collaborative loading of in_mat1 and in_mat2 tiles into shared memory
        ds_mat1[ty * TILE_WIDTH + tx] = row < mat1_rows&& ph* TILE_WIDTH + tx < mat1_cols ?
            in_mat1[row * mat1_cols + ph * TILE_WIDTH + tx] : 0;

        ds_mat2[ty * TILE_WIDTH + tx] = ph * TILE_WIDTH + ty < mat2_rows&& col < mat2_cols ?
            in_mat2[(ph * TILE_WIDTH + ty) * mat2_cols + col] : 0;

        ds_mat3[ty * TILE_WIDTH + TILE_WIDTH + tx] = ph * TILE_WIDTH + ty < mat2_rows&& TILE_WIDTH + col < mat2_cols ?
            in_mat2[(ph * TILE_WIDTH + ty) * mat2_cols + TILE_WIDTH + col] : 0;

然后线程计算两个输出矩阵元素。

// Compute the out_mat element
        for (int k = 0; k < TILE_WIDTH; k++) {
            out_value1 += ds_mat1[ty * TILE_WIDTH + k] * ds_mat2[k * TILE_WIDTH + tx];
            out_value2 += ds_mat1[ty * TILE_WIDTH + k] * ds_mat3[k * TILE_WIDTH + TILE_WIDTH + tx];
        }

最后，存储两个输出矩阵元素。

// Store the Pvalues in out_mat
if (row < mat1_rows && col < mat2_cols)
  //out_mat[row][col];
    out_mat[row * mat2_cols + col] = out_value1;

if (row < mat1_rows && TILE_WIDTH + col < mat2_cols)
  //out_mat[row][TILE_WIDTH + col];
    out_mat[row * mat2_cols + TILE_WIDTH + col] = out_value2;

性能分析

运行时间：

矩阵维度：1024 × 1024

线程块维度：32 × 32

可见使用共享内存可以有效降低运行速度。但长方形tile反而耗时更久。因为L1缓存与共享内存公用硬件空间。可能共享内存占据大部分空间，而L1缓存所剩无几，从而导致长方形tile耗时更久。具体情况需要做性能分析，之后再补充。也许是长方形tile的复杂性增加。kernel引入了许多分支和同步点。这可能导致耗时更久。