GPU-Puzzles is a great project for learning CUDA programming. Through a series of small problems it teaches the concepts of GPU programming and CUDA's parallel execution model: how kernels are written and launched, how to create shared memory, and how to implement things like convolution and matrix multiplication.
https://github.com/srush/GPU-Puzzles
This article continues my previous post, GPU Puzzles讲解(一) (https://blog.csdn.net/lijj0304/article/details/142737859), and takes a deeper look at several of the harder puzzles, explaining how the solutions are implemented and why they are efficient.
Puzzle 11 - 1D Convolution
Implement a kernel that computes a 1D convolution between a and b and stores it in out. You need to handle the general case. You only need 2 global reads and 1 global write per thread.
This puzzle teaches 1D convolution while limiting the number of global memory accesses. The natural first idea is to stage both the input vector and the convolution kernel in shared memory. Since shared memory is normally sized to the number of threads per block, and each output also needs a few elements past the block's own slice, we additionally have to store a few boundary (halo) elements. To stage the kernel and the halo while staying within 2 global reads per thread, the solution uses a trick: the threads whose indices are not needed for storing the kernel are reused to load the extra boundary elements (a CPU sketch of this index mapping follows the kernel code below).
def conv_spec(a, b):
    out = np.zeros(*a.shape)
    len = b.shape[0]
    for i in range(a.shape[0]):
        out[i] = sum([a[i + j] * b[j] for j in range(len) if i + j < a.shape[0]])
    return out
MAX_CONV = 4
TPB = 8
TPB_MAX_CONV = TPB + MAX_CONV
def conv_test(cuda):
    def call(out, a, b, a_size, b_size) -> None:
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x
        # FILL ME IN (roughly 17 lines)
        shared_a = cuda.shared.array(TPB_MAX_CONV, numba.float32)
        shared_b = cuda.shared.array(MAX_CONV, numba.float32)
        if i < a_size:
            shared_a[local_i] = a[i]
        if local_i < b_size:
            # The first b_size threads stage the convolution kernel.
            shared_b[local_i] = b[local_i]
        else:
            # The remaining threads stage the halo elements past this block.
            local_j = local_i - b_size
            if i - b_size + TPB < a_size and local_j < b_size:
                shared_a[local_j + TPB] = a[i - b_size + TPB]
        cuda.syncthreads()
        total = 0.0
        for j in range(b_size):
            if i + j < a_size:
                total += shared_a[local_i + j] * shared_b[j]
        if i < a_size:
            out[i] = total
    return call
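To make the index trick concrete, here is a minimal CPU-side sketch (my own illustration, not part of the puzzle) of what one block loads into shared memory, assuming TPB = 8, a kernel of length 4, and a block starting at element 0. Threads 0-3 stage b, and threads 4-7 reuse their indices to fetch the halo elements a[8..11]:

import numpy as np

# Emulate the shared-memory loads of one block on the CPU (illustration only).
TPB, b_size, a_size, block_start = 8, 4, 15, 0
a = np.arange(a_size, dtype=np.float32)
b = np.arange(b_size, dtype=np.float32)
shared_a = np.zeros(TPB + b_size, dtype=np.float32)  # TPB_MAX_CONV slots
shared_b = np.zeros(b_size, dtype=np.float32)

for local_i in range(TPB):                 # one loop iteration per thread
    i = block_start + local_i
    if i < a_size:
        shared_a[local_i] = a[i]           # every thread's first global read
    if local_i < b_size:
        shared_b[local_i] = b[local_i]     # threads 0..3 stage the kernel
    else:
        local_j = local_i - b_size         # threads 4..7 stage the halo
        if i - b_size + TPB < a_size and local_j < b_size:
            shared_a[local_j + TPB] = a[i - b_size + TPB]

print(shared_a)   # a[0..7] followed by the halo values a[8..11]
print(shared_b)   # the convolution kernel b[0..3]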
# Test 1
SIZE = 6
CONV = 3
out = np.zeros(SIZE)
a = np.arange(SIZE)
b = np.arange(CONV)
problem = CudaProblem(
    "1D Conv (Simple)",
    conv_test,
    [a, b],
    out,
    [SIZE, CONV],
    Coord(1, 1),
    Coord(TPB, 1),
    spec=conv_spec,
)
problem.show()
Visualization:
# 1D Conv (Simple)
Score (Max Per Thread):
| Global Reads | Global Writes | Shared Reads | Shared Writes |
| 2 | 1 | 6 | 2 |
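As a quick sanity check (my own verification using the conv_spec defined above, not part of the puzzle's output), the Test 1 inputs a = [0..5] and b = [0, 1, 2] should produce:

print(conv_spec(np.arange(6), np.arange(3)))
# -> [ 5.  8. 11. 14.  5.  0.], e.g. out[0] = 0*0 + 1*1 + 2*2 = 5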
1D convolution Test 2: the vector length exceeds TPB, so two blocks are used.
out = np.zeros(15)
a = np.arange(15)
b = np.arange(4)
problem = CudaProblem(
    "1D Conv (Full)",
    conv_test,
    [a, b],
    out,
    [15, 4],
    Coord(2, 1),
    Coord(TPB, 1),
    spec=conv_spec,
)
problem.show()
Visualization:
# 1D Conv (Full)
Score (Max Per Thread):
| Global Reads | Global Writes | Shared Reads | Shared Writes |
| 2 | 1 | 8 | 2 |
Puzzle 12 - Prefix Sum
Implement a kernel that computes a sum over a and stores it in out. If the size of a is greater than the block size, only store the sum of each block.
This puzzle uses shared memory to compute the sum, reducing the number of global memory accesses. Despite the puzzle's name, the kernel computes a per-block sum via a pairwise (tree) reduction rather than a running prefix sum: elements are merged two at a time until the final result lands in the first slot of the shared array. The input vector is split into TPB-length segments, and each entry of out holds the sum of one segment (a CPU trace of the reduction rounds follows the kernel code below).
TPB = 8
def sum_spec(a):
    out = np.zeros((a.shape[0] + TPB - 1) // TPB)
    for j, i in enumerate(range(0, a.shape[-1], TPB)):
        out[j] = a[i : i + TPB].sum()
    return out

def sum_test(cuda):
    def call(out, a, size: int) -> None:
        cache = cuda.shared.array(TPB, numba.float32)
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x
        # FILL ME IN (roughly 12 lines)
        if i < size:
            cache[local_i] = a[i]
        else:
            cache[local_i] = 0  # pad the tail of the last block with zeros
        cuda.syncthreads()
        p = 2
        while p <= TPB:
            # Threads whose index is a multiple of p merge in their neighbour p//2 away.
            if local_i % p == 0:
                cache[local_i] += cache[local_i + p // 2]
            cuda.syncthreads()
            p *= 2
        if local_i == 0:
            out[cuda.blockIdx.x] = cache[local_i]  # the block sum ends up in cache[0]
    return call
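The pairwise merging is easier to see on the CPU. Below is a minimal sketch (my own illustration; the threads are emulated sequentially, which is safe here because each round only writes slots that no other thread reads) of the same loop on the Test 1 input, showing how the partial sums collapse into cache[0] over log2(TPB) = 3 rounds:

import numpy as np

cache = np.arange(8, dtype=np.float32)   # what one block loads from a
p = 2
while p <= 8:                            # TPB = 8
    for local_i in range(8):             # emulate the block's threads
        if local_i % p == 0:
            cache[local_i] += cache[local_i + p // 2]
    print(p, cache)
    p *= 2
# p = 2: [ 1,  1,  5,  3,  9,  5, 13,  7]
# p = 4: [ 6,  1,  5,  3, 22,  5, 13,  7]
# p = 8: [28,  1,  5,  3, 22,  5, 13,  7]  -> cache[0] == 28 == sum(range(8))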
# Test 1
SIZE = 8
out = np.zeros(1)
inp = np.arange(SIZE)
problem = CudaProblem(
    "Sum (Simple)",
    sum_test,
    [inp],
    out,
    [SIZE],
    Coord(1, 1),
    Coord(TPB, 1),
    spec=sum_spec,
)
problem.show()
Visualization:
# Sum (Simple)
Score (Max Per Thread):
| Global Reads | Global Writes | Shared Reads | Shared Writes |
| 1 | 1 | 7 | 4 |
Prefix Sum Test 2: the input spans two blocks.
SIZE = 15
out = np.zeros(2)
inp = np.arange(SIZE)
problem = CudaProblem(
    "Sum (Full)",
    sum_test,
    [inp],
    out,
    [SIZE],
    Coord(2, 1),
    Coord(TPB, 1),
    spec=sum_spec,
)
problem.show()
Visualization:
# Sum (Full)
Score (Max Per Thread):
| Global Reads | Global Writes | Shared Reads | Shared Writes |
| 1 | 1 | 7 | 4 |
Puzzle 13 - Axis Sum
Implement a kernel that computes a sum over each column of a and stores it in out.
This is really just the previous reduction again, with a batch index added: blockIdx.y selects which row of a the block reduces, each batch computes its own sum, and out has one entry per batch. Note that the kernel accumulates with out[batch, 0] += ..., which also reads out and is presumably why the score below reports two global reads per thread.
TPB = 8
def sum_spec(a):
    out = np.zeros((a.shape[0], (a.shape[1] + TPB - 1) // TPB))
    for j, i in enumerate(range(0, a.shape[-1], TPB)):
        out[..., j] = a[..., i : i + TPB].sum(-1)
    return out

def axis_sum_test(cuda):
    def call(out, a, size: int) -> None:
        cache = cuda.shared.array(TPB, numba.float32)
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x
        batch = cuda.blockIdx.y
        # FILL ME IN (roughly 12 lines)
        if i < size:
            cache[local_i] = a[batch, i]
        else:
            cache[local_i] = 0
        cuda.syncthreads()
        p = 2
        while p <= TPB:
            # Same pairwise reduction as in Puzzle 12.
            if local_i % p == 0:
                cache[local_i] += cache[local_i + p // 2]
            cuda.syncthreads()
            p *= 2
        if local_i == 0:
            out[batch, 0] += cache[local_i]  # accumulate this block's sum into its batch row
    return call
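For the test below, inp is np.arange(24) reshaped to (4, 6), so each batch (row) should reduce to its row sum. A quick NumPy check of the expected values (my own verification, not part of the puzzle):

import numpy as np

inp = np.arange(4 * 6).reshape((4, 6))
print(inp.sum(axis=-1))   # [ 15  51  87 123] -- one sum per batch row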
BATCH = 4
SIZE = 6
out = np.zeros((BATCH, 1))
inp = np.arange(BATCH * SIZE).reshape((BATCH, SIZE))
problem = CudaProblem(
    "Axis Sum",
    axis_sum_test,
    [inp],
    out,
    [SIZE],
    Coord(1, BATCH),
    Coord(TPB, 1),
    spec=sum_spec,
)
problem.show()
Visualization:
# Axis Sum
Score (Max Per Thread):
| Global Reads | Global Writes | Shared Reads | Shared Writes |
| 2 | 1 | 7 | 4 |
Puzzle 14 - Matrix Multiply!
Implement a kernel that multiplies square matrices a and b and stores the result in out.
This puzzle teaches matrix multiplication. The first instinct is to use a 2D grid of threads and have each thread loop over a row of a and a column of b, accumulating the products. The catch is that the thread grid is smaller than the actual matrices, and we also want to limit the number of global reads. The fix is 2D shared memory: starting from the block's row and column, walk along the shared dimension in (TPB, TPB) tiles, loading one tile of a (moving horizontally along its row) and one tile of b (moving vertically along its column) at a time, multiplying them, and accumulating the partial products. The visualization for Test 2 shows this idea very clearly (a CPU sketch of the tiling follows the kernel code below).
def matmul_spec(a, b):
    return a @ b

TPB = 3
def mm_oneblock_test(cuda):
    def call(out, a, b, size: int) -> None:
        a_shared = cuda.shared.array((TPB, TPB), numba.float32)
        b_shared = cuda.shared.array((TPB, TPB), numba.float32)
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
        local_i = cuda.threadIdx.x
        local_j = cuda.threadIdx.y
        # FILL ME IN (roughly 14 lines)
        total = 0.0
        for k in range(0, size, TPB):
            # Stage the next (TPB, TPB) tiles of a and b in shared memory.
            if local_j + k < size and i < size:
                a_shared[local_i, local_j] = a[i, local_j + k]
            if local_i + k < size and j < size:
                b_shared[local_i, local_j] = b[local_i + k, j]
            cuda.syncthreads()
            for local_k in range(min(TPB, size - k)):
                total += a_shared[local_i, local_k] * b_shared[local_k, local_j]
        if i < size and j < size:
            out[i, j] = total
    return call
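The tiling can also be sketched with plain NumPy on the CPU (my own illustration; tiled_matmul is a hypothetical helper, not part of the puzzle): walk along the shared dimension in TPB-wide chunks, multiply the staged tiles, and accumulate. Each GPU block does the same thing, but only for its own (TPB, TPB) patch of out.

import numpy as np

def tiled_matmul(a, b, tpb=3):
    # Accumulate partial products tile by tile along the shared dimension,
    # mirroring the kernel's outer loop over k.
    size = a.shape[0]
    out = np.zeros((size, size))
    for k in range(0, size, tpb):
        a_tile = a[:, k:k + tpb]      # what a_shared would hold
        b_tile = b[k:k + tpb, :]      # what b_shared would hold
        out += a_tile @ b_tile        # the inner local_k loop
    return out

a = np.arange(64).reshape(8, 8)
b = np.arange(64).reshape(8, 8).T
assert np.allclose(tiled_matmul(a, b), a @ b)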
# Test 1
SIZE = 2
out = np.zeros((SIZE, SIZE))
inp1 = np.arange(SIZE * SIZE).reshape((SIZE, SIZE))
inp2 = np.arange(SIZE * SIZE).reshape((SIZE, SIZE)).T
problem = CudaProblem(
    "Matmul (Simple)",
    mm_oneblock_test,
    [inp1, inp2],
    out,
    [SIZE],
    Coord(1, 1),
    Coord(TPB, TPB),
    spec=matmul_spec,
)
problem.show(sparse=True)
Visualization:
# Matmul (Simple)
Score (Max Per Thread):
| Global Reads | Global Writes | Shared Reads | Shared Writes |
| 2 | 1 | 4 | 2 |
Test 2: matrix multiplication where the matrices are larger than a single thread block.
SIZE = 8
out = np.zeros((SIZE, SIZE))
inp1 = np.arange(SIZE * SIZE).reshape((SIZE, SIZE))
inp2 = np.arange(SIZE * SIZE).reshape((SIZE, SIZE)).T
problem = CudaProblem(
    "Matmul (Full)",
    mm_oneblock_test,
    [inp1, inp2],
    out,
    [SIZE],
    Coord(3, 3),
    Coord(TPB, TPB),
    spec=matmul_spec,
)
problem.show(sparse=True)
Visualization:
# Matmul (Full)
Score (Max Per Thread):
| Global Reads | Global Writes | Shared Reads | Shared Writes |
| 6 | 1 | 16 | 6 |