Table of Contents
- 1. Understanding CUDA grids and blocks
- 1) The first CUDA project
- 2. Understanding mutual references between .cu and .cpp files, and the Makefile
- 3. Matrix multiplication (matmul) with CUDA, error handling, and querying hardware info
- 1) Matrix multiplication
- 2) Error handling
- 3) Querying hardware info
- 4. Installing Nsight Systems and Nsight Compute
- 5. Shared memory, causes of and fixes for bank conflicts, CUDA pre/post-processing to speed up TRT, streams and events (improving concurrency with CUDA streams)
- 6. Bilinear interpolation and affine transforms
1. Understanding CUDA grids and blocks
- Goal
Understand how to write 1D, 2D, and 3D grids and blocks in CUDA, and how to iterate over threads.
1) The first CUDA project
- Modify the project's Makefile.config
- Overall file layout
- Code
#include <cuda_runtime.h>
#include <stdio.h>

// Print each thread's block index and thread index
__global__ void print_idx_kernel(){
    printf("block idx: (%3d, %3d, %3d), thread idx: (%3d, %3d, %3d)\n",
         blockIdx.z, blockIdx.y, blockIdx.x,
         threadIdx.z, threadIdx.y, threadIdx.x);
}

// Print the grid and block dimensions
__global__ void print_dim_kernel(){
    printf("grid dimension: (%3d, %3d, %3d), block dimension: (%3d, %3d, %3d)\n",
         gridDim.z, gridDim.y, gridDim.x,
         blockDim.z, blockDim.y, blockDim.x);
}

// Linear thread index within one block (x varies fastest, then y, then z)
__global__ void print_thread_idx_per_block_kernel(){
    int index = threadIdx.z * blockDim.x * blockDim.y +
                threadIdx.y * blockDim.x +
                threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d\n",
         blockIdx.z, blockIdx.y, blockIdx.x,
         index);
}

// Global linear thread index:
// (linear block index) * (threads per block) + (linear thread index within the block)
__global__ void print_thread_idx_per_grid_kernel(){
    int bSize  = blockDim.z * blockDim.y * blockDim.x;

    int bIndex = blockIdx.z * gridDim.x * gridDim.y +
                 blockIdx.y * gridDim.x +
                 blockIdx.x;

    int tIndex = threadIdx.z * blockDim.x * blockDim.y +
                 threadIdx.y * blockDim.x +
                 threadIdx.x;

    int index  = bIndex * bSize + tIndex;

    printf("block idx: %3d, thread idx in block: %3d, thread idx: %3d\n",
         bIndex, tIndex, index);
}

// Map each thread to a 2D coordinate (x, y) over the input
__global__ void print_cord_kernel(){
    int index = threadIdx.z * blockDim.x * blockDim.y +
                threadIdx.y * blockDim.x +
                threadIdx.x;

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d, cord: (%3d, %3d)\n",
         blockIdx.z, blockIdx.y, blockIdx.x,
         index, x, y);
}
// Launch with a 1D grid of 1D blocks
void print_one_dim(){
    int inputSize = 8;
    int blockDim = 4;
    int gridDim = inputSize / blockDim;

    dim3 block(blockDim);
    dim3 grid(gridDim);

    /* Try each of these kernels one by one */
    // print_idx_kernel<<<grid, block>>>();
    // print_dim_kernel<<<grid, block>>>();
    // print_thread_idx_per_block_kernel<<<grid, block>>>();
    print_thread_idx_per_grid_kernel<<<grid, block>>>();

    cudaDeviceSynchronize();
}

// Launch with a 2D grid of 2D blocks
void print_two_dim(){
    int inputWidth = 4;
    int blockDim = 2;
    int gridDim = inputWidth / blockDim;

    dim3 block(blockDim, blockDim);
    dim3 grid(gridDim, gridDim);

    /* Try each of these kernels one by one */
    // print_idx_kernel<<<grid, block>>>();
    // print_dim_kernel<<<grid, block>>>();
    // print_thread_idx_per_block_kernel<<<grid, block>>>();
    print_thread_idx_per_grid_kernel<<<grid, block>>>();

    cudaDeviceSynchronize();
}

// Print the 2D coordinate each thread maps to
void print_cord(){
    int inputWidth = 4;
    int blockDim = 2;
    int gridDim = inputWidth / blockDim;

    dim3 block(blockDim, blockDim);
    dim3 grid(gridDim, gridDim);

    print_cord_kernel<<<grid, block>>>();

    cudaDeviceSynchronize();
}
int main() {
    /*
    There are several kinds of synchronization:
    cudaDeviceSynchronize:  synchronizes the CPU with the GPU; the CPU does not execute
                            subsequent statements until all preceding CUDA work has finished
    cudaStreamSynchronize:  like cudaDeviceSynchronize, but scoped to a single stream; it only
                            waits for the CPU/GPU work in that stream and ignores the others
    cudaThreadSynchronize:  deprecated, no longer recommended
    __syncthreads:          synchronization within a thread block
    */
    // print_one_dim();
    // print_two_dim();
    print_cord();
    return 0;
}
- Note
__global__ marks a function as a kernel.
- Requirement: find a given thread within a block
The kernel code above linearizes the thread index by walking x first, then y, and finally z; a worked example follows below.
- Typical optimizations
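As a quick check of the per-block formula (a worked example, not from the course code): with blockDim = (4, 2, 2), the thread at threadIdx = (1, 1, 1) gets

index = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x = 1*8 + 1*4 + 1 = 13

so threads that differ only in threadIdx.x land on consecutive indices.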
2. Understanding mutual references between .cu and .cpp files, and the Makefile
- Compiler
Use nvcc rather than gcc or g++; otherwise .cu files will fail to compile.
- One-line command to compile the project
nvcc print_index.cu -o app -I /usr/local/cuda/include/
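For a .cpp file to call into a .cu file, a common pattern (a minimal sketch; kernel.hpp, kernel.cu, main.cpp, and launch_print_idx are hypothetical names, not the course code) is to expose the kernel launch through a plain C++ wrapper function:

// kernel.hpp (hypothetical): plain C++ header, safe to include from .cpp files
void launch_print_idx(int gridSize, int blockSize);

// kernel.cu (hypothetical): compiled by nvcc
#include <cuda_runtime.h>
#include <cstdio>
#include "kernel.hpp"

__global__ void print_idx_kernel(){
    printf("block idx: %3d, thread idx: %3d\n", blockIdx.x, threadIdx.x);
}

void launch_print_idx(int gridSize, int blockSize){
    print_idx_kernel<<<gridSize, blockSize>>>();
    cudaDeviceSynchronize();
}

// main.cpp (hypothetical): compiled by g++, only sees the wrapper declaration
#include "kernel.hpp"
int main(){
    launch_print_idx(2, 4);
    return 0;
}

The Makefile then compiles each translation unit with its own compiler and links against cudart:

nvcc -c kernel.cu -o kernel.o
g++ -c main.cpp -o main.o
g++ main.o kernel.o -o app -L /usr/local/cuda/lib64 -lcudart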
- Purpose of CUDA_CHECK
When a CUDA call fails, it tells you where the error occurred.
#define CUDA_CHECK(call) { \
    cudaError_t error = call; \
    if (error != cudaSuccess) { \
        printf("ERROR: %s:%d, ", __FILE__, __LINE__); \
        printf("CODE:%d, DETAIL:%s\n", error, cudaGetErrorString(error)); \
        exit(1); \
    } \
}
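A usage sketch (d_data and the sizes are illustrative): wrap every runtime call so a failure prints the file and line before exiting.

float *d_data = nullptr;
CUDA_CHECK(cudaMalloc(&d_data, 1024 * sizeof(float)));    // file/line printed if this fails
CUDA_CHECK(cudaMemset(d_data, 0, 1024 * sizeof(float)));
CUDA_CHECK(cudaFree(d_data));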
3. Matrix multiplication (matmul) with CUDA, error handling, and querying hardware info
1) Matrix multiplication
- Goal
Understand how CUDA accelerates matrix multiplication and what tiling is for; a sketch of the naive kernel follows below.
- Project layout
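The course code is not reproduced here, but for reference, a minimal sketch of the naive version (matmul_naive_kernel and its parameters are illustrative names; assumes square, row-major matrices):

// Each thread computes one element of C = A * B (width x width, row-major).
__global__ void matmul_naive_kernel(const float *A, const float *B, float *C, int width){
    int x = blockIdx.x * blockDim.x + threadIdx.x;   // column of C
    int y = blockIdx.y * blockDim.y + threadIdx.y;   // row of C
    if (x >= width || y >= width) return;

    float sum = 0.0f;
    for (int k = 0; k < width; k++){
        sum += A[y * width + k] * B[k * width + x];  // dot product of row y of A and column x of B
    }
    C[y * width + x] = sum;
}

Tiling improves on this by staging sub-blocks (tiles) of A and B in shared memory so each global-memory element is loaded once per tile rather than once per thread; that is the topic of the shared-memory section below.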
2) Error handling
- Project layout
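One detail worth noting: a kernel launch itself does not return a cudaError_t, so the usual pattern (a sketch reusing the CUDA_CHECK macro above; grid, block, and the device pointers are illustrative) checks cudaGetLastError after the launch:

matmul_naive_kernel<<<grid, block>>>(d_A, d_B, d_C, width);
CUDA_CHECK(cudaGetLastError());        // catches invalid launch configurations
CUDA_CHECK(cudaDeviceSynchronize());   // surfaces errors raised while the kernel runs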
3) Querying hardware info
- Goal
Learn to display GPU hardware information with the CUDA runtime API, and understand why that information matters.
- Project layout
- Printed output
- Code
#include <cuda_runtime.h>
#include <stdio.h>

// LOG is assumed to be a printf-style logging macro from the project's utils;
// a minimal stand-in so this file compiles on its own:
#define LOG(...) do { printf(__VA_ARGS__); printf("\n"); } while (0)

int main(){
    int count;
    int index = 0;
    cudaGetDeviceCount(&count);
    while (index < count) {
        cudaSetDevice(index);
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, index);
        LOG("%-40s", "*********************Architecture related**********************");
        LOG("%-40s%d%s", "Device id: ", index, "");
        LOG("%-40s%s%s", "Device name: ", prop.name, "");
        LOG("%-40s%.1f%s", "Device compute capability: ", prop.major + (float)prop.minor / 10, "");
        LOG("%-40s%.2f%s", "GPU global memory size: ", (float)prop.totalGlobalMem / (1<<30), "GB");
        LOG("%-40s%.2f%s", "L2 cache size: ", (float)prop.l2CacheSize / (1<<20), "MB");
        LOG("%-40s%.2f%s", "Shared memory per block: ", (float)prop.sharedMemPerBlock / (1<<10), "KB");
        LOG("%-40s%.2f%s", "Shared memory per SM: ", (float)prop.sharedMemPerMultiprocessor / (1<<10), "KB");
        LOG("%-40s%.2f%s", "Device clock rate: ", prop.clockRate*1E-6, "GHz");
        LOG("%-40s%.2f%s", "Device memory clock rate: ", prop.memoryClockRate*1E-6, "GHz");
        LOG("%-40s%d%s", "Number of SM: ", prop.multiProcessorCount, "");
        LOG("%-40s%d%s", "Warp size: ", prop.warpSize, "");
        LOG("%-40s", "*********************Parameter related************************");
        LOG("%-40s%d%s", "Max block numbers: ", prop.maxBlocksPerMultiProcessor, "");
        LOG("%-40s%d%s", "Max threads per block: ", prop.maxThreadsPerBlock, "");
        LOG("%-40s%d:%d:%d%s", "Max block dimension size:", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2], "");
        LOG("%-40s%d:%d:%d%s", "Max grid dimension size: ", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2], "");
        index++;
        printf("\n");
    }
    return 0;
}
- Why these parameters matter
Compiling .cu code often requires passing architecture information to nvcc, so it helps to print the GPU's properties first. Shared-memory usage in particular is critical for CUDA performance (the split between shared memory and L1 cache can even be adjusted at run time), and knowing how many threads make up a warp, the unit in which the hardware schedules threads, helps you pick block sizes that keep the SMs well utilized.
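For example (a sketch; sm_86 is illustrative, matching a card that reports compute capability 8.6 in the output above):

nvcc matmul.cu -o matmul -arch=sm_86

The shared memory / L1 split mentioned above can be requested from the runtime with cudaDeviceSetCacheConfig(cudaFuncCachePreferShared).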