一. block,grid 的 idx & dim
注意区分threadIdx,blockIdx
1.1 blockIdx
每一个线程在cuda运行时唯一初始化的blockIdx变量只取决于所属的坐标,blockIdx同样也是dim3类型
1.1. 对比blockIdx和threadIdx
blockIdx只取决于当前block在grid中的位置
1.2 blockDim
1.3 gridDim
和 blockDim的区别主要是block和grid的区别:block是组成grid的较小单元
1.4代码展示
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
__global__ void print_details()
{
printf(
"blockIdx.x : %d , blockIdx.y : %d,blockIdx.z : %d ,blockDim.x : %d , blockDim.y : %d,gridDim.x : %d , gridDim.y : %d \n",
blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, gridDim.x, gridDim.y
);
}
int main()
{
int nx, ny;
nx = 16;
ny = 16;
dim3 block(8, 8);
dim3 grid(nx / block.x, ny / block.y);
print_details << <grid, block >> > ();
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}
blockDim,gridDim有着基本相同的值
二.使用内核访问一维block
2.1 只使用threadIdx
不是唯一指定,因此只会访问前四个
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
__global__ void unique_idx_calc_threadIdx(int * input)
{
int tid = threadIdx.x;
printf("threadIdx : %d ,value: %d \n", tid, input[tid]);
}
int main()
{
int arrary_size = 8;
int arrary_byte_size = sizeof(int) * arrary_size;
int h_data[] = { 1,34,56,54,23,67,93,30 };
for (int i = 0; i < arrary_size; i++)
{
printf("%d ", h_data[i]);
}
printf("\n");
//传递到数字指针
int* d_data;
cudaMalloc((void**)&d_data, arrary_byte_size);
cudaMemcpy(d_data, h_data, arrary_byte_size, cudaMemcpyHostToDevice);
//只会访问前一半数据
dim3 block(4);
dim3 grid(2);
//访问全部数据
//dim3 block(8);
//dim3 grid(1);
unique_idx_calc_threadIdx << <grid, block >> > (d_data);
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}
2.2 一维全局索引计算
因为每一个block,threadIdx都是从0开始按照相对位置进行初始化,因此需要一种方法唯一确定线程。
因此我们使用的方法是将threadIdx添加偏置量以便获得每一个线程的全局索引,即
__global__ void unique_gid_calculation(int* input)
{
int tid = threadIdx.x;
int offset = blockIdx.x * blockDim.x;
printf("blockIdx.x : %d ,threadIdx : %d,threadgid : %d ,value: %d \n",blockIdx.x,threadIdx.x, tid+ offset, input[tid + offset]);
}
三.使用全局索引访问二维block
最终推得
__global__ void unique_gid_calculation_2d(int* input)
{
int tid = threadIdx.x;
int block_offset = blockIdx.x * blockDim.x;
int row_offset = blockDim.x * gridDim.x * blockIdx.y;
int gid = tid + row_offset+ block_offset;
printf("blockIdx.x : %d ,blockIdx.y : %d, tid : %d,gid : %d ,value: %d \n",
blockIdx.x, blockIdx.y, tid , gid, input[gid]);
}
四.访问二维grid
比较推荐的计算全局索引的方式是在同一个线程块中的线程使用连续的地址或元素
进行访问
基于上述:
1.tid :在自身block的相对位置
2. block offset :水平方向上的偏差量
3. row_offset:竖直方向上的偏移量
__global__ void unique_gid_calculation_2d_2d(int* input)
{
int tid = blockDim.x * threadIdx.y + threadIdx.x;
int num_threads_in_a_block = blockDim.x * blockDim.y;
int block_offset = blockIdx.x * num_threads_in_a_block;
int num_threads_in_a_row = num_threads_in_a_block * gridDim.x;
int row_offset = num_threads_in_a_row * blockIdx.y;
int gid = tid + row_offset + block_offset;
printf("blockIdx.x : %d ,blockIdx.y : %d, tid : %d,gid : %d ,value: %d \n",
blockIdx.x, blockIdx.y, tid, gid, input[gid]);
}