一. block，grid 的 idx & dim

注意区分threadIdx，blockIdx

1.1 blockIdx

每一个线程在cuda运行时唯一初始化的blockIdx变量只取决于所属的坐标，blockIdx同样也是dim3类型
在这里插入图片描述

1.1. 对比blockIdx和threadIdx

在这里插入图片描述 blockIdx只取决于当前block在grid中的位置

1.2 blockDim

在这里插入图片描述

1.3 gridDim

在这里插入图片描述
和 blockDim的区别主要是block和grid的区别：block是组成grid的较小单元

1.4代码展示

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void print_details()
{
	printf(
		"blockIdx.x : %d , blockIdx.y : %d,blockIdx.z : %d ,blockDim.x : %d , blockDim.y : %d,gridDim.x : %d , gridDim.y : %d \n",
		blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, gridDim.x, gridDim.y
	);
}

int main()
{

	int nx, ny;
	nx = 16;
	ny = 16;
	dim3 block(8, 8);
	dim3 grid(nx / block.x, ny / block.y);
	print_details << <grid, block >> > ();

	cudaDeviceSynchronize();

	cudaDeviceReset();

	return 0;
}

blockDim，gridDim有着基本相同的值

二.使用内核访问一维block

2.1 只使用threadIdx

在这里插入图片描述
不是唯一指定，因此只会访问前四个

在这里插入图片描述

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void unique_idx_calc_threadIdx(int * input)
{
	int tid = threadIdx.x;
	printf("threadIdx : %d ,value: %d \n", tid, input[tid]);

}

int main()
{
	int arrary_size = 8;
	int arrary_byte_size = sizeof(int) * arrary_size;
	int h_data[] = { 1,34,56,54,23,67,93,30 };

	for (int i = 0; i < arrary_size; i++)
	{
		printf("%d ", h_data[i]);
	}
	printf("\n");

	//传递到数字指针
	int* d_data;
	cudaMalloc((void**)&d_data, arrary_byte_size);
	cudaMemcpy(d_data, h_data, arrary_byte_size, cudaMemcpyHostToDevice);
	//只会访问前一半数据
	dim3 block(4);
	dim3 grid(2);

	//访问全部数据
	//dim3 block(8);
	//dim3 grid(1);
	unique_idx_calc_threadIdx << <grid, block >> > (d_data);

	cudaDeviceSynchronize();

	cudaDeviceReset();

	return 0;
}

2.2 一维全局索引计算

因为每一个block，threadIdx都是从0开始按照相对位置进行初始化，因此需要一种方法唯一确定线程。
在这里插入图片描述

因此我们使用的方法是将threadIdx添加偏置量以便获得每一个线程的全局索引，即
在这里插入图片描述

__global__ void unique_gid_calculation(int* input)
{
	int tid = threadIdx.x;
	int offset = blockIdx.x * blockDim.x;
	printf("blockIdx.x : %d ,threadIdx : %d,threadgid : %d ,value: %d \n",blockIdx.x,threadIdx.x, tid+ offset, input[tid + offset]);

}

三.使用全局索引访问二维block

在这里插入图片描述
最终推得

__global__ void unique_gid_calculation_2d(int* input)
{
	int tid = threadIdx.x;
	int block_offset = blockIdx.x * blockDim.x;
	int row_offset = blockDim.x * gridDim.x * blockIdx.y;
	int gid = tid + row_offset+ block_offset;
	printf("blockIdx.x : %d ,blockIdx.y : %d, tid : %d，gid : %d ,value: %d \n", 
		blockIdx.x, blockIdx.y, tid , gid, input[gid]);

}

四.访问二维grid

比较推荐的计算全局索引的方式是在同一个线程块中的线程使用连续的地址或元素进行访问
在这里插入图片描述
基于上述：
1.tid ：在自身block的相对位置

2. block offset ：水平方向上的偏差量

3. row_offset:竖直方向上的偏移量

__global__ void unique_gid_calculation_2d_2d(int* input)
{
	int tid = blockDim.x * threadIdx.y + threadIdx.x;
	int num_threads_in_a_block = blockDim.x * blockDim.y;

	int block_offset = blockIdx.x * num_threads_in_a_block;

	int num_threads_in_a_row = num_threads_in_a_block * gridDim.x;

	int row_offset = num_threads_in_a_row * blockIdx.y;
	int gid = tid + row_offset + block_offset;
	printf("blockIdx.x : %d ,blockIdx.y : %d, tid : %d，gid : %d ,value: %d \n",
		blockIdx.x, blockIdx.y, tid, gid, input[gid]);

}