2.1.1 检查块和线程索引
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime.h>
/*
 * CHECK(call): evaluate a CUDA runtime API call once and terminate the
 * program if it did not return cudaSuccess.
 *
 * On failure it reports the source file and line of the CHECK invocation,
 * the numeric error code and the text from cudaGetErrorString(), then
 * calls exit(1).
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves
 * as a single statement (safe inside an un-braced if/else). The argument is
 * parenthesized, and the temporary has an unusual name so it cannot shadow
 * a variable used inside `call`. Diagnostics are written to stderr.
 */
#define CHECK(call)                                                          \
    do {                                                                     \
        const cudaError_t check_err_ = (call);                               \
        if (check_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "Error: %s: %d\n", __FILE__, __LINE__);          \
            fprintf(stderr, "code :%d reason :%s\n", (int)check_err_,        \
                    cudaGetErrorString(check_err_));                         \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * Fill the first `size` elements of `ip` with their own index, i.e.
 * ip[0] = 0, ip[1] = 1, ... ip[size - 1] = size - 1.
 * Nothing is written when `size` is zero or negative.
 */
void initialInt( int * ip, int size)
{
    int value = 0;
    while (value < size)
    {
        ip[value] = value;
        ++value;
    }
}
/*
 * Print a matrix stored contiguously row by row.
 *
 * C  : pointer to the first element; nx * ny ints are read.
 * nx : number of elements per printed row.
 * ny : number of printed rows.
 *
 * Output is a header line with (nx, ny), then one text line per row with
 * each element formatted as "%3d", then a blank line.
 */
void printMatrix(int *C, const int nx, const int ny)
{
    const int *cursor = C;

    printf("\n matrix : (%d, %d)\n", nx, ny);

    for (int row = 0; row < ny; ++row)
    {
        /* The cursor walks straight through the buffer, one row at a time. */
        for (int col = 0; col < nx; ++col)
        {
            printf("%3d", *cursor);
            ++cursor;
        }
        printf("\n");
    }

    printf("\n");
}
/*
 * printThreadIndex: debugging kernel that makes every thread print its
 * thread index, block index, 2D matrix coordinate, flattened offset and
 * the value stored at that offset in A.
 *
 * A      : device pointer to a matrix of nx * ny ints, laid out so that
 *          element (ix, iy) lives at offset iy * nx + ix.
 * nx, ny : matrix extent along x and y.
 *
 * Expects a 2D launch (block.x/.y and grid.x/.y are used). The grid may
 * overshoot the matrix; threads whose coordinate falls outside
 * [0, nx) x [0, ny) return without touching A.
 */
__global__ void printThreadIndex(int *A, const int nx, const int ny)
{
    /* Global 2D coordinate of this thread. */
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;

    /*
     * A ceil-div grid launches extra threads whenever nx or ny is not a
     * multiple of the block shape; they must not read A out of bounds.
     */
    if (ix >= nx || iy >= ny)
    {
        return;
    }

    /* Flattened offset of (ix, iy). */
    unsigned int idx = iy*nx + ix;

    printf("threadidx: (%d ,%d) blockidx:(%d ,%d) coordinate: (%d ,%d) global index: (%2d ival %2d)\n",
           threadIdx.x, threadIdx.y,
           blockIdx.x, blockIdx.y,
           ix, iy,
           idx, A[idx]
    );
}
/*
 * Demo driver: builds an 8 x 6 int matrix on the host (values 0..47),
 * prints it, copies it to device 0 and launches printThreadIndex with a
 * (4, 2) block and a ceil-div grid so each thread reports its indices.
 *
 * Every CUDA runtime call, including the kernel launch, goes through
 * CHECK, so a failure stops the program with a message instead of being
 * silently ignored. Returns 0 on success.
 */
int main(int argc , char **argv)
{
    (void)argc; /* unused */

    printf("%s starting\n", argv[0]);

    /* Select and report the device. */
    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    // set matrix
    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
    /* The matrix holds ints, so size the buffers with sizeof(int). */
    size_t nBytes = (size_t)nxy * sizeof(int);

    // malloc host memory
    int *h_A = (int *) malloc(nBytes);
    if (h_A == NULL)
    {
        fprintf(stderr, "Error: host allocation of %lu bytes failed\n",
                (unsigned long)nBytes);
        return 1;
    }

    // initial int
    initialInt(h_A, nxy);
    printMatrix(h_A, nx, ny);

    // device
    int *d_MatA = NULL;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));

    /* Ceil-div so the grid covers the whole matrix for any nx / ny. */
    dim3 block(4,2);
    dim3 grid ((nx + block.x - 1)/block.x, (ny + block.y - 1)/ block.y);
    printf("execution config grid (%d, %d), block (%d, %d)\n", grid.x, grid.y, block.x, block.y);

    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
    /* A launch returns no status itself: fetch launch-configuration errors. */
    CHECK(cudaGetLastError());
    /* Wait for the kernel so its printf output is flushed and any
       execution error is reported here. */
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaFree(d_MatA));
    free(h_A);

    CHECK(cudaDeviceReset());
    return 0;
}
生成的 8×6 二维矩阵(nx = 8, ny = 6)在主机(host)端的内容如下:
matrix : (8, 6)
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47
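因为 block(4,2) 定义了 block.x = 4、block.y = 2,按向上取整的方式计算:grid.x = (nx + block.x - 1) / block.x = (8 + 4 - 1) / 4 = 2,grid.y = (ny + block.y - 1) / block.y = (6 + 2 - 1) / 2 = 3,所以 grid 的尺寸为 grid(2,3),即 grid.x = 2, grid.y = 3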
随后 printThreadIndex 核函数的输出如下(各线程块的输出先后顺序不固定,下面只是一次运行的结果):
threadidx: (0 ,0) blockidx:(0 ,1) coordinate: (0 ,2) global index: (16 ival 16)
threadidx: (1 ,0) blockidx:(0 ,1) coordinate: (1 ,2) global index: (17 ival 17)
threadidx: (2 ,0) blockidx:(0 ,1) coordinate: (2 ,2) global index: (18 ival 18)
threadidx: (3 ,0) blockidx:(0 ,1) coordinate: (3 ,2) global index: (19 ival 19)
threadidx: (0 ,1) blockidx:(0 ,1) coordinate: (0 ,3) global index: (24 ival 24)
threadidx: (1 ,1) blockidx:(0 ,1) coordinate: (1 ,3) global index: (25 ival 25)
threadidx: (2 ,1) blockidx:(0 ,1) coordinate: (2 ,3) global index: (26 ival 26)
threadidx: (3 ,1) blockidx:(0 ,1) coordinate: (3 ,3) global index: (27 ival 27)
threadidx: (0 ,0) blockidx:(1 ,1) coordinate: (4 ,2) global index: (20 ival 20)
threadidx: (1 ,0) blockidx:(1 ,1) coordinate: (5 ,2) global index: (21 ival 21)
threadidx: (2 ,0) blockidx:(1 ,1) coordinate: (6 ,2) global index: (22 ival 22)
threadidx: (3 ,0) blockidx:(1 ,1) coordinate: (7 ,2) global index: (23 ival 23)
threadidx: (0 ,1) blockidx:(1 ,1) coordinate: (4 ,3) global index: (28 ival 28)
threadidx: (1 ,1) blockidx:(1 ,1) coordinate: (5 ,3) global index: (29 ival 29)
threadidx: (2 ,1) blockidx:(1 ,1) coordinate: (6 ,3) global index: (30 ival 30)
threadidx: (3 ,1) blockidx:(1 ,1) coordinate: (7 ,3) global index: (31 ival 31)
threadidx: (0 ,0) blockidx:(1 ,0) coordinate: (4 ,0) global index: ( 4 ival 4)
threadidx: (1 ,0) blockidx:(1 ,0) coordinate: (5 ,0) global index: ( 5 ival 5)
threadidx: (2 ,0) blockidx:(1 ,0) coordinate: (6 ,0) global index: ( 6 ival 6)
threadidx: (3 ,0) blockidx:(1 ,0) coordinate: (7 ,0) global index: ( 7 ival 7)
threadidx: (0 ,1) blockidx:(1 ,0) coordinate: (4 ,1) global index: (12 ival 12)
threadidx: (1 ,1) blockidx:(1 ,0) coordinate: (5 ,1) global index: (13 ival 13)
threadidx: (2 ,1) blockidx:(1 ,0) coordinate: (6 ,1) global index: (14 ival 14)
threadidx: (3 ,1) blockidx:(1 ,0) coordinate: (7 ,1) global index: (15 ival 15)
threadidx: (0 ,0) blockidx:(0 ,2) coordinate: (0 ,4) global index: (32 ival 32)
threadidx: (1 ,0) blockidx:(0 ,2) coordinate: (1 ,4) global index: (33 ival 33)
threadidx: (2 ,0) blockidx:(0 ,2) coordinate: (2 ,4) global index: (34 ival 34)
threadidx: (3 ,0) blockidx:(0 ,2) coordinate: (3 ,4) global index: (35 ival 35)
threadidx: (0 ,1) blockidx:(0 ,2) coordinate: (0 ,5) global index: (40 ival 40)
threadidx: (1 ,1) blockidx:(0 ,2) coordinate: (1 ,5) global index: (41 ival 41)
threadidx: (2 ,1) blockidx:(0 ,2) coordinate: (2 ,5) global index: (42 ival 42)
threadidx: (3 ,1) blockidx:(0 ,2) coordinate: (3 ,5) global index: (43 ival 43)
threadidx: (0 ,0) blockidx:(0 ,0) coordinate: (0 ,0) global index: ( 0 ival 0)
threadidx: (1 ,0) blockidx:(0 ,0) coordinate: (1 ,0) global index: ( 1 ival 1)
threadidx: (2 ,0) blockidx:(0 ,0) coordinate: (2 ,0) global index: ( 2 ival 2)
threadidx: (3 ,0) blockidx:(0 ,0) coordinate: (3 ,0) global index: ( 3 ival 3)
threadidx: (0 ,1) blockidx:(0 ,0) coordinate: (0 ,1) global index: ( 8 ival 8)
threadidx: (1 ,1) blockidx:(0 ,0) coordinate: (1 ,1) global index: ( 9 ival 9)
threadidx: (2 ,1) blockidx:(0 ,0) coordinate: (2 ,1) global index: (10 ival 10)
threadidx: (3 ,1) blockidx:(0 ,0) coordinate: (3 ,1) global index: (11 ival 11)
threadidx: (0 ,0) blockidx:(1 ,2) coordinate: (4 ,4) global index: (36 ival 36)
threadidx: (1 ,0) blockidx:(1 ,2) coordinate: (5 ,4) global index: (37 ival 37)
threadidx: (2 ,0) blockidx:(1 ,2) coordinate: (6 ,4) global index: (38 ival 38)
threadidx: (3 ,0) blockidx:(1 ,2) coordinate: (7 ,4) global index: (39 ival 39)
threadidx: (0 ,1) blockidx:(1 ,2) coordinate: (4 ,5) global index: (44 ival 44)
threadidx: (1 ,1) blockidx:(1 ,2) coordinate: (5 ,5) global index: (45 ival 45)
threadidx: (2 ,1) blockidx:(1 ,2) coordinate: (6 ,5) global index: (46 ival 46)
threadidx: (3 ,1) blockidx:(1 ,2) coordinate: (7 ,5) global index: (47 ival 47)