CUDA __threadfence测试
- 一.测试小结
- 二.复现过程
- 三.截图
测试CUDA __threadfence的行为
一.测试小结
- 测例0:没有任何同步,执行到left+=t0时,left的数据未加载完成,出现long scoreboard的stall 405次
- 测例1:__threadfence会等待memory数据加载完成,left+=t0没有出现long scoreboard的stall
- 测例2:fence.proxy.alias相当于执行二次__threadfence
- 测例3:__syncthreads不会等待memory数据加载完成.因此,执行到left+=t0时,会出现stall
二.复现过程
tee threadfence_test.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
// CHECK_CUDA(call): evaluates a CUDA runtime call exactly once. If the result
// is not cudaSuccess, it prints the file, line, numeric error code and the
// cudaGetErrorString() text to std::cerr and exits with EXIT_FAILURE.
// The argument is parenthesized and the temporary has a macro-private name so
// that a call expression mentioning a caller variable named `err` is not
// captured by the macro's own local.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t check_cuda_err_ = (call); \
        if (check_cuda_err_ != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
            std::cerr << " code=" << check_cuda_err_ << " (" << cudaGetErrorString(check_cuda_err_) << ")" << std::endl; \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Test case 0 (baseline): nothing is placed between the load of addr[tid] and
// the later use of its result.
// Each thread reads addr[tid], multiplies it by its global thread index, adds
// a clock64() sample converted to float, and writes the sum back to addr[tid].
// There is no bounds check: addr must hold at least gridDim.x * blockDim.x floats.
// The accompanying write-up reports long-scoreboard stalls at `left+=t0` here.
__global__ void kernel_add_float_0(float *addr)
{
// Flat global thread index for a 1D launch.
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Load this thread's element and scale it by the thread index.
float left=addr[tid]*tid;
// Value that does not depend on the load above.
float t0=(float)clock64();
// Statement the profiling results in the write-up are attributed to.
left+=t0;
addr[tid]=left;
}
// Test case 1: same arithmetic as kernel_add_float_0, but with __threadfence()
// between the load of addr[tid] and the clock64() read.
// There is no bounds check: addr must hold at least gridDim.x * blockDim.x floats.
// The accompanying write-up reports no long-scoreboard stall at `left+=t0` here.
__global__ void kernel_add_float_1(float *addr)
{
// Flat global thread index for a 1D launch.
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Load this thread's element and scale it by the thread index.
float left=addr[tid]*tid;
// Memory fence under test.
__threadfence();
float t0=(float)clock64();
// Statement the profiling results in the write-up are attributed to.
left+=t0;
addr[tid]=left;
}
// Test case 2: same arithmetic as kernel_add_float_0, but with the PTX
// instruction `fence.proxy.alias` (inline asm) between the load of addr[tid]
// and the clock64() read.
// There is no bounds check: addr must hold at least gridDim.x * blockDim.x floats.
// NOTE(review): confirm in the PTX ISA which PTX/SM versions accept
// fence.proxy.alias before building for a different target architecture.
__global__ void kernel_add_float_2(float *addr)
{
// Flat global thread index for a 1D launch.
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Load this thread's element and scale it by the thread index.
float left=addr[tid]*tid;
// Fence under test; `volatile` plus the "memory" clobber keep the compiler
// from dropping the asm statement or moving memory accesses across it.
asm volatile ("fence.proxy.alias;" ::: "memory");
float t0=(float)clock64();
// Statement the profiling results in the write-up are attributed to.
left+=t0;
addr[tid]=left;
}
// Test case 3: same arithmetic as kernel_add_float_0, but with __syncthreads()
// between the load of addr[tid] and the clock64() read.
// The barrier is reached unconditionally by every thread (the kernel has no branches).
// There is no bounds check: addr must hold at least gridDim.x * blockDim.x floats.
// The accompanying write-up reports that a stall still appears at `left+=t0` here.
__global__ void kernel_add_float_3(float *addr)
{
// Flat global thread index for a 1D launch.
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Load this thread's element and scale it by the thread index.
float left=addr[tid]*tid;
// Block-wide barrier under test.
__syncthreads();
float t0=(float)clock64();
// Statement the profiling results in the write-up are attributed to.
left+=t0;
addr[tid]=left;
}
// Host driver for the fence experiment.
// Selects device 0, allocates one float per launched thread (28 blocks of 128
// threads), zero-fills the buffer, then runs the four test kernels one after
// another on the same buffer. Each launch is followed by cudaGetLastError()
// (reports launch failures) and cudaDeviceSynchronize() (reports execution
// failures and makes the kernel finish before the next one starts).
// Any CUDA error terminates the process through CHECK_CUDA.
int main(int argc,char *argv[])
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    const int deviceid = 0;
    CHECK_CUDA(cudaSetDevice(deviceid));

    const int block_count = 28;
    const int block_size = 32 * 4;
    const int thread_size = block_count * block_size;
    const size_t bytes = static_cast<size_t>(thread_size) * sizeof(float);

    float *addr = nullptr;
    CHECK_CUDA(cudaMalloc(&addr, bytes));
    // The kernels read addr[tid] before writing it; give them defined input.
    CHECK_CUDA(cudaMemset(addr, 0, bytes));
    printf("%-64s:","cudaMalloc");

    kernel_add_float_0<<<block_count, block_size>>>(addr);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    kernel_add_float_1<<<block_count, block_size>>>(addr);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    kernel_add_float_2<<<block_count, block_size>>>(addr);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    kernel_add_float_3<<<block_count, block_size>>>(addr);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    // Release the device buffer (the original never freed it).
    CHECK_CUDA(cudaFree(addr));
    return 0;
}
EOF
# Build the test for sm_86 with line information for device code.
/usr/local/cuda/bin/nvcc \
    -std=c++17 -arch=sm_86 -lineinfo \
    -o threadfence_test threadfence_test.cu \
    -I /usr/local/cuda/include \
    -L /usr/local/cuda/lib64 -lcuda
# Profile the binary with Nsight Compute using the full section set and
# export the report, overwriting an existing one (-f).
/usr/local/NVIDIA-Nsight-Compute/ncu \
    --warp-sampling-interval 0 --set full --target-processes all \
    --export ncu_report_threadfence_test -f \
    ./threadfence_test
三.截图
-
测例0
-
测例1
-
测例2
-
测例3