Nsight-Compute Global Load相关Metric测试
- 1.参考链接
- 2.生成测试用例
- 3.编译
- 4.Profiling并将结果导出到csv文件
- 5.截图
本文使用 PTX 指令直接从 global memory 获取数据,以了解相关 metrics 及其计算过程。
1.参考链接
- PTX Cache Operators
- PTX LD指令
- Kernel Profiling Guide Caches
2.生成测试用例
tee sample_2.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
// Copies one float per thread from d_in to d_out using explicit PTX memory
// instructions, so the global load/store path can be observed in the profiler.
//
// Parameters:
//   d_in  - device pointer to the source floats.
//   d_out - device pointer to the destination floats.
//
// Precondition: there is no bounds check, so both buffers must hold at least
// gridDim.x * blockDim.x floats (the launch in main() is sized to match).
__global__ void kernel2(float *d_in, float *d_out) {
    float d;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    // ld.global.cv ("cache volatile") and st.global.wt ("write-through") are
    // the PTX cache operators chosen so that the accesses are not served from
    // cache and go out to memory.
    // `volatile` plus the "memory" clobber stop the compiler from deleting,
    // duplicating, or reordering these statements: without them the compiler
    // assumes an asm statement only affects its declared output operands.
    asm volatile("ld.global.cv.f32 %0, [%1];" : "=f"(d) : "l"(&d_in[tid]) : "memory");
    asm volatile("st.global.wt.f32 [%0],%1;" :: "l"(&d_out[tid]), "f"(d) : "memory");
}
// Allocates the input/output device buffers, launches kernel2 once with one
// warp per SMSP, waits for completion, and releases the buffers.
//
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails;
// failures are reported on stderr so stdout (consumed by the profiling
// pipeline) is unaffected.
int main() {
    // Launch geometry. The SM and SMSP counts are hard-coded.
    // NOTE(review): presumably 28 SMs x 4 SMSPs matches the GPU being
    // profiled - confirm for your device.
    const int sm_count = 28;
    const int smsp_count = 4;
    const int warpsize = 32;
    const int total_count = sm_count * smsp_count * warpsize;
    const size_t buffer_bytes = total_count * sizeof(float);

    // Reports a failed CUDA call and tells the caller whether it succeeded.
    auto succeeded = [](cudaError_t status, const char *what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    float *d_in = nullptr;
    float *d_out = nullptr;

    bool ok = succeeded(cudaMalloc((void**)&d_in, buffer_bytes), "cudaMalloc(d_in)") &&
              succeeded(cudaMalloc((void**)&d_out, buffer_bytes), "cudaMalloc(d_out)");

    if (ok) {
        // One block per SM; each block holds smsp_count warps, i.e. one warp
        // per SMSP.
        kernel2<<<sm_count, warpsize * smsp_count>>>(d_in, d_out);
        // A launch does not return an error itself: configuration errors
        // surface through cudaGetLastError(), execution errors at the sync.
        ok = succeeded(cudaGetLastError(), "kernel2 launch") &&
             succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    // cudaFree on a null pointer performs no operation, so this is safe even
    // when an allocation above failed.
    cudaFree(d_in);
    cudaFree(d_out);
    return ok ? 0 : 1;
}
EOF
3.编译
# Build the executable that is profiled in the next step; -lineinfo embeds
# source line information in the binary.
# NOTE(review): this build passes no -arch/-O2, while the PTX/SASS dumps below
# use -O2 -arch=sm_86, so the inspected SASS may differ from what is profiled.
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_2.cu -o sample_2
# Emit PTX for sm_86 and print it, to inspect the generated ld/st instructions.
/usr/local/cuda/bin/nvcc -std=c++17 -O2 -arch=sm_86 -ptx sample_2.cu -o sample_2.ptx
cat sample_2.ptx
# Assemble that PTX into a cubin and dump its SASS.
/usr/local/cuda/bin/nvcc -O2 -arch=sm_86 sample_2.ptx -cubin -o sample_2.cubin
/usr/local/cuda/bin/cuobjdump --dump-sass sample_2.cubin
4.Profiling并将结果导出到csv文件
# Profile ./sample_2 with Nsight Compute, collecting the metrics listed below
# (SM/SMSP instruction counts, L1TEX, L2 "lts", DRAM, and cycle counters) in
# CSV form. `tail -n +3` drops the first two output lines (presumably
# non-CSV banner lines - verify), and `tee` both prints the result and saves
# it to summary_v2.csv.
/usr/local/NVIDIA-Nsight-Compute/ncu --csv --metrics \
smsp__sass_inst_executed_op_global_ld.sum,\
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,\
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum,\
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum,\
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum,\
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum.pct_of_peak_sustained_elapsed,\
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum.peak_sustained,\
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,\
sm__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ldgsts_cache_access.sum,\
sm__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum,\
l1tex__t_sector_pipe_lsu_mem_global_op_ld_hit_rate.pct,\
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.sum,\
smsp__sass_l1tex_m_xbar2l1tex_read_sectors_mem_global_op_ldgsts_cache_bypass.sum,\
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.sum.pct_of_peak_sustained_elapsed,\
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.sum.peak_sustained,\
smsp__sass_l1tex_m_xbar2l1tex_read_sectors_mem_global_op_ldgsts_cache_bypass.sum.pct_of_peak_sustained_elapsed,\
smsp__sass_l1tex_m_xbar2l1tex_read_sectors_mem_global_op_ldgsts_cache_bypass.sum.peak_sustained,\
l1tex__lsu_writeback_active_mem_lg.sum,\
l1tex__lsu_writeback_active_mem_lg.sum.pct_of_peak_sustained_elapsed,\
l1tex__lsu_writeback_active_mem_lg.sum.peak_sustained,\
lts__t_requests_srcunit_tex_op_read.sum,\
lts__t_sectors_srcunit_tex_op_read.sum,\
lts__t_sectors_srcunit_tex_op_read.avg.pct_of_peak_sustained_elapsed,\
lts__t_sectors_srcunit_tex_op_read.avg.peak_sustained,\
lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,\
lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum,\
lts__t_sectors_srcunit_tex_op_read.sum.per_second,\
lts__t_sectors_srcunit_tex_aperture_device_op_read_lookup_miss.sum,\
lts__t_sectors_srcunit_tex_aperture_sysmem_op_read_lookup_miss.sum,\
lts__t_sectors_srcunit_tex_aperture_peer_op_read_lookup_miss.sum,\
dram__sectors_read.sum,\
dram__bytes_read.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_read.sum.peak_sustained,\
dram__bytes_read.sum,\
l1tex__cycles_active,\
lts__cycles_active,\
dram__cycles_active,\
smsp__cycles_active,\
sm__cycles_active,\
sm__cycles_elapsed,\
l1tex__cycles_elapsed,\
lts__cycles_elapsed,\
smsp__cycles_elapsed,\
dram__cycles_elapsed,\
lts__cycles_elapsed.avg.per_second,\
dram__cycles_elapsed.avg.per_second,\
dram__bytes_read.sum.per_second ./sample_2 | tail -n +3 | tee summary_v2.csv
5.截图