nvprof 是 CUDA Toolkit 自带的命令行性能分析工具(一个可执行文件),使用 Everything 搜索可以看到它所在的位置:
执行命令:
nvprof exe_name
如何在 Windows 下使用,可以参看:《Windows 下使用 nvcc 和 nvprof》。
示例
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
// Forward declaration of the error-reporting helper; its definition appears later in this file.
// Arguments, in order: source file name, line number, text of the checked statement, CUDA status code.
static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
// Wraps a CUDA runtime call: forwards the call site (__FILE__, __LINE__), the stringified
// expression (#value) and the expression's cudaError_t result to CheckCudaErrorAux.
// Usage: CUDA_CHECK_RETURN(cudaMalloc(...));
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
// Device function: returns the sum of its two float arguments.
// Declared __device__, so it is callable from device code only.
__device__ float add(const float x, const float y)
{
    const float sum = x + y;
    return sum;
}
/*
 * Element-wise vector addition kernel: C[i] = add(A[i], B[i]) for every i in [0, N).
 *
 * The element index is derived from the x components of blockIdx, blockDim and
 * threadIdx only, one element per thread. Threads whose index is >= N do nothing,
 * so the launch may cover more threads than there are elements.
 *
 * A, B : input arrays, read at index i
 * C    : output array, written at index i
 * N    : number of elements to process
 */
__global__ void addFromGPU(float* A, float* B, float* C, const int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        C[idx] = add(A[idx], B[idx]);
    }
}
/**
 * Fills a host array with pseudo-random floats.
 *
 * Each element is set to (rand() & 0xFFF) / 100.f, i.e. a value in [0, 40.95].
 * The generated sequence depends on the current rand() state; call srand()
 * beforehand to get reproducible data.
 *
 * @param addr    Destination array; must have room for at least nCount floats.
 *                A null pointer makes the call a no-op.
 * @param nCount  Number of elements to fill. Zero or a negative value makes
 *                the call a no-op.
 */
void initialData(float* addr, int nCount)
{
    if (addr == nullptr || nCount <= 0)
    {
        return;
    }
    // The index is an int so it matches nCount's type: comparing an unsigned
    // (size_t) index against a negative int would convert the count to a huge
    // unsigned value and run far past the end of the buffer.
    for (int i = 0; i < nCount; ++i)
    {
        addr[i] = (float)(rand() & 0xFFF) / 100.f;
    }
}
/**
 * Sample program: adds two float vectors of 4096 * 10 elements on the GPU.
 *
 * Steps: allocate and zero host/device buffers, fill the two inputs with
 * pseudo-random data (fixed seed 666), copy them to the device, launch
 * addFromGPU with 32 threads per block, copy the result back to the host,
 * release all memory and print a completion banner.
 *
 * Every CUDA runtime call, including the kernel launch and the frees, is
 * checked through CUDA_CHECK_RETURN, which terminates the process on failure.
 *
 * @return 0 on success.
 */
int main()
{
    const int iElemntCount = 4096 * 10;
    const size_t stBytesCount = iElemntCount * sizeof(float); // size of each vector in bytes

    // Allocate and zero the host buffers.
    float* fpHost_A = new float[iElemntCount];
    float* fpHost_B = new float[iElemntCount];
    float* fpHost_C = new float[iElemntCount];
    memset(fpHost_A, 0, stBytesCount);
    memset(fpHost_B, 0, stBytesCount);
    memset(fpHost_C, 0, stBytesCount);

    // Allocate the device buffers and zero the output buffer.
    float* fpDevice_A = nullptr;
    float* fpDevice_B = nullptr;
    float* fpDevice_C = nullptr;
    CUDA_CHECK_RETURN(cudaMalloc((void**)&fpDevice_A, stBytesCount));
    CUDA_CHECK_RETURN(cudaMalloc((void**)&fpDevice_B, stBytesCount));
    CUDA_CHECK_RETURN(cudaMalloc((void**)&fpDevice_C, stBytesCount));
    CUDA_CHECK_RETURN(cudaMemset(fpDevice_C, 0, stBytesCount));

    // Fixed seed so both input vectors are reproducible between runs.
    srand(666);
    initialData(fpHost_A, iElemntCount);
    initialData(fpHost_B, iElemntCount);

    // Upload the inputs.
    CUDA_CHECK_RETURN(cudaMemcpy(fpDevice_A, fpHost_A, stBytesCount, cudaMemcpyHostToDevice));
    CUDA_CHECK_RETURN(cudaMemcpy(fpDevice_B, fpHost_B, stBytesCount, cudaMemcpyHostToDevice));

    // One thread per element; the grid size is rounded up so every element is covered.
    dim3 block(32);
    dim3 grid((iElemntCount + block.x - 1) / block.x);
    addFromGPU<<<grid, block>>>(fpDevice_A, fpDevice_B, fpDevice_C, iElemntCount);

    // A kernel launch returns no status itself: cudaGetLastError() reports an
    // invalid launch configuration, and the synchronize reports faults raised
    // while the kernel was running.
    CUDA_CHECK_RETURN(cudaGetLastError());
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    // Bring the result back to the host; without this copy fpHost_C stays all zeros
    // and the computed sum is discarded.
    CUDA_CHECK_RETURN(cudaMemcpy(fpHost_C, fpDevice_C, stBytesCount, cudaMemcpyDeviceToHost));

    // Release device memory, then host memory.
    CUDA_CHECK_RETURN(cudaFree(fpDevice_A));
    CUDA_CHECK_RETURN(cudaFree(fpDevice_B));
    CUDA_CHECK_RETURN(cudaFree(fpDevice_C));
    delete[] fpHost_A;
    delete[] fpHost_B;
    delete[] fpHost_C;
    fpHost_A = nullptr;
    fpHost_B = nullptr;
    fpHost_C = nullptr;

    printf("***********finish**************\n");
    return 0;
}
/*
 * Reports a failed CUDA runtime call and terminates the process.
 *
 * Does nothing when err is cudaSuccess. Otherwise writes the statement text,
 * the error name, the error string, the numeric error code and the file:line
 * location to std::cerr, then calls exit(1).
 *
 * file      : source file of the checked call
 * line      : line number of the checked call
 * statement : text of the checked expression
 * err       : status returned by the checked expression
 */
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
    if (err != cudaSuccess)
    {
        std::cerr << statement << " returned: " << cudaGetErrorName(err);
        std::cerr << " \t : " << cudaGetErrorString(err) << "(" << err;
        std::cerr << ") at " << file << ":" << line << std::endl;
        exit(1);
    }
}
分析结果:
分析结果中包含:命令耗时、调用次数、平均用时、最小用时、最大用时以及命令名(分别对应 nvprof 输出中的 Time、Calls、Avg、Min、Max、Name 各列)。