【CUDA Learning Diary】2. The CUDA Programming Model


2.1 Summing arrays on the CPU

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

// fill an array with pseudo-random floats in [0, 25.5]
// note: reseeding from time() on every call means two calls within the
// same second produce identical arrays
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned int) time(&t));

    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xff) / 10.0f;
    }
}

int main(int argc, char **argv)
{
    int nElem = 1024;
    size_t nBytes = nElem * sizeof(float);
    float *h_A, *h_B, *h_C;

    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    h_C = (float *)malloc(nBytes);

    initialData(h_A, nElem);
    initialData(h_B, nElem);

    sumArraysOnHost(h_A, h_B, h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
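This host-only version builds with a plain C compiler; a typical sequence, assuming the file is saved as sumArraysOnHost.c:

gcc -O2 -o sumArraysOnHost sumArraysOnHost.c
./sumArraysOnHost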

2.2 Inspecting the index dimensions of grid and block

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime.h>

// every thread prints its own coordinates along with the launch dimensions
__global__ void checkIndex(void)
{
    printf("threadidx: (%d ,%d ,%d) blockidx:(%d ,%d ,%d) blockdim: (%d ,%d ,%d) gridDim: (%d ,%d ,%d)\n",
           threadIdx.x, threadIdx.y, threadIdx.z,
           blockIdx.x, blockIdx.y, blockIdx.z,
           blockDim.x, blockDim.y, blockDim.z,
           gridDim.x, gridDim.y, gridDim.z);
}


int main(int argc, char **argv)
{
    int nElem = 6;
    
    dim3 block(3);
    dim3 grid((nElem + block.x - 1) / block.x);   // ceiling division: enough blocks to cover nElem

    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    checkIndex<<<grid, block>>>();

    cudaDeviceReset();   // also flushes the device-side printf buffer

    return 0;
}

Output (note that block 1 happens to print before block 0: blocks are scheduled in no guaranteed order)
grid.x 2 grid.y 1 grid.z 1
block.x 3 block.y 1 block.z 1
threadidx: (0 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (1 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (2 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (0 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (1 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (2 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)

2.4 CUDA kernels
kernel_name<<<grid, block>>>(argument list);
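The execution configuration between the triple chevrons sets the grid (number of blocks) and block (threads per block) dimensions, and a launch is asynchronous: control returns to the host immediately. A minimal sketch of the usual 1D pattern (the kernel name and data pointer are hypothetical):

// each thread handles one element, deriving a unique global index
// from its block and thread coordinates
__global__ void scale(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)                  // guard: the grid may be rounded up past n
        data[i] *= 2.0f;
}

int main(void)
{
    int n = 1000;
    float *d_data;
    cudaMalloc((float**)&d_data, n * sizeof(float));

    dim3 block(256);
    dim3 grid((n + block.x - 1) / block.x);   // ceiling division -> 4 blocks
    scale<<<grid, block>>>(d_data, n);
    cudaDeviceSynchronize();   // launch is asynchronous; wait before using the result

    cudaFree(d_data);
    return 0;
}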

2.5 Kernel function type qualifiers
__global__  executes on the device; callable from the host, and from devices of compute capability 3.5 or higher (dynamic parallelism); must return void
__device__  executes on the device; callable from the device only
__host__    executes on the host; callable from the host only
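A minimal sketch showing the qualifiers side by side (all names here are hypothetical):

__device__ float add(float a, float b)        // device-only helper
{
    return a + b;
}

__global__ void addKernel(float *A, float *B, float *C)   // entry point: void return, launched with <<<...>>>
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    C[i] = add(A[i], B[i]);
}

__host__ __device__ float square(float x)     // compiled for both host and device
{
    return x * x;
}

__host__ and __device__ can be combined so one function is compiled for both sides; __global__ cannot be combined with __host__.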

2.6 Summing the arrays on the GPU and comparing against the host result

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <cuda_runtime.h>

// wrap a CUDA runtime call and abort on the first error
#define CHECK(call) \
{\
    const cudaError_t error = call; \
    if (error != cudaSuccess)\
    {\
        printf("Error: %s: %d\n", __FILE__, __LINE__);\
        printf("code: %d reason: %s\n", error, cudaGetErrorString(error));\
        exit(1);\
    }\
}
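// usage: wrap any runtime call, e.g. CHECK(cudaMalloc((float**)&d_A, nBytes));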

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)   // fabs, not abs: abs would truncate to int
        {
            match = 0;
            printf("Arrays do not match\n");
            printf("host %5.2f gpu %5.2f at element %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}

void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned int) time(&t));

    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xff) / 10.0f;
    }
}

// each thread adds one element; nElem is an exact multiple of the block
// size here, so no bounds guard is needed
__global__ void sumArraysOnGPU(float *A, float *B, float *C)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    C[i] = A[i] + B[i];
}


// wall-clock helper for CPU timing (defined here but not used in this example)
double cpusec()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

int main(int argc, char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    CHECK(cudaSetDevice(dev));


    //set up data
    int nElem = 32;
    size_t nBytes = nElem * sizeof(float);
    float *h_A, *h_B, *hostRef, *gpuRef;

    h_A = (float *) malloc (nBytes);
    h_B = (float *) malloc (nBytes);
    hostRef = (float *) malloc (nBytes);
    gpuRef = (float *) malloc (nBytes);

    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef,0, nBytes);
    memset(gpuRef,0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);

    //transfer data from host to device
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);

    dim3 block(nElem);            // all 32 elements in a single block
    dim3 grid(nElem / block.x);   // = 1 block

    sumArraysOnGPU<<<grid,block>>>(d_A, d_B, d_C);
    printf("execution config <<<%d, %d>>>\n", grid.x, block.x);

    //copy kernel result back to host
    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    checkResult(hostRef, gpuRef, nElem);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return 0;
}

Output
execution config <<<1, 32>>>
array matches
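For reference, the CUDA version builds with nvcc; a typical sequence, assuming the file is saved as sumArraysOnGPU.cu:

nvcc -O2 -o sumArraysOnGPU sumArraysOnGPU.cu
./sumArraysOnGPU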

2.8 Tried checking the execution time with nvprof. It first failed with a missing-DLL error, which can be fixed by following
https://blog.csdn.net/qq_41607336/article/details/126741908
Even after that, it still ends with a GPU-related warning:
======= Warning: nvprof is not supported on devices with compute capability 8.0 and higher.
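On compute capability 8.0 and newer, the supported replacements are Nsight Systems (timeline profiling) and Nsight Compute (kernel analysis); a sketch of the equivalent commands, assuming the binary is named sumArraysOnGPU:

nsys profile --stats=true ./sumArraysOnGPU
ncu ./sumArraysOnGPU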

2.9 Added device-information printing and CUDA-event timing, following https://blog.csdn.net/qq_42681630/article/details/144895351

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>

#define CHECK(call) \
{\
    const cudaError_t error = call; \
    if (error != cudaSuccess)\
    {\
        printf("Error: %s: %d\n", __FILE__, __LINE__);\
        printf("code: %d reason: %s\n", error, cudaGetErrorString(error));\
        exit(1);\
    }\
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)   // fabs, not abs: abs would truncate to int
        {
            match = 0;
            printf("Arrays do not match\n");
            printf("host %5.2f gpu %5.2f at element %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}

void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned int) time(&t));

    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xff) / 10.0f;
    }
}

// each thread adds one element; the grid is sized by ceiling division and
// 1<<24 is an exact multiple of the block size, so no bounds guard is needed
__global__ void sumArraysOnGPU(float *A, float *B, float *C)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    C[i] = A[i] + B[i];
}



int main(int argc, char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));
    

    printf( "Compute capability:  %d.%d\n", deviceprop.major, deviceprop.minor );
    printf( "Clock rate:  %d\n", deviceprop.clockRate );
    printf( "Memory Clock rate:  %d\n", deviceprop.memoryClockRate );
    printf( "Memory busWidth:  %d\n", deviceprop.memoryBusWidth );
    printf( "   --- Memory Information for device  ---\n");
    // printf( "Total global mem:  %ld\n", prop.totalGlobalMem );
    printf( "Total global mem:  %zu\n", deviceprop.totalGlobalMem );
    printf( "Total constant Mem:  %ld\n", deviceprop.totalConstMem );
    printf( "Max mem pitch:  %ld\n", deviceprop.memPitch );
    printf( "Texture Alignment:  %ld\n", deviceprop.textureAlignment );
    
    printf( "   --- MP Information for device  ---\n" );
    printf( "Multiprocessor count:  %d\n",
                deviceprop.multiProcessorCount );
    printf( "Shared mem per mp:  %ld\n", deviceprop.sharedMemPerBlock );
    printf( "Registers per mp:  %d\n", deviceprop.regsPerBlock );
    printf( "Threads in warp:  %d\n", deviceprop.warpSize );
    printf( "Max threads per block:  %d\n",
                deviceprop.maxThreadsPerBlock );
    printf( "Max thread dimensions:  (%d, %d, %d)\n",
                deviceprop.maxThreadsDim[0], deviceprop.maxThreadsDim[1],
                deviceprop.maxThreadsDim[2] );
    printf( "Max grid dimensions:  (%d, %d, %d)\n",
                deviceprop.maxGridSize[0], deviceprop.maxGridSize[1],
                deviceprop.maxGridSize[2] );
    printf( "\n" );

    //set up data
    int nElem = 1<<24;
    size_t nBytes = nElem * sizeof(float);
    float *h_A, *h_B, *hostRef, *gpuRef;

    h_A = (float *) malloc (nBytes);
    h_B = (float *) malloc (nBytes);
    hostRef = (float *) malloc (nBytes);
    gpuRef = (float *) malloc (nBytes);

    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef,0, nBytes);
    memset(gpuRef,0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);

    //transfer data from host to device
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);


    int iLen = 1024;                              // threads per block
    dim3 block(iLen);
    dim3 grid((nElem + block.x - 1) / block.x);   // ceiling division
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    cudaEventRecord(start);

    sumArraysOnGPU<<<grid,block>>>(d_A, d_B, d_C);
    cudaEventRecord(stop);        // enqueue stop right after the launch, before any host-side work
    printf("execution config <<<%d, %d>>>\n", grid.x, block.x);

    cudaEventSynchronize(stop);   // block the host until the stop event has completed

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Kernel execution time: %f ms\n", milliseconds);



    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    
    //copy kernel result back to host
    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    checkResult(hostRef, gpuRef, nElem);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return 0;
}

Output:
Using Device 0 : NVIDIA GeForce RTX 4090
Compute capability: 8.9
Clock rate: 2520000
Memory Clock rate: 10501000
Memory busWidth: 384
   --- Memory Information for device  ---
Total global mem: 25756696576
Total constant Mem: 65536
Max mem pitch: 2147483647
Texture Alignment: 512
   --- MP Information for device  ---
Multiprocessor count: 128
Shared mem per block: 49152
Registers per block: 65536
Threads in warp: 32
Max threads per block: 1024
Max thread dimensions: (1024, 1024, 64)
Max grid dimensions: (2147483647, 65535, 65535)

execution config <<<16384, 1024>>>
Kernel execution time: 0.558752 ms
array matches
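One caveat about this measurement: the first launch of a kernel carries one-time initialization cost, so an untimed warm-up launch is commonly added before the timed region. A sketch of how the timing section above could be adjusted:

// warm-up launch (untimed) so one-time setup cost is excluded
sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
cudaDeviceSynchronize();

cudaEventRecord(start);
sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
cudaEventRecord(stop);
cudaEventSynchronize(stop);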
