cuda入门学习

news2024/11/24 7:39:05

最近接触cuda 编程,记录一下。

1 任务:实现 [0, 100) 区间内整数的求和

如果用python

# Sum the integers in the half-open range [0, 100); the expected result is 4950.
# The accumulator is named `total` so the builtin `sum` is not shadowed.
total = 0
for i in range(100):
    total += i
print(total)

2 cuda 的一些简单的概念

一维情况下大概是这样的
(1个grid * 2个blocks * 4个thread)
在这里插入图片描述

3 直接上代码

我把100个数据分到20个block中,每个block有5个thread,每个thread负责一个数据。

int num_blocks = 20;
int block_size = data_len / num_blocks;  // 100 / 20 = 5 threads per block
sum_kernel<<<num_blocks, block_size>>>(sum, dev_c, data_len);  // launch the kernel on the GPU

内核函数计算加法

 int tid = blockIdx.x * blockDim.x + threadIdx.x; // index of this thread across the whole 1-D grid; blockDim.x == 5 for the launch above

原子相加,相当于加了一个锁,保证多个线程同时累加时运算的正确性。

// Atomic read-modify-write: threads adding into *sum concurrently cannot lose each other's updates.
atomicAdd(sum, data[tid]);

3 完整代码

#include <stdio.h>    
#include <stdlib.h>   
#include <cuda_runtime.h>  
/**
 * Accumulates data[0..n) into *sum, one element per thread.
 *
 * Launch layout: 1-D grid of 1-D blocks. The thread with flat index
 * tid = blockIdx.x * blockDim.x + threadIdx.x adds data[tid]. Threads whose
 * tid is >= n contribute nothing, so the launch may contain more threads than
 * there are elements. Elements at or beyond the total thread count are NOT
 * visited (each thread handles at most one element).
 *
 * @param sum   Device pointer to the accumulator. The kernel only adds to it,
 *              so the caller must initialise it (normally to 0) before launch.
 * @param data  Device pointer to at least n ints.
 * @param n     Number of valid elements in data.
 *
 * The device-side printf calls are debugging output only.
 */
__global__ void sum_kernel(int* sum, int* data, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;  // total number of threads in this launch
    printf("stride=%d, blockIdx.x blockDim.x threadIdx.x [%d, %d, %d] \n", stride,blockIdx.x,blockDim.x,threadIdx.x);

    // Guard the tail: surplus threads must not read past the end of data.
    if (tid >= n) {
        return;
    }

    atomicAdd(sum, data[tid]);

    // Plain (non-atomic) read of *sum while other threads may still be adding:
    // the printed value is only a snapshot of the running total, not a stable result.
    printf("data[%d] = %d  sum  in kernel:  %d\n",tid,data[tid],*sum);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Sums the integers 0..data_len-1 on the GPU with sum_kernel and prints the result.
 *
 * Steps: select device 0, allocate the input buffer and a single-int
 * accumulator on the device, zero the accumulator, upload the input,
 * launch one thread per element, copy the accumulator back and print it.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    const int data_len = 100;
    int* dev_c = 0;  // device copy of the input values
    int* sum = 0;    // device accumulator (one int)

    if (cudaSetDevice(0) != cudaSuccess) {
        fprintf(stderr, "选择GPU失败,您的电脑上没有GPU");
        return 1;
    }

    // Host input: data_cpu[i] = i, so the expected total is 0 + 1 + ... + 99 = 4950.
    int data_cpu[data_len];
    for (int i = 0; i < data_len; ++i) {
        data_cpu[i] = i;
    }

    int result = 0;

    // Each step runs only if every previous one succeeded (short-circuit &&).
    // cudaMalloc does not clear memory, so the accumulator is zeroed explicitly:
    // the kernel only ever adds to it.
    bool ok = cudaOk(cudaMalloc((void**)&dev_c, data_len * sizeof(int)), "cudaMalloc(dev_c)")
           && cudaOk(cudaMalloc((void**)&sum, sizeof(int)), "cudaMalloc(sum)")
           && cudaOk(cudaMemset(sum, 0, sizeof(int)), "cudaMemset(sum)")
           && cudaOk(cudaMemcpy(dev_c, data_cpu, sizeof(int) * data_len, cudaMemcpyHostToDevice),
                     "dev_c复制失败");

    if (ok) {
        // One thread per element; assumes data_len is divisible by num_blocks.
        int num_blocks = 20;
        int block_size = data_len / num_blocks;
        sum_kernel<<<num_blocks, block_size>>>(sum, dev_c, data_len);

        // A launch reports configuration errors only through cudaGetLastError().
        // The blocking device-to-host copy then waits for the kernel and
        // surfaces any error raised while it was running.
        ok = cudaOk(cudaGetLastError(), "sum_kernel launch")
          && cudaOk(cudaMemcpy(&result, sum, sizeof(int), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(result)");
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(dev_c);
    cudaFree(sum);

    if (!ok) {
        return 1;
    }

    printf("sum = %d\n", result);
    return 0;
}

4 运行结果

stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 4]
data[25] = 25  sum  in kernel:  135
data[26] = 26  sum  in kernel:  135
data[27] = 27  sum  in kernel:  135
data[28] = 28  sum  in kernel:  135
data[29] = 29  sum  in kernel:  135
data[40] = 40  sum  in kernel:  345
data[41] = 41  sum  in kernel:  345
data[42] = 42  sum  in kernel:  345
data[43] = 43  sum  in kernel:  345
data[44] = 44  sum  in kernel:  345
data[45] = 45  sum  in kernel:  740
data[46] = 46  sum  in kernel:  740
data[47] = 47  sum  in kernel:  740
data[48] = 48  sum  in kernel:  740
data[49] = 49  sum  in kernel:  740
data[30] = 30  sum  in kernel:  740
data[31] = 31  sum  in kernel:  740
data[32] = 32  sum  in kernel:  740
data[33] = 33  sum  in kernel:  740
data[34] = 34  sum  in kernel:  740
data[50] = 50  sum  in kernel:  1110
data[51] = 51  sum  in kernel:  1110
data[52] = 52  sum  in kernel:  1110
data[53] = 53  sum  in kernel:  1110
data[54] = 54  sum  in kernel:  1110
data[85] = 85  sum  in kernel:  1545
data[86] = 86  sum  in kernel:  1545
data[87] = 87  sum  in kernel:  1545
data[88] = 88  sum  in kernel:  1545
data[89] = 89  sum  in kernel:  1545
data[55] = 55  sum  in kernel:  1830
data[56] = 56  sum  in kernel:  1830
data[57] = 57  sum  in kernel:  1830
data[58] = 58  sum  in kernel:  1830
data[59] = 59  sum  in kernel:  1830
data[20] = 20  sum  in kernel:  1110
data[21] = 21  sum  in kernel:  1110
data[22] = 22  sum  in kernel:  1110
data[23] = 23  sum  in kernel:  1110
data[24] = 24  sum  in kernel:  1110
data[90] = 90  sum  in kernel:  2290
data[91] = 91  sum  in kernel:  2290
data[92] = 92  sum  in kernel:  2290
data[93] = 93  sum  in kernel:  2290
data[94] = 94  sum  in kernel:  2290
data[10] = 10  sum  in kernel:  3155
data[11] = 11  sum  in kernel:  3155
data[12] = 12  sum  in kernel:  3155
data[13] = 13  sum  in kernel:  3155
data[14] = 14  sum  in kernel:  3155
data[15] = 15  sum  in kernel:  3155
data[16] = 16  sum  in kernel:  3155
data[17] = 17  sum  in kernel:  3155
data[18] = 18  sum  in kernel:  3155
data[19] = 19  sum  in kernel:  3155
data[60] = 60  sum  in kernel:  3155
data[61] = 61  sum  in kernel:  3155
data[62] = 62  sum  in kernel:  3155
data[63] = 63  sum  in kernel:  3155
data[64] = 64  sum  in kernel:  3155
data[80] = 80  sum  in kernel:  2700
data[81] = 81  sum  in kernel:  2700
data[82] = 82  sum  in kernel:  2700
data[83] = 83  sum  in kernel:  2700
data[84] = 84  sum  in kernel:  2700
data[95] = 95  sum  in kernel:  3675
data[96] = 96  sum  in kernel:  3675
data[97] = 97  sum  in kernel:  3675
data[98] = 98  sum  in kernel:  3675
data[99] = 99  sum  in kernel:  3675
data[5] = 5  sum  in kernel:  3190
data[6] = 6  sum  in kernel:  3190
data[7] = 7  sum  in kernel:  3190
data[8] = 8  sum  in kernel:  3190
data[9] = 9  sum  in kernel:  3190
data[70] = 70  sum  in kernel:  4035
data[71] = 71  sum  in kernel:  4035
data[72] = 72  sum  in kernel:  4035
data[73] = 73  sum  in kernel:  4035
data[74] = 74  sum  in kernel:  4035
data[75] = 75  sum  in kernel:  4615
data[76] = 76  sum  in kernel:  4615
data[77] = 77  sum  in kernel:  4615
data[78] = 78  sum  in kernel:  4615
data[79] = 79  sum  in kernel:  4615
data[0] = 0  sum  in kernel:  4615
data[1] = 1  sum  in kernel:  4615
data[2] = 2  sum  in kernel:  4615
data[3] = 3  sum  in kernel:  4615
data[4] = 4  sum  in kernel:  4615
data[35] = 35  sum  in kernel:  4615
data[36] = 36  sum  in kernel:  4615
data[37] = 37  sum  in kernel:  4615
data[38] = 38  sum  in kernel:  4615
data[39] = 39  sum  in kernel:  4615
data[65] = 65  sum  in kernel:  4950
data[66] = 66  sum  in kernel:  4950
data[67] = 67  sum  in kernel:  4950
data[68] = 68  sum  in kernel:  4950
data[69] = 69  sum  in kernel:  4950
sum = 4950

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/2211254.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

Jenkins---01

什么是敏捷开发 敏捷开发以用户的需求进化为核心,采用迭代、循序渐进的方法进行软件开发。在敏捷开发中,软件项目在构建初期被切分成多个子项目,各个子项目的成果都经过测试,具备可视、可集成和可运行使用的特征。换言之…

2024年编程资料【9月份部分】

资料列表 「CSDN会员免费电子书1000本」 https://pan.quark.cn/s/5019390a751a 【黑马程序员】年度钻石会员-人工智能AI进阶 https://pan.quark.cn/s/1d14a2a179c2 JavaScript从入门到高级教程 - 带源码课件 https://pan.quark.cn/s/c16ed07eac93 【马哥教育】云原生微服务治理…

测试常用插件: ModHeader - Modify HTTP headers插件进行IP模拟/IP欺骗

由于公司是做海外项目的,所以付款时有要求进行模拟不同IP登录进去时会优先显示该地区的支付方式。 1.安装插件 这里以Microsoft Edge为例,打开扩展 搜索:ModHeader - Modify HTTP headers,进行获取安装即可 安装完成后…

CVESearch部署、使用与原理分析

文章目录 前言1、概述2、安装与使用2.1、源码安装2.1.1、部署系统依赖组件2.1.1.1、下载安装基础组件2.1.1.2、下载安装MongoDB Community Edition 7.0 2.1.2、使用源码安装系统2.1.2.1、安装CVESearch2.1.2.2、填充MongoDB数据库2.1.2.3、填充Redis数据库 2.2、使用方法 3、测…

LeetCode | 704.二分查找

标准的二分查找&#xff0c;直接上模板&#xff01; class Solution(object):def search(self, nums, target):""":type nums: List[int]:type target: int:rtype: int"""l 0r len(nums) - 1while l < r:mid (l r 1) / 2if nums[mid] …

Telnet命令详解:安装、用法及应用场景解析

&#x1f49d;&#x1f49d;&#x1f49d;欢迎莅临我的博客&#xff0c;很高兴能够在这里和您见面&#xff01;希望您在这里可以感受到一份轻松愉快的氛围&#xff0c;不仅可以获得有趣的内容和知识&#xff0c;也可以畅所欲言、分享您的想法和见解。 推荐&#xff1a;「storm…

笔试算法总结

文章目录 题目1题目2题目3题目4 题目1 使用 StringBuilder 模拟栈的行为&#xff0c;通过判断相邻2个字符是否相同&#xff0c;如果相同就进行删除 public class Main {public static String fun(String s) {if (s null || s.length() < 1) return s;StringBuilder builde…

EventLoop模块 --- 事件循环模块

目录 1 设计思想 eventfd 创建eventfd 2 实现 3 联合调试 4 整合定时器模块 5 联合超时模块调试 1 设计思想 EventLoop 模块是和线程一一绑定的&#xff0c;每一个EventLoop模块内部都管理了一个Poller对象进行事件监控&#xff0c;同时管理着多个Connection对象&…

python 使用faker库 生成数据

Welcome to Faker’s documentation! — Faker 30.3.0 documentationVersion1: Example from docs:from faker import Faker from faker.providers import internet for i in range(2): #批量生成数据fake Faker()name fake.name()address fake.address()text f…

el-动态表单的校验不触发/只触发了部分项

参考&#xff1a; 深入了解Element Form表单动态验证问题 转载vue elementUI组件表单动态验证失效的问题与解决办法 在别人的代码上开发新功能时&#xff0c;发现动态表单的校验功能突然出现问题&#xff1a; 重构前,只有两步&#xff0c;通过type来判断当前显示内容 <el-f…

Cesium.js(SuperMap iClient3D for Cesium)进行三维场景展示和图层动画

1&#xff09;&#xff1a;参考API文档&#xff1a;SuperMap iClient3D for Cesium 开发指南 2&#xff09;&#xff1a;官网示例&#xff1a;support.supermap.com.cn:8090/webgl/Cesium/examples/webgl/examples.html#layer 3&#xff09;&#xff1a;SuperMap iServer&…

自定义类型 - 结构体

2024 - 10 - 13 - 笔记 - 26 作者(Author): 郑龙浩 / 仟濹(CSDN账号名) 自定义类型 - 结构体 平时用的数组是一组相同类型的数据&#xff0c;如果想表示一组不同类型的数据&#xff0c;那么就可以结构体了。 ① 结构体的声明&#xff08;重要&#xff09; 自己起的名字&…

[论文阅读]: Detecting Copyrighted Content in Language Models Training Data

发布链接&#xff1a;http://arxiv.org/abs/2402.09910 核心目标&#xff1a;检测语言模型的训练过程中是否使用了受版权保护的内容 基于假设&#xff1a;语言模型有可能识别训练文本中的逐字节选 工作&#xff1a;提出了 DE-COP&#xff0c;一种确定训练中是否包含受版权保…

如何在Android平板上使用谷歌浏览器进行网页缩放

在使用Android平板时&#xff0c;我们经常会浏览各种网页&#xff0c;但有时网页内容可能无法适应屏幕大小&#xff0c;这时就需要用到网页缩放功能。本文将为您详细介绍如何在Android平 板上的谷歌浏览器中进行网页缩放&#xff0c;帮助您更好地浏览网页。&#xff08;本文由h…

Cursor 平替项目 bolt.new

Cursor 是一个全新的编程工具&#xff0c;旨在帮助开发者更高效地写代码。它不仅能提升编程速度&#xff0c;还能让代码更干净、更智能。无论你是编程新手还是经验丰富的开发者&#xff0c;Cursor AI都能为你提供智能辅助&#xff0c;显著提高编程效率。 但是目前 Cursor 免费…

QT开发--文件的读写操作

第十三章 文件的读写操作 Qt提供两种读写纯文本文件的方法&#xff1a; 1、直接使用 QFile 类的IO功能&#xff1b; 2、结合 QFile 和 QTextStream&#xff0c;利用流(Stream)进行操作。 13.1 文件读操作 13.1.1 使用QFile类 Qt封装了QFile类&#xff0c;方便我们对文件进行操…

物联网直播技术揭秘:如何保证超高可用性?

我是小米,一个喜欢分享技术的29岁程序员。如果你喜欢我的文章,欢迎关注我的微信公众号“软件求生”,获取更多技术干货! Hello,大家好!我是小米,一个29岁超爱分享技术的码农。今天跟大家聊一聊物联网时代下直播高可用方案的那些事儿。 随着物联网的快速发展,直播技术已…

针对考研的C语言学习(循环队列-链表版本以及2019循环队列大题)

题目 【注】此版本严格按照数字版循环队列的写法&#xff0c;rear所代表的永远是空数据 图解 1.初始化部分和插入部分 2出队 3.分部代码解析 初始化 void init_cir_link_que(CirLinkQue& q) {q.rear q.front (LinkList)malloc(sizeof(LNode));q.front->next NULL…

【宝可梦】游戏

pokemmo https://pokemmo.com/zh/ 写在最后:若本文章对您有帮助,请点个赞啦 ٩(๑•̀ω•́๑)۶

LeetCode讲解篇之1749. 任意子数组和的绝对值的最大值

文章目录 题目描述题解思路题解代码题解链接 题目描述 题解思路 这个我只需要求子数组和的最小值相反数和子数组和的最大值&#xff0c;本题答案为二者的最大值 设数组maxDp中第i号元素表示以nums[i]为结尾的子数组和的最大值 设数组minDp中第i号元素表示以nums[i]为结尾的子…