roctracer 的应用示例

news2025/1/10 12:03:43

1,不用 roctracer 的普通场景

mt.cpp

/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#include <iostream>

// hip header file
#include <hip/hip_runtime.h>

#define HIP_CALL(call)                                                                             \
  do {                                                                                             \
    hipError_t err = call;                                                                         \
    if (err != hipSuccess) {                                                                       \
      fprintf(stderr, "%s\n", hipGetErrorString(err));                                             \
      abort();                                                                                     \
    }                                                                                              \
  } while (0)

#define WIDTH 1024


#define NUM (WIDTH * WIDTH)

#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1

// Device (Kernel) function, it must be void
__global__ void matrixTranspose(float* out, float* in, const int width) {
  int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

  out[y * width + x] = in[x * width + y];
}

// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
  for (unsigned int j = 0; j < width; j++) {
    for (unsigned int i = 0; i < width; i++) {
      output[i * width + j] = input[j * width + i];
    }
  }
}

int main() {
  float* Matrix;
  float* TransposeMatrix;
  float* cpuTransposeMatrix;

  float* gpuMatrix;
  float* gpuTransposeMatrix;

  hipDeviceProp_t devProp;
  HIP_CALL(hipGetDeviceProperties(&devProp, 0));

  std::cerr << "Device name " << devProp.name << std::endl;

  int i;
  int errors;

  Matrix = (float*)malloc(NUM * sizeof(float));
  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

  // initialize the input data
  for (i = 0; i < NUM; i++) {
    Matrix[i] = (float)i * 10.0f;
  }

  // allocate the memory on the device side
  HIP_CALL(hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)));
  HIP_CALL(hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)));

  uint32_t iterations = 100;
  while (iterations-- > 0) {
    std::cerr << "## Iteration (" << iterations << ") #################" << std::endl;

    // Memory transfer from host to device
    HIP_CALL(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice));

    // Lauching kernel from host
    hipLaunchKernelGGL(
        matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH);


    HIP_CALL(
        hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost));


    // CPU MatrixTranspose computation
    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);

    // verify the results
    errors = 0;
    double eps = 1.0E-6;
    for (i = 0; i < NUM; i++) {
      if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
        errors++;
      }
    }
    if (errors != 0) {
      fprintf(stderr, "FAILED: %d errors\n", errors);
    } else {
      fprintf(stderr, "PASSED!\n");
    }
  }

  // free the resources on device side
  HIP_CALL(hipFree(gpuMatrix));
  HIP_CALL(hipFree(gpuTransposeMatrix));

  // free the resources on host side
  free(Matrix);
  free(TransposeMatrix);
  free(cpuTransposeMatrix);

  return errors;
}

编译:

 $ hipcc mt.cpp -o mt

$ ./mt xxx

不会产生文件;

2,加入roctracer的源文件

MatrixTranspose.cpp:

/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#include <iostream>

// hip header file
#include <hip/hip_runtime.h>
#include "roctracer_ext.h"
// roctx header file
#include <roctx.h>

#define HIP_CALL(call)                                                                             \
  do {                                                                                             \
    hipError_t err = call;                                                                         \
    if (err != hipSuccess) {                                                                       \
      fprintf(stderr, "%s\n", hipGetErrorString(err));                                             \
      abort();                                                                                     \
    }                                                                                              \
  } while (0)

#define WIDTH 1024


#define NUM (WIDTH * WIDTH)

#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1

// Device (Kernel) function, it must be void
__global__ void matrixTranspose(float* out, float* in, const int width) {
  int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

  out[y * width + x] = in[x * width + y];
}

// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
  for (unsigned int j = 0; j < width; j++) {
    for (unsigned int i = 0; i < width; i++) {
      output[i * width + j] = input[j * width + i];
    }
  }
}

int main() {
  float* Matrix;
  float* TransposeMatrix;
  float* cpuTransposeMatrix;

  float* gpuMatrix;
  float* gpuTransposeMatrix;

  hipDeviceProp_t devProp;
  HIP_CALL(hipGetDeviceProperties(&devProp, 0));

  std::cerr << "Device name " << devProp.name << std::endl;

  int i;
  int errors;

  Matrix = (float*)malloc(NUM * sizeof(float));
  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

  // initialize the input data
  for (i = 0; i < NUM; i++) {
    Matrix[i] = (float)i * 10.0f;
  }

  // allocate the memory on the device side
  HIP_CALL(hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)));
  HIP_CALL(hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)));

  uint32_t iterations = 100;
  while (iterations-- > 0) {
    std::cerr << "## Iteration (" << iterations << ") #################" << std::endl;

    // Memory transfer from host to device
    HIP_CALL(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice));

    roctxMark("before hipLaunchKernel");
    int rangeId = roctxRangeStart("hipLaunchKernel range");
    roctxRangePush("hipLaunchKernel");
    // Lauching kernel from host
    hipLaunchKernelGGL(
        matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH);
    roctxMark("after hipLaunchKernel");

    // Memory transfer from device to host
    roctxRangePush("hipMemcpy");

    HIP_CALL(
        hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost));

    roctxRangePop();  // for "hipMemcpy"
    roctxRangePop();  // for "hipLaunchKernel"
    roctxRangeStop(rangeId);

    // CPU MatrixTranspose computation
    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);

    // verify the results
    errors = 0;
    double eps = 1.0E-6;
    for (i = 0; i < NUM; i++) {
      if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
        errors++;
      }
    }
    if (errors != 0) {
      fprintf(stderr, "FAILED: %d errors\n", errors);
    } else {
      fprintf(stderr, "PASSED!\n");
    }
  }

  // free the resources on device side
  HIP_CALL(hipFree(gpuMatrix));
  HIP_CALL(hipFree(gpuTransposeMatrix));

  // free the resources on host side
  free(Matrix);
  free(TransposeMatrix);
  free(cpuTransposeMatrix);

  return errors;
}

编译:

只使用hipcc无法直接编译这个源文件

需要指定include 目录和链接库:

$ hipcc ./MatrixTranspose.cpp  -I /opt/rocm/include/roctracer/ -lroctx64

运行:

./a.out

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/2136632.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

哪款宠物空气净化器是除浮毛王者?希喂、范罗士、霍尼韦尔宠物空气净化器实测

养宠人绕不过的痛——掉毛&#xff01;脱毛&#xff01;又到了掉毛季&#xff0c;就连空气中都有毛毛……不管遇到谁&#xff0c;都知道你养猫养狗了——只因T恤变身毛线衫、毛毛怎么都粘不干净。不止是衣服上&#xff0c;地板上、沙发上、桌面上&#xff0c;哪哪都是毛。开始养…

从头开始学MyBatis—02基于xml和注解分别实现的增删改查

首先介绍此次使用的数据库结构&#xff0c;然后引出注意事项。 通过基于xml和基于注解的方式分别实现了增删改查&#xff0c;还有获取参数值、返回值的不同类型对比&#xff0c;帮助大家一次性掌握两种代码编写能力。 目录 数据库 数据库表 实体类 对应的实体类如下&#x…

【目标检测】labelimg图像标注软件的使用流程

一、labelimg检测图片标注 1、下载labelimg.exe 链接&#xff1a;https://pan.baidu.com/s/1yk8ff56Xu40-ZLBghEQ5nw 提取码&#xff1a;vj8f 下载的文件是编译好的&#xff0c;可执行的labelImg.exe文件。直接将文件放在windows环境下&#xff0c;双击可执行。&#xff08;如果…

典型BUCK电路学习和设计

手把手教你设计12V3Abuck降压电路-2-相关输入参数讲解_哔哩哔哩_bilibili 这里是输入电容&#xff0c;先过大电容&#xff08;电解电容&#xff09;再过小电容&#xff08;陶瓷贴片电容&#xff0c;高频率波&#xff09; 输出也可以同理 开关电源不能带负载的原因&#xff0c…

uniapp vue3 梯形选项卡组件

实现的效果图&#xff1a; 切换选项卡显示不同的内容&#xff0c;把这个选项卡做成了一个组件&#xff0c;需要的自取。 // 组件名为 trapezoidalTab <template> <view class"pd24"><view class"nav"><!-- 左侧 --><view cla…

--链表--

一.链表的概述 二.逻辑图 三.代码详解 //1.定义关于链表的结构体 #include <iostream> #include <stdlib.h> #include <assert.h> using namespace std; typedef int SLTDateType;//适用于不同的数据类型 typedef struct SListNode {SLTDateType data;//数据…

【Day14-单例设计模式动态代理】

单例设计模式 什么是设计模式&#xff08;Design pattern&#xff09; ? 一个问题通常有n种解法&#xff0c;其中肯定有一种解法是最优的&#xff0c;这个最优的解法被人总结出来了&#xff0c;称之为设计模式。设计模式有20多种&#xff0c;对应20多种软件开发中会遇到的问题…

记录开发一个英语听力训练网站

背景 在当前全球经济衰退的背景下&#xff0c;IT相关的工作在国内的竞争也是越来越激烈&#xff0c;为了能够获得更多的可能性&#xff0c;英语的学习也许能为程序员打开一扇新的窗户&#xff0c;比如很多远程的工作尤其是国际化背景的工作团队&#xff0c;英语的协作沟通是必…

Modbus通信

Modbus是一种经典的工业通信协议&#xff0c;由Modicon&#xff08;现为施耐德电气&#xff09;在1979年首次发布。它广泛应用于各种工业自动化系统中&#xff0c;尤其是在PLC&#xff08;可编程逻辑控制器&#xff09;与其他设备之间的通信。Modbus的主要特点是其简单性和开放…

《Oracle(一)- 基础》

文章目录 一、Oracle简介&#xff08;一&#xff09;什么是ORACLE&#xff08;二&#xff09;ORACLE 体系结构1.数据库2.实例3.数据文件&#xff08;dbf&#xff09;4.表空间5.用户 二、ORACLE 安装与配置&#xff08;一&#xff09;VMware 挂载 windows server 2003&#xff0…

海外短剧系统一站式开发+h5,app双端

前言&#xff1a; 海外短剧是指那些制作时间短、剧情紧凑、内容丰富的剧集&#xff0c;主要在海外市场&#xff08;如北美、欧洲、东南亚等&#xff09;播放并受到欢迎。 而海外短剧系统是指一种用于制作和播放海外短剧的系统。该系统通常由电视台、制片公司或在线视频平台使…

C++:STL详解(一)string类的基本介绍与使用方式

✨ Blog’s 主页: 白乐天_ξ( ✿&#xff1e;◡❛) &#x1f308; 个人Motto&#xff1a;实践是检验真理的唯一标准&#xff01;&#xff01;&#xff01;敲代码需要勤快点&#xff01;&#xff01;&#xff01;&#xff01; &#x1f4ab; 欢迎来到我的学习笔记&#xff0…

十二,Spring Boot 异常处理(自定义异常页面,全局异常,自定义异常)

十二&#xff0c;Spring Boot 异常处理(自定义异常页面&#xff0c;全局异常&#xff0c;自定义异常) 文章目录 十二&#xff0c;Spring Boot 异常处理(自定义异常页面&#xff0c;全局异常&#xff0c;自定义异常)1. 基本介绍2. 自定义异常页面3. 全局异常4. 自定义异常5. 补充…

LineageOS刷机教程

版权归作者所有&#xff0c;如有转发&#xff0c;请注明文章出处&#xff1a;https://cyrus-studio.github.io/blog/ LineageOS 是一个基于 Android 开源项目&#xff08;AOSP&#xff09;的开源操作系统&#xff0c;主要由社区开发者维护。它起源于 CyanogenMod 项目&#xff…

数据库索引底层数据结构之B+树MySQL中的页索引分类【纯理论知识,干货分享,面试必备】

目录 1、索引简介 1.1 什么是索引 1.2 使用索引的原因 2、索引中数据结构的设计 —— B树 2.1 哈希 2.2 二叉搜索树 2.3 B树 2.4 最终选择之——B树 2.4.1 B树与B树的对比(面向索引)【面试题】 3、MySQL中的页 3.1 页的使用原因 3.2 页的结构 3.2.1 页文件头和页文件…

解锁定位服务:Flutter应用中的高德地图定位

前言 在现代移动应用开发中&#xff0c;定位服务已成为一项基本功能&#xff0c;它使得应用能够获取用户的地理位置信息&#xff0c;为用户提供更加个性化的服务。 Flutter 作为跨平台的移动应用开发框架&#xff0c;支持集成多种服务&#xff0c;包括定位服务。 本文将介绍如…

HR8870:可PWM控制,4.5A直流有刷电机驱动数据手册

HR8870芯片描述 HR8870是一款直流有刷电机驱动器&#xff0c;适用于打印机、电器、工业设备以及其他小型机器。两个逻辑输入控制H桥驱动器&#xff0c;该驱动器由四个N-MOS组成&#xff0c;能够以高达4.5A的峰值电流双向控制电机。利用电流衰减模式&#xff0c;可通过对输入进行…

故障码格式解析

中&#xff0c;诊断故障码&#xff08;DTC, Diagnostic Trouble Code&#xff09;是由一个字母前缀和三个后续字符组成的。这些字母前缀根据故障所属的系统类别来区分&#xff0c;具体如下&#xff1a; B0 -- B3&#xff1a;表示车身系统&#xff08;Body&#xff09;的故障码…

Linux CTF逆向入门

1.ELF格式 我们先来看看 ELF 文件头&#xff0c;如果想详细了解&#xff0c;可以查看ELF的man page文档。 关于ELF更详细的说明&#xff1a; e_shoff&#xff1a;节头表的文件偏移量&#xff08;字节&#xff09;。如果文件没有节头表&#xff0c;则此成员值为零。 sh_offset&…

Qt 菜单、工具栏 的基本使用

效果 代码 #include "mainwindow.h" #include "ui_mainwindow.h" #include<QToolBar> #include<QDebug> #include<QPushButton>MainWindow::MainWindow(QWidget *parent): QMainWindow(parent), ui(new Ui::MainWindow) {ui->setupU…