将 magma example 改写成 cusolver example eqrf

news2025/1/11 14:56:05

1,简单安装Magma

1.1 下载编译 OpenBLAS

$ git clone https://github.com/OpenMathLib/OpenBLAS.git
$ cd OpenBLAS/
$ make -j DEBUG=1
$ make install PREFIX=/home/hipper/ex_magma/local_d/OpenBLAS/

1.2 下载编译 magma

$ git clone https://bitbucket.org/icl/magma.git
$ cd magma/
$ cp make.inc-examples/make.inc.openblas ./make.inc
$ vim make.inc
// # edit openblasdir to abouve
// # -O2 -> -g
$ make -j

vim make.inc

2. 改写 testing_xxxqr_gpu.cpp

testing/testing_sgeqrf_gpu.cpp

运行效果:

原始代码: 

/*
    -- MAGMA (version 2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date

       @generated from testing/testing_zgeqrf_gpu.cpp, normal z -> s, Mon Jul 29 01:23:15 2024
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include "flops.h"
#include "magma_v2.h"
#include "magma_lapack.h"
#include "testings.h"

/* 
   -- Testing sgeqrf
*/
int main( int argc, char** argv)
{
    TESTING_CHECK( magma_init() );
    magma_print_environment();

    const float             d_neg_one = MAGMA_D_NEG_ONE;
    const float             d_one     = MAGMA_D_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float c_one     = MAGMA_S_ONE;
    const float c_zero    = MAGMA_S_ZERO;
    const magma_int_t        ione      = 1;

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float           Anorm, error=0, error2=0;
    float *h_A, *h_R, *tau, *h_work, tmp[1], unused[1];
    magmaFloat_ptr d_A, dT;
    magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;
    magma_int_t ISEED[4] = {0,0,0,1};

    magma_opts opts;
    opts.parse_opts( argc, argv );

    int status = 0;
    float tol = opts.tolerance * lapackf77_slamch("E");

    // for expert API testing
    magma_device_t cdev;
    magma_queue_t queues[2];
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    // version 3 can do either check
    if (opts.check == 1 && ( opts.version == 1 || opts.version == 4 ) ) {
        opts.check = 2;
        printf( "%% versions 1 and 4 requires check 2 (solve A*x=b)\n" );
    }
    if (opts.check == 2 && opts.version == 2) {
        opts.check = 1;
        printf( "%% version 2 requires check 1 (R - Q^H*A)\n" );
    }

    printf( "%% version %lld\n", (long long) opts.version );
    if ( opts.check == 1 ) {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   |R - Q^H*A|   |I - Q^H*Q|\n");
        printf("%%==============================================================================\n");
    }
    else {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)    |b - A*x|\n");
        printf("%%===============================================================\n");
    }
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min( M, N );
            lda    = M;
            n2     = lda*N;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            nb     = magma_get_sgeqrf_nb( M, N );
            gflops = FLOPS_SGEQRF( M, N ) / 1e9;

            // query for workspace size
            lwork = -1;
            lapackf77_sgeqrf( &M, &N, unused, &M, unused, tmp, &lwork, &info );
            lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] );

            TESTING_CHECK( magma_smalloc_cpu( &tau,    min_mn ));
            TESTING_CHECK( magma_smalloc_cpu( &h_A,    n2     ));
            TESTING_CHECK( magma_smalloc_cpu( &h_work, lwork  ));

            TESTING_CHECK( magma_smalloc_pinned( &h_R,    n2     ));

            TESTING_CHECK( magma_smalloc( &d_A,    ldda*N ));

            if ( opts.version == 1 || opts.version == 3 || opts.version == 4 ) {
                size = (2*min(M, N) + magma_roundup( N, 32 ) )*nb;
                TESTING_CHECK( magma_smalloc( &dT, size ));
                magmablas_slaset( MagmaFull, size, 1, c_zero, c_zero, dT, size, opts.queue );
            }

            /* Initialize the matrix */
            magma_generate_matrix( opts, M, N, h_A, lda );
            lapackf77_slacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );
            magma_ssetmatrix( M, N, h_R, lda, d_A, ldda, opts.queue );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            if ( opts.version == 1 ) {
                // stores dT, V blocks have zeros, R blocks inverted & stored in dT
                gpu_time = magma_wtime();
                magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info );
                gpu_time = magma_wtime() - gpu_time;
            }
            else if ( opts.version == 2 ) {
                // LAPACK complaint arguments
                gpu_time = magma_wtime();
                magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info );
                gpu_time = magma_wtime() - gpu_time;
            }
            #if defined(MAGMA_HAVE_CUDA) || defined(MAGMA_HAVE_HIP)
            else if ( opts.version == 3 ) {
                // stores dT, V blocks have zeros, R blocks stored in dT
                gpu_time = magma_wtime();
                magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info );
                gpu_time = magma_wtime() - gpu_time;
            }
            #endif
            else if (opts.version == 4) {
                // expert API for magma_sgeqrf_gpu
                magma_mode_t mode = MagmaHybrid;

                // query workspace
                void *host_work = NULL, *device_work=NULL;
                magma_int_t lhwork[1] = {-1}, ldwork[1] = {-1};
                magma_sgeqrf_expert_gpu_work(
                    M, N, NULL, ldda,
                    NULL, NULL, &info,
                    mode, nb,
                    NULL, lhwork,
                    NULL, ldwork, queues );
                // alloc workspace
                if( lhwork[0] > 0 ) {
                    magma_malloc_pinned( (void**)&host_work, lhwork[0] );
                }

                if( ldwork[0] > 0 ) {
                    magma_malloc( (void**)&device_work, ldwork[0] );
                }

                // time actual call only
                gpu_time = magma_wtime();
                magma_sgeqrf_expert_gpu_work(
                    M, N, d_A, ldda, tau, dT, &info,
                    mode, nb,
                    host_work, lhwork, device_work, ldwork, queues );
                magma_queue_sync( queues[0] );
                magma_queue_sync( queues[1] );
                gpu_time = magma_wtime() - gpu_time;

                // free workspace
                if( host_work != NULL) {
                    magma_free_pinned( host_work );
                }

                if( device_work != NULL ) {
                    magma_free( device_work );
                }
            }
            else {
                printf( "Unknown version %lld\n", (long long) opts.version );
                return -1;
            }
            gpu_perf = gflops / gpu_time;

            if (info != 0) {
                printf("magma_sgeqrf returned error %lld: %s.\n",
                       (long long) info, magma_strerror( info ));
            }

            if ( opts.check == 1 && (opts.version == 2 || opts.version == 3) ) {
                if ( opts.version == 3 ) {
                    // copy diagonal blocks of R back to A
                    for( int i=0; i < min_mn-nb; i += nb ) {
                        magma_int_t ib = min( min_mn-i, nb );
                        magmablas_slacpy( MagmaUpper, ib, ib, &dT[min_mn*nb + i*nb], nb, &d_A[ i + i*ldda ], ldda, opts.queue );
                    }
                }

                /* =====================================================================
                   Check the result, following zqrt01 except using the reduced Q.
                   This works for any M,N (square, tall, wide).
                   Only for version 2, which has LAPACK complaint output.
                   Or   for version 3, after restoring diagonal blocks of A above.
                   =================================================================== */
                magma_sgetmatrix( M, N, d_A, ldda, h_R, lda, opts.queue );

                magma_int_t ldq = M;
                magma_int_t ldr = min_mn;
                float *Q, *R;
                float *work;
                TESTING_CHECK( magma_smalloc_cpu( &Q,    ldq*min_mn ));  // M by K
                TESTING_CHECK( magma_smalloc_cpu( &R,    ldr*N ));       // K by N
                TESTING_CHECK( magma_smalloc_cpu( &work, min_mn ));

                // generate M by K matrix Q, where K = min(M,N)
                lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq );
                lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info );
                assert( info == 0 );

                // copy K by N matrix R
                lapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr );
                lapackf77_slacpy( "Upper", &min_mn, &N, h_R, &lda,        R, &ldr );

                // error = || R - Q^H*A || / (N * ||A||)
                blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M,
                               &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr );
                Anorm = lapackf77_slange( "1", &M,      &N, h_A, &lda, work );
                error = lapackf77_slange( "1", &min_mn, &N, R,   &ldr, work );
                if ( N > 0 && Anorm > 0 )
                    error /= (N*Anorm);

                // set R = I (K by K identity), then R = I - Q^H*Q
                // error = || I - Q^H*Q || / N
                lapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr );
                blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr );
                error2 = safe_lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work );
                if ( N > 0 )
                    error2 /= N;

                magma_free_cpu( Q    );  Q    = NULL;
                magma_free_cpu( R    );  R    = NULL;
                magma_free_cpu( work );  work = NULL;
            }
            else if ( opts.check == 2 && M >= N && (opts.version == 1 || opts.version == 3 || opts.version == 4) ) {
                /* =====================================================================
                   Check the result by solving consistent linear system, A*x = b.
                   Only for versions 1 & 3 with M >= N.
                   =================================================================== */
                magma_int_t lwork2;
                float *x, *b, *hwork;
                magmaFloat_ptr d_B;

                // initialize RHS, b = A*random
                TESTING_CHECK( magma_smalloc_cpu( &x, N ));
                TESTING_CHECK( magma_smalloc_cpu( &b, M ));
                lapackf77_slarnv( &ione, ISEED, &N, x );
                blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione );
                // copy to GPU
                TESTING_CHECK( magma_smalloc( &d_B, M ));
                magma_ssetvector( M, b, 1, d_B, 1, opts.queue );

                if ( opts.version == 1 || opts.version == 4) {
                    // allocate hwork
                    magma_sgeqrs_gpu( M, N, 1,
                                      d_A, ldda, tau, dT,
                                      d_B, M, tmp, -1, &info );
                    lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
                    TESTING_CHECK( magma_smalloc_cpu( &hwork, lwork2 ));

                    // solve linear system
                    magma_sgeqrs_gpu( M, N, 1,
                                      d_A, ldda, tau, dT,
                                      d_B, M, hwork, lwork2, &info );
                    if (info != 0) {
                        printf("magma_sgeqrs returned error %lld: %s.\n",
                               (long long) info, magma_strerror( info ));
                    }
                    magma_free_cpu( hwork );
                }
                #if defined(MAGMA_HAVE_CUDA) || defined(MAGMA_HAVE_HIP)
                else if ( opts.version == 3 ) {
                    // allocate hwork
                    magma_sgeqrs3_gpu( M, N, 1,
                                       d_A, ldda, tau, dT,
                                       d_B, M, tmp, -1, &info );
                    lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
                    TESTING_CHECK( magma_smalloc_cpu( &hwork, lwork2 ));

                    // solve linear system
                    magma_sgeqrs3_gpu( M, N, 1,
                                       d_A, ldda, tau, dT,
                                       d_B, M, hwork, lwork2, &info );
                    if (info != 0) {
                        printf("magma_sgeqrs3 returned error %lld: %s.\n",
                               (long long) info, magma_strerror( info ));
                    }
                    magma_free_cpu( hwork );
                }
                #endif
                else {
                    printf( "Unknown version %lld\n", (long long) opts.version );
                    return -1;
                }
                magma_sgetvector( N, d_B, 1, x, 1, opts.queue );

                // compute r = Ax - b, saved in b
                blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione );

                // compute residual |Ax - b| / (max(m,n)*|A|*|x|)
                float norm_x, norm_A, norm_r, work[1];
                norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work );
                norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work );
                norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work );

                magma_free_cpu( x );
                magma_free_cpu( b );
                magma_free( d_B );
                error = norm_r / (max(M,N) * norm_A * norm_x);
            }

            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_sgeqrf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0) {
                    printf("lapackf77_sgeqrf returned error %lld: %s.\n",
                           (long long) info, magma_strerror( info ));
                }
            }

            /* =====================================================================
               Print performance and error.
               =================================================================== */
            printf("%5lld %5lld   ", (long long) M, (long long) N );
            if ( opts.lapack ) {
                printf( "%7.2f (%7.2f)", cpu_perf, cpu_time );
            }
            else {
                printf("  ---   (  ---  )" );
            }
            printf( "   %7.2f (%7.2f)   ", gpu_perf, gpu_time );
            if ( opts.check == 1 ) {
                bool okay = (error < tol && error2 < tol);
                status += ! okay;
                printf( "%11.2e   %11.2e   %s\n", error, error2, (okay ? "ok" : "failed") );
            }
            else if ( opts.check == 2 ) {
                if ( M >= N ) {
                    bool okay = (error < tol);
                    status += ! okay;
                    printf( "%10.2e   %s\n", error, (okay ? "ok" : "failed") );
                }
                else {
                    printf( "(error check only for M >= N)\n" );
                }
            }
            else {
                printf( "    ---\n" );
            }

            magma_free_cpu( tau    );
            magma_free_cpu( h_A    );
            magma_free_cpu( h_work );

            magma_free_pinned( h_R );

            magma_free( d_A );

            if ( opts.version == 1 || opts.version == 3 || opts.version == 4 ) {
                magma_free( dT );
            }

            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );

    opts.cleanup();
    TESTING_CHECK( magma_finalize() );
    return status;
}

流程分析:

改写为:

testing_cusolver_sgeqrf_gpu.cpp

#include <>

待补。。。

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/1959790.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

专业且免费的重复文件查找与删除工具,文本,图片,音频和视频等

AllDup是一款专业的重复文件查找与删除工具。作为一款免费软件&#xff0c;AllDup以其出色的功能和简洁的操作界面广受欢迎。它不仅可以有效地识别和删除电脑硬盘以及外部设备如USB闪存驱动器中的重复文件&#xff0c;还能对多媒体文件如图片、音频和视频等进行特殊处理&#x…

GUI图形化界面操作(下部)

目录 ​编辑 前言 Swing 窗口 注意点 新增的组件 进度条组件 开关按钮 多面板和分割面板 多面板 分割面板 ​编辑 选项窗口 对话框带三个选项是&#xff0c;否&#xff0c;取消。 对话框提示输入文本: 前言 修炼中&#xff0c;该篇文章为俺很久前的学习笔记 Swi…

Tomcat的安装配置教程

一、服务器的安装 tomcat官方安装网站&#xff1a;http://tomcat.apache.org/ 点击选择想要安装的版本 选择与本机的字节匹配的压缩包进行安装 二、 环境配置 打开系统 进行高级系统配置 配置环境变量 新建系统变量 增加新变量&#xff0c;复制tomcat文件的安装路径为…

HTML,CSS,JavaScript实现——井字棋游戏

和大家分享一个经典的游戏项目——井字棋游戏。这个项目不仅能带你回味童年的乐趣&#xff0c;还能帮助你练习 HTML、CSS 和 JavaScript 编程。 项目介绍 井字棋游戏是一个两人对战游戏&#xff0c;玩家轮流在一个3x3的网格上标记 X 或 O。先将三个标记连成一条直线&#xff…

彻底解决Google浏览器自动删除下载文件或下载失败

需求背景 最近发现在阿里巴巴国际站聊天过程中,客户发的文件或软件,Goole浏览器居然无法下载,或者下载一会就提示失败,莫名其妙。错误提示如下:仔细看发现是【已拦截未经验证的下载内容】。 解决方案: 1、打开浏览器设置 2、打开隐私安全 3、配置安全浏览 4、配置完成-…

面试:CUDA Tiling 和 CPU tiling 技术详解

目录 一、CUDA Tiling 和 CPU Tiling 技术概述 &#xff08;一&#xff09;技术原理 &#xff08;二&#xff09;应用场景 &#xff08;三&#xff09;优势和劣势 二、Tiling 技术在深度学习中的应用 三、Tiling 技术的缺点 一、CUDA Tiling 和 CPU Tiling 技术概述 Til…

介绍五款广受好评的企业级加密软件

在当今信息化时代&#xff0c;数据安全已成为企业管理的重要环节。随着网络攻击和数据泄露事件的频繁发生&#xff0c;如何有效保护企业数据不被泄露&#xff0c;成为各大企业关注的焦点。加密软件作为一种有效的防护工具&#xff0c;通过对数据进行加密处理&#xff0c;确保敏…

react中使用Redux管理token以及token持久化

1.安装插件 npm i reduxjs/toolkit react-redux 2.新建状态管理文件 在src下新建store文件夹&#xff0c;store文件夹下新建模块文件夹(modules)和入口文件&#xff08;index.js&#xff09;&#xff0c;modules文件夹下新建setToken.js文件 3.配置setToken.js import { cr…

梅卡曼德 Mech-Eye 工业级3D相机

自研高性能工业级3D相机&#xff0c;精度高、速度快、抗环境光、成像质量高&#xff0c;可对各类材质物体生成高质量3D点云数据。产品线完整&#xff0c;满足远/中/近不同距离下对于抗环境光、高精度、大视野、高速度、小体积的需求。

oracle常用几个相似的恢复命令之间的区别

Oracle恢复数据库时有几个常用但非常相似的命令&#xff1a; recover databaserecover database until cancelrecover database until cancel using backup controlfilerecover database using backup controlfilerecover database using backup controlfile until cancel 它们…

ILI2326 触摸控制板解决方案

一、方案描述 ILI2326是一款专为工业电容式触摸面板应用而优化的单芯片电容式触摸传感器、主要应用在TV智慧屏&#xff0c;电子触控白板&#xff0c;商显等领域。ILI2326触摸控制板多达352通道&#xff0c;并最大支持86寸电容触摸面板。支持最大信噪比为200:1&#xff0c;IEC …

最新 【Navicat Premium 17.0.8】简体中文版破解激活永久教程

官方下载地址&#xff1a; https://www.navicat.com.cn/download/navicat-premium 百度网盘补丁链接 链接: https://pan.baidu.com/s/11hu414Honi3Y9dPQ6-07JQ?pwd04mu 提取码: 04mu 未安装过的用户可直接跳过该步骤&#xff0c;如果已安装Navicat&#xff0c;记得先卸载干净…

网络适配器中没有WSL网络怎么办?

网络适配器中没有WSL网络怎么办? 1、创建内部虚拟交换机 打开 PowerShell(以管理员身份运行),运行以下命令 New-VMSwitch -Name "WSL" -SwitchType Internal2、配置虚拟网络适配器 Get-NetAdapter -Name "vEthernet (WSL)"设置虚拟网络适配器的 IP 地…

clangd配置

clangd下载、配置和使用 更新背景环境一.优缺点对比二.下载1. ubuntu download clangd2. vscode download clangd3. github download clangd 三.配置1.简易配置&#xff1a;2.详细配置 四.使用结语&#xff1a;done&#xff01;&#xff01;&#xff01; 更新 日期&#xff1a;…

【环形链表 II】python刷题记录

R2-快慢指针&#xff08;双指针中的子问题&#xff09;。 k神 这类链表题目一般都是使用双指针法解决的&#xff0c;例如寻找距离尾部第 K 个节点、寻找环入口、寻找公共尾部入口等。 在本题的求解过程中&#xff0c;双指针会产生两次“相遇”。 抽象&#xff0c;太抽象了。…

盘点.软件测试模型

软件开发模型   软件开发模型(Software Development Model)是指软件开发全部过程、活动和任务的结构框架。软件开发包括需求、设计、编码和测试等阶段&#xff0c;有时也包括维护阶段。 软件开发模型能清晰、直观地表达软件开发全过程&#xff0c;明确规定了要完成的主要活动…

Java中的二叉搜索树(如果想知道Java中有关二叉搜索树的知识点,那么只看这一篇就足够了!)

前言&#xff1a;Java 提供了丰富的数据结构来处理和管理数据&#xff0c;其中 TreeSet 和 TreeMap 是基于红黑树实现的集合和映射接口。它们有序地存储数据&#xff0c;提供高效的搜索、插入和删除操作。 ✨✨✨这里是秋刀鱼不做梦的BLOG ✨✨✨想要了解更多内容可以访问我的主…

web期末大作业家乡-周口

代码下载: https://pan.quark.cn/s/64c61dfc0928

【学术大咖云集】第二届网络、通信与智能计算国际会议(NCIC 2024)

第二届网络、通信与智能计算国际会议 The 2nd International Conference on Networks, Communications and Intelligent Computing&#xff08;NCIC 2024&#xff09; 2024年11月22日-25日 中国 | 北京 www.icncic.org 第二届网络、通信与智能计算国际会议&#xff08;NCIC…

新书速览|动手学PyTorch建模与应用:从深度学习到大模型

《动手学PyTorch建模与应用&#xff1a;从深度学习到大模型》 本书内容 《动手学PyTorch建模与应用:从深度学习到大模型》是一本从零基础上手深度学习和大模型的PyTorch实战指南。《动手学PyTorch建模与应用:从深度学习到大模型》共11章&#xff0c;第1章主要介绍深度学习的概念…