将 magma example 改写成 cusolver example eqrf

1，简单安装Magma

1.1 下载编译 OpenBLAS

$ git clone https://github.com/OpenMathLib/OpenBLAS.git
$ cd OpenBLAS/
$ make -j DEBUG=1
$ make install PREFIX=/home/hipper/ex_magma/local_d/OpenBLAS/

1.2 下载编译 magma

$ git clone https://bitbucket.org/icl/magma.git
$ cd magma/
$ cp make.inc-examples/make.inc.openblas ./make.inc
$ vim make.inc
// # edit openblasdir to abouve
// # -O2 -> -g
$ make -j

vim make.inc

2. 改写 testing_xxxqr_gpu.cpp

testing/testing_sgeqrf_gpu.cpp

运行效果：

原始代码：

/*
    -- MAGMA (version 2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date

       @generated from testing/testing_zgeqrf_gpu.cpp, normal z -> s, Mon Jul 29 01:23:15 2024
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include "flops.h"
#include "magma_v2.h"
#include "magma_lapack.h"
#include "testings.h"

/* 
   -- Testing sgeqrf
*/
int main( int argc, char** argv)
{
    TESTING_CHECK( magma_init() );
    magma_print_environment();

    const float             d_neg_one = MAGMA_D_NEG_ONE;
    const float             d_one     = MAGMA_D_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float c_one     = MAGMA_S_ONE;
    const float c_zero    = MAGMA_S_ZERO;
    const magma_int_t        ione      = 1;

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float           Anorm, error=0, error2=0;
    float *h_A, *h_R, *tau, *h_work, tmp[1], unused[1];
    magmaFloat_ptr d_A, dT;
    magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;
    magma_int_t ISEED[4] = {0,0,0,1};

    magma_opts opts;
    opts.parse_opts( argc, argv );

    int status = 0;
    float tol = opts.tolerance * lapackf77_slamch("E");

    // for expert API testing
    magma_device_t cdev;
    magma_queue_t queues[2];
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    // version 3 can do either check
    if (opts.check == 1 && ( opts.version == 1 || opts.version == 4 ) ) {
        opts.check = 2;
        printf( "%% versions 1 and 4 requires check 2 (solve A*x=b)\n" );
    }
    if (opts.check == 2 && opts.version == 2) {
        opts.check = 1;
        printf( "%% version 2 requires check 1 (R - Q^H*A)\n" );
    }

    printf( "%% version %lld\n", (long long) opts.version );
    if ( opts.check == 1 ) {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   |R - Q^H*A|   |I - Q^H*Q|\n");
        printf("%%==============================================================================\n");
    }
    else {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)    |b - A*x|\n");
        printf("%%===============================================================\n");
    }
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min( M, N );
            lda    = M;
            n2     = lda*N;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            nb     = magma_get_sgeqrf_nb( M, N );
            gflops = FLOPS_SGEQRF( M, N ) / 1e9;

            // query for workspace size
            lwork = -1;
            lapackf77_sgeqrf( &M, &N, unused, &M, unused, tmp, &lwork, &info );
            lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] );

            TESTING_CHECK( magma_smalloc_cpu( &tau,    min_mn ));
            TESTING_CHECK( magma_smalloc_cpu( &h_A,    n2     ));
            TESTING_CHECK( magma_smalloc_cpu( &h_work, lwork  ));

            TESTING_CHECK( magma_smalloc_pinned( &h_R,    n2     ));

            TESTING_CHECK( magma_smalloc( &d_A,    ldda*N ));

            if ( opts.version == 1 || opts.version == 3 || opts.version == 4 ) {
                size = (2*min(M, N) + magma_roundup( N, 32 ) )*nb;
                TESTING_CHECK( magma_smalloc( &dT, size ));
                magmablas_slaset( MagmaFull, size, 1, c_zero, c_zero, dT, size, opts.queue );
            }

            /* Initialize the matrix */
            magma_generate_matrix( opts, M, N, h_A, lda );
            lapackf77_slacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );
            magma_ssetmatrix( M, N, h_R, lda, d_A, ldda, opts.queue );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            if ( opts.version == 1 ) {
                // stores dT, V blocks have zeros, R blocks inverted & stored in dT
                gpu_time = magma_wtime();
                magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info );
                gpu_time = magma_wtime() - gpu_time;
            }
            else if ( opts.version == 2 ) {
                // LAPACK complaint arguments
                gpu_time = magma_wtime();
                magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info );
                gpu_time = magma_wtime() - gpu_time;
            }
            #if defined(MAGMA_HAVE_CUDA) || defined(MAGMA_HAVE_HIP)
            else if ( opts.version == 3 ) {
                // stores dT, V blocks have zeros, R blocks stored in dT
                gpu_time = magma_wtime();
                magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info );
                gpu_time = magma_wtime() - gpu_time;
            }
            #endif
            else if (opts.version == 4) {
                // expert API for magma_sgeqrf_gpu
                magma_mode_t mode = MagmaHybrid;

                // query workspace
                void *host_work = NULL, *device_work=NULL;
                magma_int_t lhwork[1] = {-1}, ldwork[1] = {-1};
                magma_sgeqrf_expert_gpu_work(
                    M, N, NULL, ldda,
                    NULL, NULL, &info,
                    mode, nb,
                    NULL, lhwork,
                    NULL, ldwork, queues );
                // alloc workspace
                if( lhwork[0] > 0 ) {
                    magma_malloc_pinned( (void**)&host_work, lhwork[0] );
                }

                if( ldwork[0] > 0 ) {
                    magma_malloc( (void**)&device_work, ldwork[0] );
                }

                // time actual call only
                gpu_time = magma_wtime();
                magma_sgeqrf_expert_gpu_work(
                    M, N, d_A, ldda, tau, dT, &info,
                    mode, nb,
                    host_work, lhwork, device_work, ldwork, queues );
                magma_queue_sync( queues[0] );
                magma_queue_sync( queues[1] );
                gpu_time = magma_wtime() - gpu_time;

                // free workspace
                if( host_work != NULL) {
                    magma_free_pinned( host_work );
                }

                if( device_work != NULL ) {
                    magma_free( device_work );
                }
            }
            else {
                printf( "Unknown version %lld\n", (long long) opts.version );
                return -1;
            }
            gpu_perf = gflops / gpu_time;

            if (info != 0) {
                printf("magma_sgeqrf returned error %lld: %s.\n",
                       (long long) info, magma_strerror( info ));
            }

            if ( opts.check == 1 && (opts.version == 2 || opts.version == 3) ) {
                if ( opts.version == 3 ) {
                    // copy diagonal blocks of R back to A
                    for( int i=0; i < min_mn-nb; i += nb ) {
                        magma_int_t ib = min( min_mn-i, nb );
                        magmablas_slacpy( MagmaUpper, ib, ib, &dT[min_mn*nb + i*nb], nb, &d_A[ i + i*ldda ], ldda, opts.queue );
                    }
                }

                /* =====================================================================
                   Check the result, following zqrt01 except using the reduced Q.
                   This works for any M,N (square, tall, wide).
                   Only for version 2, which has LAPACK complaint output.
                   Or   for version 3, after restoring diagonal blocks of A above.
                   =================================================================== */
                magma_sgetmatrix( M, N, d_A, ldda, h_R, lda, opts.queue );

                magma_int_t ldq = M;
                magma_int_t ldr = min_mn;
                float *Q, *R;
                float *work;
                TESTING_CHECK( magma_smalloc_cpu( &Q,    ldq*min_mn ));  // M by K
                TESTING_CHECK( magma_smalloc_cpu( &R,    ldr*N ));       // K by N
                TESTING_CHECK( magma_smalloc_cpu( &work, min_mn ));

                // generate M by K matrix Q, where K = min(M,N)
                lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq );
                lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info );
                assert( info == 0 );

                // copy K by N matrix R
                lapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr );
                lapackf77_slacpy( "Upper", &min_mn, &N, h_R, &lda,        R, &ldr );

                // error = || R - Q^H*A || / (N * ||A||)
                blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M,
                               &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr );
                Anorm = lapackf77_slange( "1", &M,      &N, h_A, &lda, work );
                error = lapackf77_slange( "1", &min_mn, &N, R,   &ldr, work );
                if ( N > 0 && Anorm > 0 )
                    error /= (N*Anorm);

                // set R = I (K by K identity), then R = I - Q^H*Q
                // error = || I - Q^H*Q || / N
                lapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr );
                blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr );
                error2 = safe_lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work );
                if ( N > 0 )
                    error2 /= N;

                magma_free_cpu( Q    );  Q    = NULL;
                magma_free_cpu( R    );  R    = NULL;
                magma_free_cpu( work );  work = NULL;
            }
            else if ( opts.check == 2 && M >= N && (opts.version == 1 || opts.version == 3 || opts.version == 4) ) {
                /* =====================================================================
                   Check the result by solving consistent linear system, A*x = b.
                   Only for versions 1 & 3 with M >= N.
                   =================================================================== */
                magma_int_t lwork2;
                float *x, *b, *hwork;
                magmaFloat_ptr d_B;

                // initialize RHS, b = A*random
                TESTING_CHECK( magma_smalloc_cpu( &x, N ));
                TESTING_CHECK( magma_smalloc_cpu( &b, M ));
                lapackf77_slarnv( &ione, ISEED, &N, x );
                blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione );
                // copy to GPU
                TESTING_CHECK( magma_smalloc( &d_B, M ));
                magma_ssetvector( M, b, 1, d_B, 1, opts.queue );

                if ( opts.version == 1 || opts.version == 4) {
                    // allocate hwork
                    magma_sgeqrs_gpu( M, N, 1,
                                      d_A, ldda, tau, dT,
                                      d_B, M, tmp, -1, &info );
                    lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
                    TESTING_CHECK( magma_smalloc_cpu( &hwork, lwork2 ));

                    // solve linear system
                    magma_sgeqrs_gpu( M, N, 1,
                                      d_A, ldda, tau, dT,
                                      d_B, M, hwork, lwork2, &info );
                    if (info != 0) {
                        printf("magma_sgeqrs returned error %lld: %s.\n",
                               (long long) info, magma_strerror( info ));
                    }
                    magma_free_cpu( hwork );
                }
                #if defined(MAGMA_HAVE_CUDA) || defined(MAGMA_HAVE_HIP)
                else if ( opts.version == 3 ) {
                    // allocate hwork
                    magma_sgeqrs3_gpu( M, N, 1,
                                       d_A, ldda, tau, dT,
                                       d_B, M, tmp, -1, &info );
                    lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
                    TESTING_CHECK( magma_smalloc_cpu( &hwork, lwork2 ));

                    // solve linear system
                    magma_sgeqrs3_gpu( M, N, 1,
                                       d_A, ldda, tau, dT,
                                       d_B, M, hwork, lwork2, &info );
                    if (info != 0) {
                        printf("magma_sgeqrs3 returned error %lld: %s.\n",
                               (long long) info, magma_strerror( info ));
                    }
                    magma_free_cpu( hwork );
                }
                #endif
                else {
                    printf( "Unknown version %lld\n", (long long) opts.version );
                    return -1;
                }
                magma_sgetvector( N, d_B, 1, x, 1, opts.queue );

                // compute r = Ax - b, saved in b
                blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione );

                // compute residual |Ax - b| / (max(m,n)*|A|*|x|)
                float norm_x, norm_A, norm_r, work[1];
                norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work );
                norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work );
                norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work );

                magma_free_cpu( x );
                magma_free_cpu( b );
                magma_free( d_B );
                error = norm_r / (max(M,N) * norm_A * norm_x);
            }

            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_sgeqrf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0) {
                    printf("lapackf77_sgeqrf returned error %lld: %s.\n",
                           (long long) info, magma_strerror( info ));
                }
            }

            /* =====================================================================
               Print performance and error.
               =================================================================== */
            printf("%5lld %5lld   ", (long long) M, (long long) N );
            if ( opts.lapack ) {
                printf( "%7.2f (%7.2f)", cpu_perf, cpu_time );
            }
            else {
                printf("  ---   (  ---  )" );
            }
            printf( "   %7.2f (%7.2f)   ", gpu_perf, gpu_time );
            if ( opts.check == 1 ) {
                bool okay = (error < tol && error2 < tol);
                status += ! okay;
                printf( "%11.2e   %11.2e   %s\n", error, error2, (okay ? "ok" : "failed") );
            }
            else if ( opts.check == 2 ) {
                if ( M >= N ) {
                    bool okay = (error < tol);
                    status += ! okay;
                    printf( "%10.2e   %s\n", error, (okay ? "ok" : "failed") );
                }
                else {
                    printf( "(error check only for M >= N)\n" );
                }
            }
            else {
                printf( "    ---\n" );
            }

            magma_free_cpu( tau    );
            magma_free_cpu( h_A    );
            magma_free_cpu( h_work );

            magma_free_pinned( h_R );

            magma_free( d_A );

            if ( opts.version == 1 || opts.version == 3 || opts.version == 4 ) {
                magma_free( dT );
            }

            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );

    opts.cleanup();
    TESTING_CHECK( magma_finalize() );
    return status;
}

流程分析：

改写为：

testing_cusolver_sgeqrf_gpu.cpp