OpenBLAS QR decompose example

news2024/9/24 13:59:19

1. 安装 OpenBLAS

release 版本:

Makefile:

 
all:
	wget https://github.com/OpenMathLib/OpenBLAS/archive/refs/tags/v0.3.27.tar.gz
	tar zxf v0.3.27.tar.gz 
	make -C OpenBLAS-0.3.27 FC=gfortran -j
 
install:
	make -C OpenBLAS-0.3.27 install PREFIX=../local/
#PREFIX=/opt/OpenBLAS
# PREFIX=../local/
 
clean:
	-rm -rf ./local/ ./OpenBLAS-0.3.27/ ./v0.3.27.tar.gz

编译安装 OpenBLAS:

$ make

$ sudo make install

默认安装在 /opt/OpenBLAS

PREFIX=../local/ 指定安装目录;

debug 版本:

all:
	wget https://github.com/OpenMathLib/OpenBLAS/archive/refs/tags/v0.3.27.tar.gz
	tar zxf v0.3.27.tar.gz 
	make -C OpenBLAS-0.3.27 DEBUG=1  FC=gfortran -j
 
install:
	make -C OpenBLAS-0.3.27 install PREFIX=../local/
#PREFIX=/opt/OpenBLAS
# PREFIX=../local/
 
clean:
	-rm -rf ./local/ ./OpenBLAS-0.3.27/ ./v0.3.27.tar.gz

2, 示例源码

ex_dgetrf_dorgqr_inverseQ2H_lapack.cpp

#include <stdio.h>
#include <math.h>


typedef long int lint;

extern "C"{
    void dgeqrf_(long int* M, long int* N, double* A, long int* LDA, double* TAU, double* WORK, long int* LWORK, int* INFO);
    void dorgqr_(long int* M, long int* N, long int* K, double* A, long int *LDA, double* TAU, double* WORK, long int* LWORK, int* INFO);
}

void init_matrix(double* A, long int m, long int n, long int lda)
{
    srand(2024);
    for(long int i=0; i<m; i++){
        for(long int j=0; j<n; j++){
            A[i + j*lda] = 1.0*rand()/RAND_MAX + 1.0*rand()/RAND_MAX;
        }
    }
}

void print_matrix(double* A, long int m, long int n, long int lda)
{
    for(long int i=0; i<m; i++){
        for(long int j=0; j<n; j++){
            printf("%8.4f ", A[i + j*lda]);
        }
        printf("\n");
    }
}

int main()
{
    lint m = 12;
    lint n = 12;// n <= m
    lint k = n;
    lint lda = m;
    lint lwork = 0;
    int info = -1;
    lint min_mn = std::fmin(m, n);

    double* work = nullptr;
    double* A = nullptr;
    double* tau = nullptr;

    A = (double*)malloc(lda*n*sizeof(double));
    init_matrix(A, m, n, lda);
    printf("A =\n");
    print_matrix(A, m, n, lda);

    tau = (double*)malloc(min_mn*sizeof(double));
    work = (double*)malloc(1*sizeof(double));
    lwork = -1;
    dgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
    printf("info = %d\n", info);
    lwork = work[0];
    printf("lwork = %ld\n", lwork);
    free(work);
    work = nullptr;
    work = (double*)malloc(lwork*sizeof(double));

    dgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
    printf("info = %d\n", info);
    printf("A =\n");
    print_matrix(A, m, n, lda);

    printf("tau =\n");
    print_matrix(tau, 1, min_mn, 1);

    free(work);
    work = nullptr;
    work = (double*)malloc(1*sizeof(double));
    lwork = -1;
    dorgqr_(&m, &n, &k, A, &lda, tau, work, &lwork, &info);
    printf("info = %d\n", info);
    lwork = work[0];
    printf("lwork = %ld\n", lwork);
    free(work);
    work = nullptr;
    work = (double*)malloc(lwork*sizeof(double));
    dorgqr_(&m, &n, &k, A, &lda, tau, work, &lwork, &info);
    printf("info = %d\n", info);
    printf("A =\n");
    print_matrix(A, m, n, lda);

    /









    return 0;
}

3, 运行效果

Makefile:

EXE := ex_dgetrf_dorgqr_inverseQ2H_lapack


all: $(EXE)



%: %.cpp
	g++ -g $< -o $@ -lopenblas -lpthread


.PHONY: clean
clean:
	-rm $(EXE)

编译运行:

$ make

$ ./ex_dgetrf_dorgqr_inverseQ2H_lapack

效果:

 4, debug DnSgeqrf

#include<time.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>
#include<cusolverDn.h>
#include <iostream>

#define BILLION 1000000000L;
//#define printf(X, ... ) ;


void print_vector(float* tau, int n)
{
    for(int i=((n-7)<0? 0 : n-7); i<n; i++)
        printf("%7.4f ", tau[i]);
}

void _print_matrix(float* A, int m, int n, int lda)
{
    for(int i=((m-7)<0? 0 : m-7); i<m; i++)    {
        for(int j=((n-7)<0? 0 : n-7); j<n; j++){
            printf("%7.4f, ", A[i + j*lda]);
        }
        printf("\n");
    }
}

void print_matrix(float* A, int m, int n, int lda)
{
    for(int i=0; i<(m<32?m:32); i++)    {
        for(int j=0; j<(n<32?n:32); j++){
            printf("%9.4f ", A[i + j*lda]);
        }
        printf("\n");
    }
}

void print_matrix_tail(float* A, int m, int n, int lda)
{
    for(int i=((m<32)? 0 : m-32); i<m; i++)    {
        for(int j=((n<32)? 0 : n-32); j<n; j++){
            printf("%9.4f, ", A[i + j*lda]);
        }
        printf("\n");
    }
}

void init_matrix(float* A, int m, int n, int lda, int seed)
{
    srand(seed);

    for(int j=0; j<n; j++)    {
        for(int i=0; i<m; i++){
            A[i + j*lda] = (rand()%1000)/750.0f;
        }
    }
}

void set_I_matrix(float* A, int M, int N, int lda)
{
    memset(A, 0, lda*N*sizeof(float));
    int n = M<=N? M:N;
    for(int i=0; i<M; i++){
        A[i+i*lda] = 1.0f;
    }
}

void tau_matrix(float* A, int m, int n, int lda)
{
    printf("\ntuu = ");
    for(int j=(n-7<0? 0:n-7); j<n-1; j++){
        float tau, dot;

        tau = 0.0f;
        dot = 0.0f;
        for(int i=j+1; i<m; i++){
            dot += A[i + j*lda]*A[i + j*lda];
        }
        dot += 1.0f;
        tau = 2.0f/dot;
        printf("%7.4f ", tau);
    }
}

typedef long int lint;

extern "C"{
void sgeqrf_(long int* M, long int* N, float* A, long int* LDA, float* TAU, float* WORK, long int* LWORK, int* INFO);
}

void lapack_sgeqrf_test(float* A, lint m, lint n, lint lda)
{
    struct timespec start, stop; // variables for timing
    double accum;                // elapsed time variable
	lint lwork = 0;
	int info = -1;
	float* work = nullptr;
	lint min_mn = std::fmin(m, n);

	float* tau = nullptr;
	tau = (float*)malloc(min_mn*sizeof(float));
	work = (float*)malloc(1*sizeof(float));

	lwork = -1;
	sgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
	printf("info(lapack) = %d\n", info);
	lwork = work[0];
	printf("lwork = %ld", lwork);
	free(work);
	work = nullptr;
	work = (float*)malloc(lwork*sizeof(float));



    clock_gettime(CLOCK_REALTIME, &start); // start timer
	sgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);

    clock_gettime(CLOCK_REALTIME, &stop);  // stop timer
    accum = (stop.tv_sec - start.tv_sec) + // elapsed time
            (stop.tv_nsec - start.tv_nsec) / (double)BILLION;
    printf(" Sgeqrf(lapack) time : %lf sec .\n", accum); // print elapsed time



	printf("info(lapack) = %d\n", info);
	printf("A = R+V-I (lapack)::head =\n");
	print_matrix(A, m, n, lda);	
	printf("A = R+V-I (lapack)::tail =\n");
	print_matrix_tail(A, m, n, lda);	
	printf("tau(lapack) =\n");
	print_matrix(tau, 1, n, 1);

	free(work);	work = nullptr;
	free(tau); 	tau = nullptr;

}






int main(int argc, char *argv[]){

    struct timespec start, stop; // variables for timing
    double accum;                // elapsed time variable
    cusolverDnHandle_t cusolverH;
    cublasHandle_t cublasH;
    cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat = cudaSuccess;
    const int m = 1024*1024*8;//8192*2*16; // number of rows of A
    const int n = 32;//m;//8192; // number of columns of A
    
    printf("m=%d, n=%d\n", m, n);
    std::cout<<"m = "<< m <<" n = "<< n << std::endl;
    const int lda = m;  // leading dimension of A
    // declare matrices A and Q,R on the host
    float *A, *Q, *R;

    // prepare host memeory
    A = (float *)malloc(lda * n * sizeof(float)); // matr . A on the host
    Q = (float *)malloc(lda * n * sizeof(float)); // orthogonal factor Q
    R = (float *)malloc(n * n * sizeof(float));   // R=I-Q^T*Q

    init_matrix(A, m, n, lda, 2003);
    std::cout<<"A ="<<std::endl;
    print_matrix(A, m, n, lda);


    float *Al = nullptr;
    Al = (float*)malloc(lda*n*sizeof(float));
    memcpy(Al, A, lda*n*sizeof(float));

    lapack_sgeqrf_test(Al, m, n, lda);
    free(Al);
    Al = nullptr;


    float *d_A, *d_R;                    // matrices A, R on the device
    float *d_tau;                        // scalars defining the elementary reflectors
    int *devInfo;                        // info on the device
    float *d_work;                       // workspace on the device
    // workspace sizes
    int lwork_geqrf = 0;
    int lwork_orgqr = 0;
    int lwork = 0;
    int info_gpu = 0;             // info copied from the device
    const float h_one = 1;        // constants used in
    const float h_minus_one = -1; // computations of I-Q^T*Q
    
    // create cusolver and cublas handles
    cusolver_status = cusolverDnCreate(&cusolverH);
    cublas_status = cublasCreate(&cublasH);

    // prepare device memory
    cudaStat = cudaMalloc((void **)&d_A, lda*n*sizeof(float));
    cudaStat = cudaMalloc((void **)&d_tau,   n*sizeof(float));
    cudaStat = cudaMalloc((void **)&devInfo,   sizeof(int));
    cudaStat = cudaMalloc((void **)&d_R,   n*n*sizeof(float));
    cudaStat = cudaMemcpy(d_A, A,        lda*n*sizeof(float),
                          cudaMemcpyHostToDevice); // copy d_A <- A

    // compute working space for geqrf and orgqr
    cusolver_status = cusolverDnSgeqrf_bufferSize(cusolverH, m, n, d_A, lda, &lwork_geqrf); // compute Sgeqrf buffer size
printf("lwork_geqrf = %d Bytes\n", lwork_geqrf*4);
    cusolver_status = cusolverDnSorgqr_bufferSize(cusolverH, m, n, n, d_A, lda, d_tau, &lwork_orgqr); // and Sorgqr b. size
    lwork = (lwork_geqrf > lwork_orgqr) ? lwork_geqrf : lwork_orgqr;

    // device memory for workspace
    cudaStat = cudaMalloc((void **)&d_work, sizeof(float) * lwork);
    printf("LL::\n");

    cudaStat = cudaDeviceSynchronize();
    // QR factorization for d_A
    clock_gettime(CLOCK_REALTIME, &start); // start timer
    cusolver_status = cusolverDnSgeqrf(cusolverH, m, n, d_A, lda,
                                       d_tau, d_work, lwork, devInfo);
    cudaStat = cudaDeviceSynchronize();
    clock_gettime(CLOCK_REALTIME, &stop);  // stop timer
    accum = (stop.tv_sec - start.tv_sec) + // elapsed time
            (stop.tv_nsec - start.tv_nsec) / (double)BILLION;
    printf(" Sgeqrf(gpu) time : %lf sec .\n", accum); // print elapsed time
    

    cudaStat = cudaMemcpy(&info_gpu, devInfo, sizeof(int),
                          cudaMemcpyDeviceToHost); // copy devInfo -> info_gpu

    // check geqrf error code
    printf("\n after geqrf : info_gpu = %d\n", info_gpu);
///
    printf("\nA =\n");
    print_matrix(A, m, n, lda);
    cudaStat = cudaMemcpy(A, d_A, sizeof(float) * lda * n,
                          cudaMemcpyDeviceToHost);
    printf("\nA = V+R-I(gpu)::head =\n");
    print_matrix(A, m, n, lda);
    printf("A = R+V-I (lapack)::tail =\n");
    print_matrix_tail(A, m, n, lda);	

#if 0

 /
    float* tau = nullptr;

    tau = (float*)malloc(n*sizeof(float));
    cudaStat = cudaMemcpy(tau, d_tau, n*sizeof(float), cudaMemcpyDeviceToHost);
    printf("\ntau = ");print_vector(tau, n);
    tau_matrix(A, m, n, lda);

    free(tau);
///
    // apply orgqr function to compute the orthogonal matrix Q
    // using elementary reflectors vectors stored in d_A and
    // elementary reflectors scalars stored in d_tau ,
    cusolver_status = cusolverDnSorgqr(cusolverH, m, n, n, d_A,
                                       lda, d_tau, d_work, lwork, devInfo);
    cudaStat = cudaDeviceSynchronize();
    cudaStat = cudaMemcpy(&info_gpu, devInfo, sizeof(int),
                          cudaMemcpyDeviceToHost); // copy devInfo -> info_gpu
    // check orgqr error code
    printf("\n after orgqr : info_gpu = %d\n", info_gpu);
    cudaStat = cudaMemcpy(Q, d_A, sizeof(float) * lda * n,
                          cudaMemcpyDeviceToHost); // copy d_A ->Q

    set_I_matrix(R, n, n, n);
    printf("\nR=\n");
    print_matrix(R, n, n, n);

    cudaStat = cudaMemcpy(d_R, R, sizeof(float) * n * n,
                          cudaMemcpyHostToDevice); // copy R-> d_R
    // compute R = -Q**T*Q + I
    cublas_status = cublasSgemm_v2(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
                                   n, n, m, &h_minus_one, d_A, lda, d_A, lda, &h_one, d_R, n);
    float dR_nrm2 = 0.0; // norm value
    // compute the norm of R = -Q**T*Q + I
    cublas_status = cublasSnrm2_v2(cublasH, n * n, d_R, 1, &dR_nrm2);
    printf("||I - Q^T*Q|| = %E\n", dR_nrm2); // print the norm

#endif
    // free memory
    cudaFree(d_A);
    cudaFree(d_tau);
    cudaFree(devInfo);
    cudaFree(d_work);
    cudaFree(d_R);
    cublasDestroy(cublasH);
    cusolverDnDestroy(cusolverH);
    cudaDeviceReset();
    return 0;
}
// Sqeqrf time : 0.434779 sec .
// after geqrf : info_gpu = 0
// after orgqr : info_gpu = 0
//|I - Q**T*Q| = 2.515004E -04
//
//

Makefile

TARGETS = qr_cusolver_sgeqrf
all: $(TARGETS)
 
LD_FLAGS = -L/usr/local/cuda/lib64  	\
	   -lcudart -lcudadevrt 	\
	   -lcusolver -lcublas 		\
	   -lcublasLt -lpthread

LAPACK_DIR := /home/hipper/ex_cusolverDnSgeqrf_gpu_cpu/lapack/local/lib

#LD_FLAGS += $(LAPACK_DIR)/liblapack.a $(LAPACK_DIR)/librefblas.a -lgfortran
LD_FLAGS += -lopenblas 
 
%: %.cpp
	g++ -g $< -o $@  -I/usr/local/cuda/include  $(LD_FLAGS) -fopenmp 
	
#	-I./cblas_source -L./cblas_source/CBLAS/lib -lcblas_LINUX -L/usr/local/lib -lblas -lgfortran
 
.PHONY:clean
clean:
	-rm -f $(TARGETS)

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/2104885.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

字节跳动一面

字节跳动一面【C后端开发】 base &#xff1a; 深圳 岗位&#xff1a;C后端开发 时间&#xff1a; 2024/8/30 文章目录 基本介绍C语言1. 堆栈内存是否连续&#xff0c;为什么&#xff1f;2. int i0; i ; 两个线程同时执行10000次&#xff0c;i最终的数值是多少&#xff1f;3.…

【Java基础】代理

文章目录 代理代理模式的优点代理模式类型基于JDK的静态代理基于JDK的动态代理 代理 一种设计模式&#xff0c;不允许用户直接访问核心功能&#xff0c;而是通过代理来访问核心类的核心功能 举个例子&#xff0c;如果我们现在需要一个银行转账系统&#xff0c;我们需要在一个Ja…

git创建本地分支并track跟踪远程分支

git创建本地分支并track跟踪远程分支 查看本地分支与远程分支的映射关系&#xff1a; git branch -vv 查看远程都有什么分支&#xff1a; git branch -r 在本地自动新建一个xxx分支&#xff0c;且自动track跟踪远程的同名xxx分支&#xff1a; git checkout --track origin/xx…

MinIO Packet Pushers 播客: 汤姆-里昂,《NFS 必死》。

我们真的很喜欢 Packet Pushers 的团队。他们的播客是业内最好的播客之一&#xff0c;涵盖了从堆栈顶部到底部的技术。我们最近有机会赞助传奇人物 Tom Lyon 对 Ethan Banks 和 Drew Conry-Murray 的采访。Packet Pushers 的团队对 Tom 最近题为“NFS&#xff08;网络文件系统&…

数据结构(13)——平衡二叉树(红黑树)

欢迎来到博主的专栏——数据结构 博主ID&#xff1a;代码小号 文章目录 红黑树红黑树节点之间的关系红黑树的插入uncle节点为红色uncle节点是黑色或者没有uncle节点 红黑树 平衡二叉树最出名的除了AVL树之外就是红黑树&#xff08;RBTree&#xff09;&#xff0c;所谓红黑树&a…

JSON 格式详解

JSON 格式详解 随着互联网的发展和各种 Web 应用程序的普及&#xff0c;数据交换已经成为了我们日常开发中的重要环节。而在各种数据交换格式中&#xff0c;JSON&#xff08;JavaScript Object Notation&#xff09;作为一种轻量级的数据交换格式&#xff0c;以其简洁、易于阅…

2024.9.4(k8s)

一、前期准备 1、配置主机映射 [rootk8s-master ~]# vim /etc/hosts 192.168.8.168 k8s-master 192.168.8.176 k8s-node1 192.168.8.177 k8s-node2[rootk8s-master ~]# ping k8s-master 2、配置yum源 [rootk8s-master yum.repos.d]# vim kubernetes.repo [kubernetes] n…

智能医学(二)——MDPI特刊推荐

特刊征稿 01 特刊名称&#xff1a; eHealth and mHealth: Challenges and Prospects, 2nd Volume 参与期刊&#xff1a; 截止时间&#xff1a; 摘要提交截止日期 关闭(2024年6月30日) 投稿截止日期 2024年9月30日 目标及范围&#xff1a; 关键字 l 人工智能 l 计算机…

模拟实现string类及体验传统深拷贝

目录 strcpy 构造函数 优化 拷贝构造/深拷贝 operator size/operator[] operator<< c_str() 模拟string::iterator 插入 push_back() append() operator reserve npos strcpy strcpy是将/0拷贝完成后才会停止。 构造函数 string():_str(nullptr) {} st…

vite 打包 学习

plugins.jsimport vue from "vitejs/plugin-vue"; // 自动引入插件 import autoImport from "unplugin-auto-import/vite"; import setupExtend from "unplugin-vue-setup-extend-plus/vite"; import { ElementPlusResolver } from unplugin-vue…

国内Etsy开店注册账号需要什么?

Etsy作为海外知名二手电商平台&#xff0c;对于原创手工产品的商家来说具有巨大的市场流量与商机&#xff0c;但注册Etsy账号对于国内跨境电商用户来说确实存在一定的难度&#xff0c;作为Etsy也是小有名气的小商家&#xff0c;今天也分享一下开店的经验帮助大家出海。 一、Ets…

终端安全一体化解决方案有哪些?值得收藏的五款终端安全系统

随着信息技术的迅猛发展&#xff0c;企业和个人面临着越来越多的安全威胁。终端作为连接互联网和用户的第一线&#xff0c;其安全性直接影响到整个网络乃至组织的安全态势。为了应对日益复杂的网络环境&#xff0c;许多企业开始采用终端安全一体化解决方案&#xff0c;以期达到…

EVPN学习

三、VXLAN BGP EVPN基本原理_vxlan的type2,type3区别-CSDN博客 华为数通笔记VXLAN&BGP EVPN_vxlan为什么用bgp协议-CSDN博客

【MeterSphere】vnc连接不上selenium-chrome容器

目录 一、现象 二、查看配置文件 docker-compose-seleniarm.yml 三、处理 3.1 删除上图当中的三行 3.2 msctl reload 3.3 重新连接 前言:使用vnc连不上ms的selenium-chrome容器,看不到里面运行情况,以前其实可以,后来不行了,研究了一下 一、现象 输入IP:端口,连…

vmware 17.6 pro for personal USE初体验

新学期开学了&#xff0c;暑假期间把台式机放在办公室远程&#xff0c;无赖期间经常断电&#xff0c;把我的老台给烧坏了&#xff0c;检测了下固态硬盘和机械硬盘&#xff0c;好歹能用。但是win11的系统奔溃了。就花了半天时间重装。*v* 悲剧的是&#xff0c;一些软件环境必须…

怎么合并pdf文件?6个PDF文件合并成一个,只需要这5步!

在日常工作和学习中&#xff0c;我们经常需要处理多个PDF文件&#xff0c;有时为了方便查阅和管理&#xff0c;需要将它们合并成一个文件。以下是几种实用的方法来合并PDF文件&#xff0c;特别是如何将6个PDF文件合并成一个。 PDF合并工具推荐1. 金舟PDF编辑器 第一步、从金舟…

php民宿短租平台Java民宿预约系统python民宿预订住宿与可视化分析系统(源码、调试、LW、开题、PPT)

&#x1f495;&#x1f495;作者&#xff1a;计算机源码社 &#x1f495;&#x1f495;个人简介&#xff1a;本人 八年开发经验&#xff0c;擅长Java、Python、PHP、.NET、Node.js、Android、微信小程序、爬虫、大数据、机器学习等&#xff0c;大家有这一块的问题可以一起交流&…

【C++ 第十八章】C++11 新增语法(4)

前情回顾&#xff1a; 【C11 新增语法&#xff08;1&#xff09;&#xff1a;1~6 点】 C11出现与历史、花括号统一初始化、initializer_list初始化列表、 auto、decltype、nullptr、STL的一些新变化 【C11 新增语法&#xff08;2&#xff09;&#xff1a;7~8 点】 右值引用和…

爬虫练习(js逆向解密)

目标 网站地址交易列表 - 福建省公共资源交易电子公共服务平台 (fj.gov.cn) 抓取内容如下&#xff1a; 分析 查找js代码 点击下一页翻页的时候&#xff0c;查看请求返回的数据&#xff0c;发现data数据是经过加密后得到的 通过全局搜索data,发现有两千多个结果&#xff0c;一个…

Qt将数据库中的数据导出为html

一、源码分享 bool ReportFormUtils::exportReportHtml(QString &errString, const QString tableName, const QString savePathAndName, const QString tableTitle, const QString tableInfo) {Q_UNUSED(errString)Q_UNUSED(tableName)Q_UNUSED(savePathAndName)#define …