1. 安装 OpenBLAS
release 版本:
Makefile:
all:
wget https://github.com/OpenMathLib/OpenBLAS/archive/refs/tags/v0.3.27.tar.gz
tar zxf v0.3.27.tar.gz
make -C OpenBLAS-0.3.27 FC=gfortran -j
install:
make -C OpenBLAS-0.3.27 install PREFIX=../local/
#PREFIX=/opt/OpenBLAS
# PREFIX=../local/
clean:
-rm -rf ./local/ ./OpenBLAS-0.3.27/ ./v0.3.27.tar.gz
编译安装 OpenBLAS:
$ make
$ sudo make install
默认安装在 /opt/OpenBLAS
PREFIX=../local/ 指定安装目录;
debug 版本:
all:
wget https://github.com/OpenMathLib/OpenBLAS/archive/refs/tags/v0.3.27.tar.gz
tar zxf v0.3.27.tar.gz
make -C OpenBLAS-0.3.27 DEBUG=1 FC=gfortran -j
install:
make -C OpenBLAS-0.3.27 install PREFIX=../local/
#PREFIX=/opt/OpenBLAS
# PREFIX=../local/
clean:
-rm -rf ./local/ ./OpenBLAS-0.3.27/ ./v0.3.27.tar.gz
2, 示例源码
ex_dgetrf_dorgqr_inverseQ2H_lapack.cpp
#include <stdio.h>
#include <math.h>
typedef long int lint;
extern "C"{
void dgeqrf_(long int* M, long int* N, double* A, long int* LDA, double* TAU, double* WORK, long int* LWORK, int* INFO);
void dorgqr_(long int* M, long int* N, long int* K, double* A, long int *LDA, double* TAU, double* WORK, long int* LWORK, int* INFO);
}
void init_matrix(double* A, long int m, long int n, long int lda)
{
srand(2024);
for(long int i=0; i<m; i++){
for(long int j=0; j<n; j++){
A[i + j*lda] = 1.0*rand()/RAND_MAX + 1.0*rand()/RAND_MAX;
}
}
}
void print_matrix(double* A, long int m, long int n, long int lda)
{
for(long int i=0; i<m; i++){
for(long int j=0; j<n; j++){
printf("%8.4f ", A[i + j*lda]);
}
printf("\n");
}
}
int main()
{
lint m = 12;
lint n = 12;// n <= m
lint k = n;
lint lda = m;
lint lwork = 0;
int info = -1;
lint min_mn = std::fmin(m, n);
double* work = nullptr;
double* A = nullptr;
double* tau = nullptr;
A = (double*)malloc(lda*n*sizeof(double));
init_matrix(A, m, n, lda);
printf("A =\n");
print_matrix(A, m, n, lda);
tau = (double*)malloc(min_mn*sizeof(double));
work = (double*)malloc(1*sizeof(double));
lwork = -1;
dgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
printf("info = %d\n", info);
lwork = work[0];
printf("lwork = %ld\n", lwork);
free(work);
work = nullptr;
work = (double*)malloc(lwork*sizeof(double));
dgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
printf("info = %d\n", info);
printf("A =\n");
print_matrix(A, m, n, lda);
printf("tau =\n");
print_matrix(tau, 1, min_mn, 1);
free(work);
work = nullptr;
work = (double*)malloc(1*sizeof(double));
lwork = -1;
dorgqr_(&m, &n, &k, A, &lda, tau, work, &lwork, &info);
printf("info = %d\n", info);
lwork = work[0];
printf("lwork = %ld\n", lwork);
free(work);
work = nullptr;
work = (double*)malloc(lwork*sizeof(double));
dorgqr_(&m, &n, &k, A, &lda, tau, work, &lwork, &info);
printf("info = %d\n", info);
printf("A =\n");
print_matrix(A, m, n, lda);
/
return 0;
}
3, 运行效果
Makefile:
EXE := ex_dgetrf_dorgqr_inverseQ2H_lapack
all: $(EXE)
%: %.cpp
g++ -g $< -o $@ -lopenblas -lpthread
.PHONY: clean
clean:
-rm $(EXE)
编译运行:
$ make
$ ./ex_dgetrf_dorgqr_inverseQ2H_lapack
效果:
4, debug DnSgeqrf
#include<time.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>
#include<cusolverDn.h>
#include <iostream>
#define BILLION 1000000000L;
//#define printf(X, ... ) ;
void print_vector(float* tau, int n)
{
for(int i=((n-7)<0? 0 : n-7); i<n; i++)
printf("%7.4f ", tau[i]);
}
void _print_matrix(float* A, int m, int n, int lda)
{
for(int i=((m-7)<0? 0 : m-7); i<m; i++) {
for(int j=((n-7)<0? 0 : n-7); j<n; j++){
printf("%7.4f, ", A[i + j*lda]);
}
printf("\n");
}
}
void print_matrix(float* A, int m, int n, int lda)
{
for(int i=0; i<(m<32?m:32); i++) {
for(int j=0; j<(n<32?n:32); j++){
printf("%9.4f ", A[i + j*lda]);
}
printf("\n");
}
}
void print_matrix_tail(float* A, int m, int n, int lda)
{
for(int i=((m<32)? 0 : m-32); i<m; i++) {
for(int j=((n<32)? 0 : n-32); j<n; j++){
printf("%9.4f, ", A[i + j*lda]);
}
printf("\n");
}
}
void init_matrix(float* A, int m, int n, int lda, int seed)
{
srand(seed);
for(int j=0; j<n; j++) {
for(int i=0; i<m; i++){
A[i + j*lda] = (rand()%1000)/750.0f;
}
}
}
void set_I_matrix(float* A, int M, int N, int lda)
{
memset(A, 0, lda*N*sizeof(float));
int n = M<=N? M:N;
for(int i=0; i<M; i++){
A[i+i*lda] = 1.0f;
}
}
void tau_matrix(float* A, int m, int n, int lda)
{
printf("\ntuu = ");
for(int j=(n-7<0? 0:n-7); j<n-1; j++){
float tau, dot;
tau = 0.0f;
dot = 0.0f;
for(int i=j+1; i<m; i++){
dot += A[i + j*lda]*A[i + j*lda];
}
dot += 1.0f;
tau = 2.0f/dot;
printf("%7.4f ", tau);
}
}
typedef long int lint;
extern "C"{
void sgeqrf_(long int* M, long int* N, float* A, long int* LDA, float* TAU, float* WORK, long int* LWORK, int* INFO);
}
void lapack_sgeqrf_test(float* A, lint m, lint n, lint lda)
{
struct timespec start, stop; // variables for timing
double accum; // elapsed time variable
lint lwork = 0;
int info = -1;
float* work = nullptr;
lint min_mn = std::fmin(m, n);
float* tau = nullptr;
tau = (float*)malloc(min_mn*sizeof(float));
work = (float*)malloc(1*sizeof(float));
lwork = -1;
sgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
printf("info(lapack) = %d\n", info);
lwork = work[0];
printf("lwork = %ld", lwork);
free(work);
work = nullptr;
work = (float*)malloc(lwork*sizeof(float));
clock_gettime(CLOCK_REALTIME, &start); // start timer
sgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
clock_gettime(CLOCK_REALTIME, &stop); // stop timer
accum = (stop.tv_sec - start.tv_sec) + // elapsed time
(stop.tv_nsec - start.tv_nsec) / (double)BILLION;
printf(" Sgeqrf(lapack) time : %lf sec .\n", accum); // print elapsed time
printf("info(lapack) = %d\n", info);
printf("A = R+V-I (lapack)::head =\n");
print_matrix(A, m, n, lda);
printf("A = R+V-I (lapack)::tail =\n");
print_matrix_tail(A, m, n, lda);
printf("tau(lapack) =\n");
print_matrix(tau, 1, n, 1);
free(work); work = nullptr;
free(tau); tau = nullptr;
}
int main(int argc, char *argv[]){
struct timespec start, stop; // variables for timing
double accum; // elapsed time variable
cusolverDnHandle_t cusolverH;
cublasHandle_t cublasH;
cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS;
cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat = cudaSuccess;
const int m = 1024*1024*8;//8192*2*16; // number of rows of A
const int n = 32;//m;//8192; // number of columns of A
printf("m=%d, n=%d\n", m, n);
std::cout<<"m = "<< m <<" n = "<< n << std::endl;
const int lda = m; // leading dimension of A
// declare matrices A and Q,R on the host
float *A, *Q, *R;
// prepare host memeory
A = (float *)malloc(lda * n * sizeof(float)); // matr . A on the host
Q = (float *)malloc(lda * n * sizeof(float)); // orthogonal factor Q
R = (float *)malloc(n * n * sizeof(float)); // R=I-Q^T*Q
init_matrix(A, m, n, lda, 2003);
std::cout<<"A ="<<std::endl;
print_matrix(A, m, n, lda);
float *Al = nullptr;
Al = (float*)malloc(lda*n*sizeof(float));
memcpy(Al, A, lda*n*sizeof(float));
lapack_sgeqrf_test(Al, m, n, lda);
free(Al);
Al = nullptr;
float *d_A, *d_R; // matrices A, R on the device
float *d_tau; // scalars defining the elementary reflectors
int *devInfo; // info on the device
float *d_work; // workspace on the device
// workspace sizes
int lwork_geqrf = 0;
int lwork_orgqr = 0;
int lwork = 0;
int info_gpu = 0; // info copied from the device
const float h_one = 1; // constants used in
const float h_minus_one = -1; // computations of I-Q^T*Q
// create cusolver and cublas handles
cusolver_status = cusolverDnCreate(&cusolverH);
cublas_status = cublasCreate(&cublasH);
// prepare device memory
cudaStat = cudaMalloc((void **)&d_A, lda*n*sizeof(float));
cudaStat = cudaMalloc((void **)&d_tau, n*sizeof(float));
cudaStat = cudaMalloc((void **)&devInfo, sizeof(int));
cudaStat = cudaMalloc((void **)&d_R, n*n*sizeof(float));
cudaStat = cudaMemcpy(d_A, A, lda*n*sizeof(float),
cudaMemcpyHostToDevice); // copy d_A <- A
// compute working space for geqrf and orgqr
cusolver_status = cusolverDnSgeqrf_bufferSize(cusolverH, m, n, d_A, lda, &lwork_geqrf); // compute Sgeqrf buffer size
printf("lwork_geqrf = %d Bytes\n", lwork_geqrf*4);
cusolver_status = cusolverDnSorgqr_bufferSize(cusolverH, m, n, n, d_A, lda, d_tau, &lwork_orgqr); // and Sorgqr b. size
lwork = (lwork_geqrf > lwork_orgqr) ? lwork_geqrf : lwork_orgqr;
// device memory for workspace
cudaStat = cudaMalloc((void **)&d_work, sizeof(float) * lwork);
printf("LL::\n");
cudaStat = cudaDeviceSynchronize();
// QR factorization for d_A
clock_gettime(CLOCK_REALTIME, &start); // start timer
cusolver_status = cusolverDnSgeqrf(cusolverH, m, n, d_A, lda,
d_tau, d_work, lwork, devInfo);
cudaStat = cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &stop); // stop timer
accum = (stop.tv_sec - start.tv_sec) + // elapsed time
(stop.tv_nsec - start.tv_nsec) / (double)BILLION;
printf(" Sgeqrf(gpu) time : %lf sec .\n", accum); // print elapsed time
cudaStat = cudaMemcpy(&info_gpu, devInfo, sizeof(int),
cudaMemcpyDeviceToHost); // copy devInfo -> info_gpu
// check geqrf error code
printf("\n after geqrf : info_gpu = %d\n", info_gpu);
///
printf("\nA =\n");
print_matrix(A, m, n, lda);
cudaStat = cudaMemcpy(A, d_A, sizeof(float) * lda * n,
cudaMemcpyDeviceToHost);
printf("\nA = V+R-I(gpu)::head =\n");
print_matrix(A, m, n, lda);
printf("A = R+V-I (lapack)::tail =\n");
print_matrix_tail(A, m, n, lda);
#if 0
/
float* tau = nullptr;
tau = (float*)malloc(n*sizeof(float));
cudaStat = cudaMemcpy(tau, d_tau, n*sizeof(float), cudaMemcpyDeviceToHost);
printf("\ntau = ");print_vector(tau, n);
tau_matrix(A, m, n, lda);
free(tau);
///
// apply orgqr function to compute the orthogonal matrix Q
// using elementary reflectors vectors stored in d_A and
// elementary reflectors scalars stored in d_tau ,
cusolver_status = cusolverDnSorgqr(cusolverH, m, n, n, d_A,
lda, d_tau, d_work, lwork, devInfo);
cudaStat = cudaDeviceSynchronize();
cudaStat = cudaMemcpy(&info_gpu, devInfo, sizeof(int),
cudaMemcpyDeviceToHost); // copy devInfo -> info_gpu
// check orgqr error code
printf("\n after orgqr : info_gpu = %d\n", info_gpu);
cudaStat = cudaMemcpy(Q, d_A, sizeof(float) * lda * n,
cudaMemcpyDeviceToHost); // copy d_A ->Q
set_I_matrix(R, n, n, n);
printf("\nR=\n");
print_matrix(R, n, n, n);
cudaStat = cudaMemcpy(d_R, R, sizeof(float) * n * n,
cudaMemcpyHostToDevice); // copy R-> d_R
// compute R = -Q**T*Q + I
cublas_status = cublasSgemm_v2(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
n, n, m, &h_minus_one, d_A, lda, d_A, lda, &h_one, d_R, n);
float dR_nrm2 = 0.0; // norm value
// compute the norm of R = -Q**T*Q + I
cublas_status = cublasSnrm2_v2(cublasH, n * n, d_R, 1, &dR_nrm2);
printf("||I - Q^T*Q|| = %E\n", dR_nrm2); // print the norm
#endif
// free memory
cudaFree(d_A);
cudaFree(d_tau);
cudaFree(devInfo);
cudaFree(d_work);
cudaFree(d_R);
cublasDestroy(cublasH);
cusolverDnDestroy(cusolverH);
cudaDeviceReset();
return 0;
}
// Sqeqrf time : 0.434779 sec .
// after geqrf : info_gpu = 0
// after orgqr : info_gpu = 0
//|I - Q**T*Q| = 2.515004E -04
//
//
Makefile
TARGETS = qr_cusolver_sgeqrf
all: $(TARGETS)
LD_FLAGS = -L/usr/local/cuda/lib64 \
-lcudart -lcudadevrt \
-lcusolver -lcublas \
-lcublasLt -lpthread
LAPACK_DIR := /home/hipper/ex_cusolverDnSgeqrf_gpu_cpu/lapack/local/lib
#LD_FLAGS += $(LAPACK_DIR)/liblapack.a $(LAPACK_DIR)/librefblas.a -lgfortran
LD_FLAGS += -lopenblas
%: %.cpp
g++ -g $< -o $@ -I/usr/local/cuda/include $(LD_FLAGS) -fopenmp
# -I./cblas_source -L./cblas_source/CBLAS/lib -lcblas_LINUX -L/usr/local/lib -lblas -lgfortran
.PHONY:clean
clean:
-rm -f $(TARGETS)