环境版本:(待补充:CUDA Toolkit 与 Bazel 的具体版本)
工程目录:(待补充:WORKSPACE、BUILD、main.cc、matmul.h、matmul.cu 的布局,见下文)
测试输出:(待补充:`bazel run //:cuda_matmul` 的运行输出)
WORKSPACE
参考仓库:bazel-contrib/rules_cuda(CUDA rules for Bazel)及其 examples 目录
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

# CUDA rules for Bazel: https://github.com/bazel-contrib/rules_cuda
# Usage examples:
#   https://github.com/bazel-contrib/rules_cuda/tree/main/examples
#   https://github.com/bazel-contrib/rules_cuda/blob/main/examples/basic/BUILD.bazel
http_archive(
    name = "rules_cuda",
    # NOTE(review): do NOT pin a sha256 against the moving `main` branch —
    # the archive's hash changes every time main advances, so the fetch
    # breaks. For a hermetic build, switch `urls`/`strip_prefix` to a tagged
    # release (e.g. .../archive/refs/tags/vX.Y.Z.tar.gz) and pin its sha256.
    strip_prefix = "rules_cuda-main",
    urls = ["https://github.com/bazel-contrib/rules_cuda/archive/main.zip"],
)

load("@rules_cuda//cuda:repositories.bzl", "rules_cuda_dependencies", "rules_cuda_toolchains")

# Pulls in rules_cuda's own dependencies and registers the CUDA toolchains
# detected on the host.
rules_cuda_dependencies()

rules_cuda_toolchains(register_toolchains = True)
BUILD
# Only load what is actually used below (buildifier flags unused load symbols;
# `cc_library` and `cuda_binary` were loaded but never referenced).
load("@rules_cc//cc:defs.bzl", "cc_binary")
load("@rules_cuda//cuda:defs.bzl", "cuda_library")

# Device code (kernel + host wrapper), compiled with the CUDA toolchain.
cuda_library(
    name = "cuda_matmul_lib",
    srcs = ["matmul.cu"],
    hdrs = ["matmul.h"],
)

# Plain C++ entry point; links against the CUDA library target.
cc_binary(
    name = "cuda_matmul",
    srcs = ["main.cc"],
    deps = [":cuda_matmul_lib"],
)
main.cc
#include <iostream>
#include <vector>
#include "matmul.h"
#define N 4 // matrix dimension (square); adjust as needed

// Print a size x size row-major matrix to stdout, one row per line,
// with a trailing space after each element.
void printMatrix(const float* M, int size) {
    for (int r = 0; r < size; ++r) {
        for (int c = 0; c < size; ++c) {
            std::cout << M[r * size + c] << " ";
        }
        std::cout << std::endl;
    }
}
// Demo driver: build A = [1..16] (row-major) and B = identity, multiply
// them on the GPU via matrixMultiply, and print A, B, and C (= A * B).
int main() {
    float A[N * N];
    float B[N * N] = {0};
    float C[N * N] = {0};

    // A holds 1..16 in row-major order; B is the N x N identity.
    for (int i = 0; i < N * N; ++i) {
        A[i] = static_cast<float>(i + 1);
    }
    for (int d = 0; d < N; ++d) {
        B[d * N + d] = 1.0f;
    }

    std::cout << "Matrix A:\n";
    printMatrix(A, N);
    std::cout << "Matrix B:\n";
    printMatrix(B, N);

    // GPU matrix multiply (declared in matmul.h).
    matrixMultiply(A, B, C, N);

    std::cout << "Matrix C (A * B):\n";
    printMatrix(C, N);
    return 0;
}
matmul.h
// Public interface of the CUDA matrix-multiply wrapper.
#ifndef MATMUL_H
#define MATMUL_H

// Computes C = A * B for square N x N row-major matrices.
// A, B, C are host pointers; the implementation (matmul.cu) copies A and B
// to the GPU, launches the kernel, and copies the product back into C.
void matrixMultiply(float *A, float *B, float *C, int N);

#endif // MATMUL_H
matmul.cu
#include <cstdio>   // fprintf
#include <cstdlib>  // abort
#include <cuda_runtime.h>

#include "matmul.h"
// CUDA kernel: each thread computes one element of C = A * B for square
// N x N row-major matrices. Expected launch: a 2D grid of 2D blocks
// covering at least N x N threads; out-of-range threads exit via the
// bounds check. A and B are read-only, so they are marked
// `const __restrict__` to enable the read-only data cache path.
__global__ void matmulKernel(const float* __restrict__ A,
                             const float* __restrict__ B,
                             float* __restrict__ C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float sum = 0.0f;
        // Dot product of row `row` of A with column `col` of B.
        for (int i = 0; i < N; i++) {
            sum += A[row * N + i] * B[i * N + col];
        }
        C[row * N + col] = sum;
    }
}
// Abort with a readable message if a CUDA runtime call failed. Without
// this, a failed malloc/copy/launch is silently ignored and the caller
// receives garbage in C.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        abort();
    }
}

// Host wrapper: computes C = A * B for square N x N row-major matrices.
// Allocates device buffers, copies A/B to the GPU, launches matmulKernel,
// and copies the product back into C. Aborts on any CUDA error.
void matrixMultiply(float *A, float *B, float *C, int N) {
    float *d_A, *d_B, *d_C;
    size_t size = (size_t)N * N * sizeof(float);  // widen before multiply

    // Device allocations.
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc d_C");

    // Copy inputs to the GPU.
    checkCuda(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy A");
    checkCuda(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "copy B");

    // 16x16 blocks; ceil-divide so the grid covers N x N even when N is
    // not a multiple of 16 (the kernel bounds-checks the tail).
    dim3 blockDim(16, 16);
    dim3 gridDim((N + blockDim.x - 1) / blockDim.x,
                 (N + blockDim.y - 1) / blockDim.y);

    matmulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
    checkCuda(cudaGetLastError(), "kernel launch");          // bad-config errors
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // async faults

    // Copy the result back to the host.
    checkCuda(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost), "copy C");

    // Release device memory (best effort during teardown).
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}