🎯要点
- 使用英伟达GPU、大都会和并行回火算法模拟蒙特卡洛。
- 使用兰佐斯算法计算传输矩阵特征值。
- 使用 Suzuki-Trotter 公式归一化量子无序系统。
- 算法模型特征:多CUDA线程,多GPU和多任务式并行计算。
🍁磁态分析角度
- Python和MATLAB自旋玻璃投资组合神经网络广义方程
- Python和C++及MATLAB低温磁态机器学习模型
🍪语言内容分比
🍇CUDA矩阵乘法二维块平铺实现
从数学上讲,给定一个广义矩阵乘法运算
D
=
A
B
+
C
D=A B+C
D=AB+C,其中
D
∈
R
m
×
n
,
A
∈
R
m
×
k
,
B
∈
R
k
×
n
,
C
∈
R
m
×
n
D \in R ^{m \times n}, A \in R ^{m \times k}, B \in R ^{k \times n}, C \in R ^{m \times n}
D∈Rm×n,A∈Rm×k,B∈Rk×n,C∈Rm×n,矩阵可以分成更小的矩阵。
A
=
[
A
1
,
1
d
b
m
×
d
b
k
A
1
,
2
d
b
m
×
d
b
k
…
A
1
,
k
d
m
m
×
d
b
k
A
2
,
1
d
m
m
×
d
b
k
A
2
,
2
d
b
m
×
d
b
k
⋯
A
2
,
k
/
d
b
k
d
b
m
×
d
b
k
⋮
⋮
⋱
⋮
A
m
/
d
m
m
,
1
d
m
b
×
d
b
k
A
m
/
d
m
m
,
2
d
b
m
×
d
b
k
⋯
A
m
/
d
b
m
,
k
/
d
b
k
d
b
m
×
d
b
k
]
A=\left[\begin{array}{cccc} A_{1,1}^{d_{b m} \times d_{b k}} & A_{1,2}^{d_{b m} \times d_{b k}} & \ldots & A_{1, k}^{d_{m m} \times d_{b k}} \\ A_{2,1}^{d_{m m} \times d_{b k}} & A_{2,2}^{d_{b m} \times d_{b k}} & \cdots & A_{2, k / d_{b k}}^{d_{b m} \times d_{b k}} \\ \vdots & \vdots & \ddots & \vdots \\ A_{m / d_{m m}, 1}^{d_{m b} \times d_{b k}} & A_{m / d_{m m}, 2}^{d_{b m} \times d_{b k}} & \cdots & A_{m / d_{b m}, k / d_{b k}}^{d_{b m} \times d_{b k}} \end{array}\right]
A=
A1,1dbm×dbkA2,1dmm×dbk⋮Am/dmm,1dmb×dbkA1,2dbm×dbkA2,2dbm×dbk⋮Am/dmm,2dbm×dbk…⋯⋱⋯A1,kdmm×dbkA2,k/dbkdbm×dbk⋮Am/dbm,k/dbkdbm×dbk
B = [ B 1 , 1 d b k × d b n B 1 , 2 d b k × d b n … B 1 , n / d b n d b k × d b n B 2 , 1 d b k × d b n B 2 , 2 d b k × d b n … B 2 , n / d b n d b k × d b n ⋮ ⋮ ⋱ ⋮ B k / d b k , 1 d b k d b n B k / d b k , 2 d b k × d b n ⋯ B k / d b k , n / d b n d b k × d b n ] B=\left[\begin{array}{cccc} B_{1,1}^{d_{b k} \times d_{b n}} & B_{1,2}^{d_{b k} \times d_{b n}} & \ldots & B_{1, n / d_{b n}}^{d_{b k} \times d_{b n}} \\ B_{2,1}^{d_{b k} \times d_{b n}} & B_{2,2}^{d_{b k} \times d_{b n}} & \ldots & B_{2, n / d_{b n}}^{d_{b k} \times d_{b n}} \\ \vdots & \vdots & \ddots & \vdots \\ B_{k / d_{b k}, 1}^{d_{b k} d_{b n}} & B_{k / d_{b k}, 2}^{d_{b k} \times d_{b n}} & \cdots & B_{k / d_{b k}, n / d_{b n}}^{d_{b k} \times d_{b n}} \end{array}\right] B= B1,1dbk×dbnB2,1dbk×dbn⋮Bk/dbk,1dbkdbnB1,2dbk×dbnB2,2dbk×dbn⋮Bk/dbk,2dbk×dbn……⋱⋯B1,n/dbndbk×dbnB2,n/dbndbk×dbn⋮Bk/dbk,n/dbndbk×dbn
C = [ C 1 , 1 d b m × d b n C 1 , 2 d b m × d b n … C 1 , n / d b n d b m × d b n C 2 , 1 d b m × d b n C 2 , 2 d b m × d b n … C 2 , n / d b n d b m × d m n ⋮ ⋮ ⋱ ⋮ C m / d b m , 1 d b m × d l n C m / d b m , 2 d b n × d b n ⋯ C m / d b m , n / d b n d b m × d b n ] C=\left[\begin{array}{cccc} C_{1,1}^{d_{b m} \times d_{b n}} & C_{1,2}^{d_{b m} \times d_{b n}} & \ldots & C_{1, n / d_{b n}}^{d_{b m} \times d_{b n}} \\ C_{2,1}^{d_{b m} \times d_{b n}} & C_{2,2}^{d_{b m} \times d_{b n}} & \ldots & C_{2, n / d_{b n}}^{d_{b m} \times d_{m n}} \\ \vdots & \vdots & \ddots & \vdots \\ C_{m / d_{b m}, 1}^{d_{b m} \times d_{l n}} & C_{m / d_{b m}, 2}^{d_{b n} \times d_{b n}} & \cdots & C_{m / d_{b m}, n / d_{b n}}^{d_{b m} \times d_{b n}} \end{array}\right] C= C1,1dbm×dbnC2,1dbm×dbn⋮Cm/dbm,1dbm×dlnC1,2dbm×dbnC2,2dbm×dbn⋮Cm/dbm,2dbn×dbn……⋱⋯C1,n/dbndbm×dbnC2,n/dbndbm×dmn⋮Cm/dbm,n/dbndbm×dbn
D = [ D 1 , 1 d b m × d b n D 1 , 2 d b m × d b n … D 1 , n / d b n d b m × d b n D 2 , 1 d b m × d b n D 2 , 2 d b m × d b n … D 2 , n / d b n d b m × d m n ⋮ ⋮ ⋱ ⋮ D m / d b m , 1 d b m × d b n D m / d b m , 2 d b m × d b n ⋯ D m / d b m , n / d b n d b n × d l m ] D=\left[\begin{array}{cccc} D_{1,1}^{d_{b m} \times d_{b n}} & D_{1,2}^{d_{b m} \times d_{b n}} & \ldots & D_{1, n / d_{b n}}^{d_{b m} \times d_{b n}} \\ D_{2,1}^{d_{b m} \times d_{b n}} & D_{2,2}^{d_{b m} \times d_{b n}} & \ldots & D_{2, n / d_{b n}}^{d_{b m} \times d_{m n}} \\ \vdots & \vdots & \ddots & \vdots \\ D_{m / d_{b m}, 1}^{d_{b m} \times d_{b n}} & D_{m / d_{b m}, 2}^{d_{b m} \times d_{b n}} & \cdots & D_{m / d_{b m}, n / d_{b n}}^{d_{b n} \times d_{l_m}} \end{array}\right] D= D1,1dbm×dbnD2,1dbm×dbn⋮Dm/dbm,1dbm×dbnD1,2dbm×dbnD2,2dbm×dbn⋮Dm/dbm,2dbm×dbn……⋱⋯D1,n/dbndbm×dbnD2,n/dbndbm×dmn⋮Dm/dbm,n/dbndbn×dlm
代码实现如下:
template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
size_t BLOCK_TILE_SIZE_K, size_t NUM_THREADS, size_t BLOCK_TILE_SKEW_SIZE_X = 0U, size_t BLOCK_TILE_SKEW_SIZE_K = 0U>
__device__ void load_data_to_shared_memory(T const* A, size_t lda,
T const* B, size_t ldb,
T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K + BLOCK_TILE_SKEW_SIZE_K],
T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X + BLOCK_TILE_SKEW_SIZE_X],
size_t thread_block_tile_idx,
size_t thread_linear_idx,
size_t m, size_t n,
size_t k)
{
#pragma unroll
for (size_t load_idx{0U};
load_idx <
(BLOCK_TILE_SIZE_Y * BLOCK_TILE_SIZE_K + NUM_THREADS - 1U) /
NUM_THREADS;
++load_idx)
{
size_t const A_thread_block_tile_row_idx{
(thread_linear_idx + load_idx * NUM_THREADS) /
BLOCK_TILE_SIZE_K};
size_t const A_thread_block_tile_col_idx{
(thread_linear_idx + load_idx * NUM_THREADS) %
BLOCK_TILE_SIZE_K};
size_t const A_row_idx{blockIdx.y * BLOCK_TILE_SIZE_Y +
A_thread_block_tile_row_idx};
size_t const A_col_idx{thread_block_tile_idx * BLOCK_TILE_SIZE_K +
A_thread_block_tile_col_idx};
T val{static_cast<T>(0)};
if (A_row_idx < m && A_col_idx < k)
{
val = A[A_row_idx * lda + A_col_idx];
}
static_assert(BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_Y % NUM_THREADS ==
0U);
A_thread_block_tile[A_thread_block_tile_row_idx]
[A_thread_block_tile_col_idx] = val;
}
#pragma unroll
for (size_t load_idx{0U};
load_idx <
(BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_X + NUM_THREADS - 1U) /
NUM_THREADS;
++load_idx)
{
size_t const B_thread_block_tile_row_idx{
(thread_linear_idx + load_idx * NUM_THREADS) /
BLOCK_TILE_SIZE_X};
size_t const B_thread_block_tile_col_idx{
(thread_linear_idx + load_idx * NUM_THREADS) %
BLOCK_TILE_SIZE_X};
size_t const B_row_idx{thread_block_tile_idx * BLOCK_TILE_SIZE_K +
B_thread_block_tile_row_idx};
size_t const B_col_idx{blockIdx.x * BLOCK_TILE_SIZE_X +
B_thread_block_tile_col_idx};
T val{static_cast<T>(0)};
if (B_row_idx < k && B_col_idx < n)
{
val = B[B_row_idx * ldb + B_col_idx];
}
static_assert(BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_K % NUM_THREADS ==
0U);
B_thread_block_tile[B_thread_block_tile_row_idx]
[B_thread_block_tile_col_idx] = val;
}
}
template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
size_t BLOCK_TILE_SIZE_K>
__global__ void gemm_v02(size_t m, size_t n, size_t k, T alpha, T const* A,
size_t lda, T const* B, size_t ldb, T beta, T* C,
size_t ldc)
{
constexpr size_t NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y};
size_t const thread_linear_idx{threadIdx.y * blockDim.x + threadIdx.x};
size_t const C_col_idx{blockIdx.x * blockDim.x + threadIdx.x};
size_t const C_row_idx{blockIdx.y * blockDim.y + threadIdx.y};
__shared__ T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K];
__shared__ T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X];
size_t const num_thread_block_tiles{(k + BLOCK_TILE_SIZE_K - 1) /
BLOCK_TILE_SIZE_K};
T sum{static_cast<T>(0)};
for (size_t thread_block_tile_idx{0U};
thread_block_tile_idx < num_thread_block_tiles;
++thread_block_tile_idx)
{
load_data_to_shared_memory<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y,
BLOCK_TILE_SIZE_K, NUM_THREADS>(
A, lda, B, ldb, A_thread_block_tile, B_thread_block_tile,
thread_block_tile_idx, thread_linear_idx, m, n, k);
__syncthreads();
#pragma unroll
for (size_t k_i{0U}; k_i < BLOCK_TILE_SIZE_K; ++k_i)
{
sum += A_thread_block_tile[threadIdx.y][k_i] *
B_thread_block_tile[k_i][threadIdx.x];
}
__syncthreads();
}
if (C_row_idx < m && C_col_idx < n)
{
C[C_row_idx * ldc + C_col_idx] =
alpha * sum + beta * C[C_row_idx * ldc + C_col_idx];
}
}
template <typename T>
void launch_gemm_kernel_v02(size_t m, size_t n, size_t k, T const* alpha,
T const* A, size_t lda, T const* B, size_t ldb,
T const* beta, T* C, size_t ldc,
cudaStream_t stream)
{
constexpr unsigned int BLOCK_TILE_SIZE_X{32U};
constexpr unsigned int BLOCK_TILE_SIZE_Y{32U};
constexpr unsigned int BLOCK_TILE_SIZE_K{32U};
constexpr unsigned int NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y};
static_assert(BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_Y % NUM_THREADS == 0U);
static_assert(BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_K % NUM_THREADS == 0U);
dim3 const block_dim{BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, 1U};
dim3 const grid_dim{
(static_cast<unsigned int>(n) + block_dim.x - 1U) / block_dim.x,
(static_cast<unsigned int>(m) + block_dim.y - 1U) / block_dim.y, 1U};
gemm_v02<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, BLOCK_TILE_SIZE_K>
<<<grid_dim, block_dim, 0U, stream>>>(m, n, k, *alpha, A, lda, B, ldb,
*beta, C, ldc);
CHECK_LAST_CUDA_ERROR();
}
此 FP32 广义矩阵乘法实现的性能在 NVIDIA GeForce RTX 3090 GPU 上达到 2.66 TFLOPS,比之前的实现好很多。不过,它距离 GPU 的理论峰值性能还相差甚远。
使用二维块平铺和一维线程平铺实现
我们首先将矩阵
B
B
B 的数据从共享内存缓存到寄存器中。每个具有块线程索引
(
t
m
,
t
n
)
\left(t_m, t_n\right)
(tm,tn) 的线程,其中
t
m
∈
[
1
,
d
t
m
/
d
t
m
]
t_m \in\left[1, d_{t m} / d_{t m}\right]
tm∈[1,dtm/dtm] 和
t
n
∈
[
1
,
d
m
m
]
t_n \in\left[1, d_{m m}\right]
tn∈[1,dmm],现在负责计算小矩阵的
d
t
m
d_{t m}
dtm 个元素,其中
d
t
m
d_{t m}
dtm 是线程块大小。
(
D
b
m
,
b
m
d
m
×
d
m
)
t
m
t
m
+
d
m
,
t
n
=
(
∑
b
k
=
1
k
/
d
k
A
b
m
,
b
k
d
m
×
d
∗
B
b
k
,
b
n
d
k
×
d
m
+
C
b
m
,
b
n
d
m
×
d
m
)
t
m
t
m
+
d
m
,
t
n
=
∑
b
k
=
1
k
/
d
m
k
(
∑
t
k
=
1
d
m
(
A
b
m
,
b
k
d
m
×
d
k
k
)
t
m
t
m
+
d
m
,
t
k
(
B
b
k
,
b
n
d
m
×
d
m
)
t
k
,
t
n
)
+
(
C
b
m
,
b
n
d
m
×
d
m
)
t
m
t
m
+
d
m
,
t
n
\begin{aligned} &\begin{aligned} & \left(D_{b_m, b_m}^{d_m \times d_m}\right)_{t_m t_m+d_m, t_n}=\left(\sum_{b_k=1}^{k / d_k} A_{b_m, b_k}^{d_m \times d *} B_{b_k, b_n}^{d_k \times d_m}+C_{b_m, b_n}^{d_m \times d_m}\right)_{t_m t_m+d_m, t_n} \end{aligned}\\ &\begin{aligned} & =\sum_{b_k=1}^{k / d_{m k}}\left(\sum_{t_k=1}^{d_m}\left(A_{b_m, b_k}^{d_m \times d_{k_k}}\right)_{t_m t_m+d_m, t_k}\left(B_{b_k, b_n}^{d_m \times d_m}\right)_{t_k, t_n}\right)+\left(C_{b_m, b_n}^{d_m \times d_m}\right)_{t_m t_m+d_m, t_n} \end{aligned} \end{aligned}
(Dbm,bmdm×dm)tmtm+dm,tn=
bk=1∑k/dkAbm,bkdm×d∗Bbk,bndk×dm+Cbm,bndm×dm
tmtm+dm,tn=bk=1∑k/dmk(tk=1∑dm(Abm,bkdm×dkk)tmtm+dm,tk(Bbk,bndm×dm)tk,tn)+(Cbm,bndm×dm)tmtm+dm,tn
template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
size_t BLOCK_TILE_SIZE_K, size_t THREAD_TILE_SIZE_Y>
__global__ void gemm_v03(size_t m, size_t n, size_t k, T alpha, T const* A,
size_t lda, T const* B, size_t ldb, T beta, T* C,
size_t ldc)
{
constexpr size_t NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y /
THREAD_TILE_SIZE_Y};
size_t const thread_linear_idx{threadIdx.y * blockDim.x + threadIdx.x};
__shared__ T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K];
__shared__ T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X];
size_t const num_thread_block_tiles{(k + BLOCK_TILE_SIZE_K - 1) /
BLOCK_TILE_SIZE_K};
T C_thread_results[THREAD_TILE_SIZE_Y] = {static_cast<T>(0)};
for (size_t thread_block_tile_idx{0U};
thread_block_tile_idx < num_thread_block_tiles;
++thread_block_tile_idx)
{
load_data_to_shared_memory<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y,
BLOCK_TILE_SIZE_K, NUM_THREADS>(
A, lda, B, ldb, A_thread_block_tile, B_thread_block_tile,
thread_block_tile_idx, thread_linear_idx, m, n, k);
__syncthreads();
#pragma unroll
for (size_t k_i{0U}; k_i < BLOCK_TILE_SIZE_K; ++k_i)
{
size_t const B_thread_block_tile_row_idx{k_i};
T const B_val{
B_thread_block_tile[B_thread_block_tile_row_idx]
[thread_linear_idx % BLOCK_TILE_SIZE_X]};
#pragma unroll
for (size_t thread_tile_row_idx{0U};
thread_tile_row_idx < THREAD_TILE_SIZE_Y;
++thread_tile_row_idx)
{
size_t const A_thread_block_tile_row_idx{
thread_linear_idx / BLOCK_TILE_SIZE_X * THREAD_TILE_SIZE_Y +
thread_tile_row_idx};
size_t const A_thread_block_tile_col_idx{k_i};
T const A_val{A_thread_block_tile[A_thread_block_tile_row_idx]
[A_thread_block_tile_col_idx]};
C_thread_results[thread_tile_row_idx] += A_val * B_val;
}
}
__syncthreads();
}
#pragma unroll
for (size_t thread_tile_row_idx{0U};
thread_tile_row_idx < THREAD_TILE_SIZE_Y; ++thread_tile_row_idx)
{
size_t const C_row_idx{blockIdx.y * BLOCK_TILE_SIZE_Y +
thread_linear_idx / BLOCK_TILE_SIZE_X *
THREAD_TILE_SIZE_Y +
thread_tile_row_idx};
size_t const C_col_idx{blockIdx.x * BLOCK_TILE_SIZE_X +
thread_linear_idx % BLOCK_TILE_SIZE_X};
if (C_row_idx < m && C_col_idx < n)
{
C[C_row_idx * ldc + C_col_idx] =
alpha * C_thread_results[thread_tile_row_idx] +
beta * C[C_row_idx * ldc + C_col_idx];
}
}
}
template <typename T>
void launch_gemm_kernel_v03(size_t m, size_t n, size_t k, T const* alpha,
T const* A, size_t lda, T const* B, size_t ldb,
T const* beta, T* C, size_t ldc,
cudaStream_t stream)
{
constexpr unsigned int BLOCK_TILE_SIZE_X{64U};
constexpr unsigned int BLOCK_TILE_SIZE_Y{64U};
constexpr unsigned int BLOCK_TILE_SIZE_K{8U};
constexpr unsigned int THREAD_TILE_SIZE_Y{8U};
constexpr unsigned int NUM_THREADS_PER_BLOCK{
BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y / THREAD_TILE_SIZE_Y};
static_assert(BLOCK_TILE_SIZE_Y % THREAD_TILE_SIZE_Y == 0U);
static_assert(NUM_THREADS_PER_BLOCK % BLOCK_TILE_SIZE_K == 0U);
static_assert(NUM_THREADS_PER_BLOCK % BLOCK_TILE_SIZE_X == 0U);
dim3 const block_dim{NUM_THREADS_PER_BLOCK, 1U, 1U};
dim3 const grid_dim{
(static_cast<unsigned int>(n) + BLOCK_TILE_SIZE_X - 1U) /
BLOCK_TILE_SIZE_X,
(static_cast<unsigned int>(m) + BLOCK_TILE_SIZE_Y - 1U) /
BLOCK_TILE_SIZE_Y,
1U};
gemm_v03<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, BLOCK_TILE_SIZE_K,
THREAD_TILE_SIZE_Y><<<grid_dim, block_dim, 0U, stream>>>(
m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
CHECK_LAST_CUDA_ERROR();
}