MoE 混合专家模型

news2025/4/19 15:41:40

MoE特点

MoE 通过多个专家对输入做特征增广；门控机制为每个样本给各个专家分配权重，在增广特征的同时有助于缓解模型过拟合。

大模型中会使用到MoE技术。

简单实现

实现一个简单的线性混合专家模型，其中每个专家都是一个线性层（Linear）。

import torch
import torch.nn as nn
import torch.nn.functional as F

# 一个专家
class Linear(nn.Module):
  """A single expert: a thin wrapper around one fully connected layer."""

  def __init__(self, in_features: int, out_features: int) -> None:
    """Create the wrapped layer mapping ``in_features`` to ``out_features``."""
    super(Linear, self).__init__()
    # Stored as ``fc`` so the parameters are exposed as fc.weight / fc.bias.
    self.fc = nn.Linear(in_features=in_features, out_features=out_features)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Apply the wrapped linear layer to ``x`` and return the result."""
    projected = self.fc(x)
    return projected

class MoELayer(nn.Module):
  """Dense mixture-of-experts layer built from linear experts.

  Every expert processes every input. A softmax gate computed from the same
  input yields one weight per expert, and the layer returns the weighted sum
  of the expert outputs.

  Args:
    num_experts: number of experts; must be at least 1.
    in_features: size of the last dimension of the input.
    out_features: size of the last dimension of the output.

  Raises:
    ValueError: if ``num_experts`` is smaller than 1.
  """

  def __init__(self, num_experts: int, in_features: int, out_features: int) -> None:
    super().__init__()
    # Fail early: with no experts, forward() would only fail later when
    # stacking an empty list of expert outputs.
    if num_experts < 1:
      raise ValueError(f"num_experts must be >= 1, got {num_experts}")
    self.num_experts = num_experts
    # The experts: independent linear maps in_features -> out_features.
    self.experts = nn.ModuleList([Linear(in_features, out_features) for _ in range(self.num_experts)])
    # Gate: produces one logit per expert for every input sample.
    self.gate = nn.Linear(in_features, num_experts)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Mix the expert outputs using the gate's softmax weights.

    Args:
      x: tensor of shape ``(..., in_features)``; any number of leading
        dimensions is accepted (e.g. ``(batch, in_features)`` or
        ``(batch, seq, in_features)``).

    Returns:
      Tensor of shape ``(..., out_features)``.
    """
    # Run every expert and stack along a new second-to-last axis:
    # (..., num_experts, out_features). Using dim=-2 instead of dim=1 keeps
    # the expert axis next to the feature axis for any input rank.
    expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-2)
    # Per-sample expert probabilities, summing to 1: (..., num_experts).
    weights = F.softmax(self.gate(x), dim=-1)
    # Weighted sum over experts as a batched matrix product:
    # (..., 1, num_experts) @ (..., num_experts, out_features) -> (..., 1, out_features).
    mixed = torch.matmul(weights.unsqueeze(-2), expert_outputs)
    return mixed.squeeze(-2)


# Demo configuration: 10 samples of 15 features each, mapped to 5 outputs
# by a mixture of 3 experts.
batch_size, in_fea, out_fea, num_experts = 10, 15, 5, 3

# Build the layer first, then draw the random input batch.
model = MoELayer(num_experts=num_experts, in_features=in_fea, out_features=out_fea)
x = torch.rand(batch_size, in_fea)  # shape: (batch_size, in_fea)

out = model(x)
print(out.shape)  # expected: torch.Size([10, 5])