【Transformer从零开始代码实现 pytoch版】（五）总架构类的实现

news2025/2/19 7:18:45

Transformer总架构

在这里插入图片描述
在实现完输入部分、编码器、解码器和输出部分之后，就可以封装各个部件为一个完整的实体类了。

【Transformer从零开始代码实现 pytoch版】（一）输入部件：embedding+positionalEncoding

【Transformer从零开始代码实现 pytoch版】（二）Encoder编码器组件：mask + attention + feed forward + add&norm

【Transformer从零开始代码实现 pytoch版】（三）Decoder编码器组件：多头自注意力+多头注意力+全连接层+规范化层

【Transformer从零开始代码实现 pytoch版】（四）输出部件：Linear+softmax

编码器-解码器总结构代码实现

class EncoderDecoder(nn.Module):
    """ 编码器解码器架构实现、定义了初始化、forward、encode和decode部件
    """
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        """ 传入五大部件参数

        :param encoder: 编码器
        :param decoder: 解码器
        :param source_embed: 源数据embedding函数
        :param target_embed: 目标数据embedding函数
        :param generator: 输出部分类被生成器对象
        """
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = source_embed
        self.tgt_embed = target_embed
        self.generator = generator					# 生成器后面会专门用到

    def forward(self, source, target, source_mask, target_mask):
        """ 构建数据流入流出

        :param source: 源数据
        :param target: 目标数据
        :param source_mask: 源数据掩码张量
        :param target_mask: 目标数据掩码张量
        :return:
        """
        # 注意这里先用的encode和decode函数，又才在其函数里面，再用了encoder和decoder
        return self.decode(self.encode(source, source_mask), source_mask, target, target_mask)

    def encode(self, source, source_mask):
        """ 编码函数，编码部件

        :param source: 源数据张量
        :param source_mask: 源数据的掩码张量
        :return: 经过解码器的输出
        """
        return self.encoder(self.src_embed(source), source_mask)

    def decode(self, memory, source_mask, target, target_mask):
        """ 解码函数，解码部件

        :param memory:编码器的输出QV
        :param source_mask:源数据的掩码张量
        :param target:目标数据
        :param target_mask:目标数据的掩码张量
        :return:
        """
        return self.decoder(self.tgt_embed(target), memory, source_mask, target_mask)

示例

# 输入参数
vocab_size = 1000
size = d_model = 512

# 编码器部分
dropout = 0.2
d_ff = 64				# 隐藏层参数
head = 8				# 注意力头数
c = copy.deepcopy
attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
encoder_layer = EncoderLayer(size, c(attn), c(ff), dropout)
encoder_N = 8
encoder = Encoder(encoder_layer, encoder_N)

# 解码器部分
dropout = 0.2
d_ff = 64
head = 8
c = copy.deepcopy
attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
decoder_layer = DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout)
decoder_N = 8
decoder = Decoder(decoder_layer, decoder_N)

# 用了nn的embedding作为输入示意
source_embed = nn.Embedding(vocab_size, d_model)
target_embed = nn.Embedding(vocab_size, d_model)
generator = Generator(d_model, vocab_size)

# 输入张量和掩码张量
source = target = torch.LongTensor([[100, 2, 421, 508], [491, 998, 1, 221]])
source_mask = target_mask = torch.zeros(2, 4, 4)

# 实例化编码器-解码器，再带入参数实现
ed = EncoderDecoder(encoder, decoder, source_embed, target_embed, generator)
ed_res = ed(source, target, source_mask, target_mask)
print(f"ed_res: {ed_res}\n shape:{ed_res.shape}")


ed_res: tensor([[[-0.1861,  0.0849, -0.3015,  ...,  1.1753, -1.4933,  0.2484],
         [-0.3626,  1.3383,  0.1739,  ...,  1.1304,  2.0266, -0.5929],
         [ 0.0785,  1.4932,  0.3184,  ..., -0.2021, -0.2330,  0.1539],
         [-0.9703,  1.1944,  0.1763,  ...,  0.1586, -0.6066, -0.6147]],
        [[-0.9216, -0.0309, -0.6490,  ...,  1.0177,  0.5574,  0.4873],
         [-1.4097,  0.6678, -0.6708,  ...,  1.1176,  0.1959, -1.2494],
         [-0.3204,  1.2794, -0.4022,  ...,  0.6319, -0.4709,  1.0520],
         [-1.3238,  1.1470, -0.9943,  ...,  0.4026,  1.0911,  0.1327]]],
       grad_fn=<AddBackward0>)
 shape:torch.Size([2, 4, 512])

编码器-解码器模型构建函数

def make_model(source_vocab, target_vocab, N=6, d_model=512, d_ff=2048, head=8, dropout=0.1):
    """ 用于构建模型

    :param source_vocab: 源数据词汇总数
    :param target_vocab: 目标词汇总数
    :param N: 解码器/解码器堆叠层数
    :param d_model: 词嵌入维度
    :param d_ff: 前馈全连接层隐藏层维度
    :param dropout: 置0比率
    :return: 返回构建编码器-解码器模型
    """
    # 拷贝函数，来保证拷贝的函数彼此之间相互独立，不受干扰
    c = copy.deepcopy

    # 实例化多头注意力
    attn = MultiHeadedAttention(head, d_model)

    # 实例化全连接层
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)

    # 实例化位置编码类，得到对象position
    position = PositionalEncoding(d_model, dropout)

    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embedding(d_model, source_vocab), c(position)),
        nn.Sequential(Embedding(d_model, source_vocab), c(position)),
        Generator(d_model, target_vocab)
    )

    # 模型结构构建完成后，初始化模型中的参数
    for p in model.parameters():
        # 这里判定当参数维度大于1的时候，则会将其初始化成一个服从均匀分布的矩阵
        if p.dim() > 1:
            nn.init.xavier_normal(p)        # 生成服从正态分布的数，默认为U(-1, 1)，更改第二个参数可以改值

    return model

示例

source_vocab = target_vocab = 11
N = 6
res = make_model(source_vocab, target_vocab, N)
print(res)


EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w1): Linear(in_features=512, out_features=2048, bias=True)
          (w2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0-1): 2 x SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (norm): LayerNorm()
  )
  (decoder): Decoder(
    (layers): ModuleList(
      (0-5): 6 x DecoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (src_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w1): Linear(in_features=512, out_features=2048, bias=True)
          (w2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0-2): 3 x SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (norm): LayerNorm()
  )
  (src_embed): Sequential(
    (0): Embedding(
      (lut): Embedding(512, 11)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (tgt_embed): Sequential(
    (0): Embedding(
      (lut): Embedding(512, 11)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (generator): Generator(
    (project): Linear(in_features=512, out_features=11, bias=True)
  )
)