# Define a simple embedding layer
class EmbeddingLayer(nn.Module):
    """Map integer token ids to dense vectors through an ``nn.Embedding`` table.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size: int, embed_dim: int) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the embedding of every id in ``x``.

        The result has the shape of ``x`` with one trailing axis of size
        ``embed_dim`` appended.
        """
        return self.embedding(x)
# Example hyperparameters for this snippet (the original used both names
# without defining them first).
vocab_size = 10000  # vocabulary size
embed_dim = 256  # embedding dimension
# Instantiate the embedding layer that is called below
embedding_layer = EmbeddingLayer(vocab_size, embed_dim)
# Randomly generate an input sequence of token ids
input_seq = torch.randint(0, vocab_size, (32, 50))  # (batch_size, seq_len)
# Look up the input representation
input_repr = embedding_layer(input_seq)  # (batch_size, seq_len, embed_dim)
class Attention(nn.Module):
    """Compute scaled dot-product self-attention weights.

    Only the attention weights are returned; applying them to a value tensor
    is left to the caller.

    Args:
        embed_dim: Feature size of the input and of the query/key projections.
    """

    def __init__(self, embed_dim: int) -> None:
        super().__init__()
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        # Not used by forward(); kept so the module's parameters are unchanged.
        self.value = nn.Linear(embed_dim, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``softmax(Q K^T / sqrt(d))`` taken over the last axis.

        Args:
            x: Tensor whose last two axes are ``(seq_len, embed_dim)``.

        Returns:
            Attention weights whose last two axes are ``(seq_len, seq_len)``;
            every row sums to 1.
        """
        Q = self.query(x)
        K = self.key(x)
        # Scale by the projected feature size rather than by a module-level
        # ``embed_dim`` global, which may be undefined or hold another value.
        scale = Q.size(-1) ** 0.5
        attention_scores = torch.matmul(Q, K.transpose(-1, -2)) / scale
        attention_weights = F.softmax(attention_scores, dim=-1)
        return attention_weights
def weighted_sum(attention_weights: torch.Tensor, input_repr: torch.Tensor) -> torch.Tensor:
    """Combine ``input_repr`` using ``attention_weights``.

    This is the matrix product ``attention_weights @ input_repr`` with
    ``torch.matmul`` broadcasting rules.

    Args:
        attention_weights: Weights whose last axis matches the second-to-last
            axis of ``input_repr``.
        input_repr: Representations to be mixed.

    Returns:
        The weighted combination of the rows of ``input_repr``.
    """
    return torch.matmul(attention_weights, input_repr)
class OutputLayer(nn.Module):
    """Project features to the output space with one linear layer.

    Args:
        embed_dim: Size of the last axis of the input.
        output_dim: Size of the last axis of the output.
    """

    def __init__(self, embed_dim: int, output_dim: int) -> None:
        super().__init__()
        self.fc = nn.Linear(embed_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear projection to the last axis of ``x``."""
        return self.fc(x)
import torch
import torch.nn as nn
import torch.nn.functional as F
class TransformerBlock(nn.Module):
    """One post-norm Transformer encoder block: self-attention, then feed-forward.

    Args:
        embed_dim: Feature size of the input and output.
        num_heads: Number of attention heads passed to ``nn.MultiheadAttention``.
        dropout: Dropout probability used inside the attention module and on
            both residual branches.
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.1) -> None:
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            # Without a non-linearity the two Linear layers would collapse
            # into a single linear map.
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the block on ``x`` and return a tensor of the same shape.

        Args:
            x: Input of shape ``(seq_len, batch_size, embed_dim)``; the
                attention module is built with its default sequence-first
                layout.
        """
        # Self-attention: query, key and value are all x
        attn_output, _ = self.attn(x, x, x)
        attn_output = self.dropout(attn_output)
        x = self.norm1(x + attn_output)  # residual connection + layer norm
        # Position-wise feed-forward network
        ffn_output = self.ffn(x)
        ffn_output = self.dropout(ffn_output)
        x = self.norm2(x + ffn_output)  # residual connection + layer norm
        return x
class TextTransformer(nn.Module):
    """Token-level Transformer encoder with a prediction head on the last position.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        embed_dim: Embedding / model dimension.
        num_heads: Attention heads per ``TransformerBlock``.
        num_layers: Number of stacked ``TransformerBlock`` layers.
        dropout: Dropout probability forwarded to every block.
        max_len: Longest sequence the learned positional table supports.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, dropout=0.1, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # One learned vector per position. A (1, 1, embed_dim) table would add
        # the same vector everywhere and carry no positional information.
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_dim))
        self.encoder = nn.Sequential(
            *[TransformerBlock(embed_dim, num_heads, dropout) for _ in range(num_layers)]
        )
        self.fc_out = nn.Linear(embed_dim, vocab_size)  # treated as a classification head

    def forward(self, x):
        """Return logits of shape ``(batch_size, vocab_size)``.

        Args:
            x: Integer token ids of shape ``(batch_size, seq_len)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional table length.
        """
        seq_len = x.size(1)
        max_len = self.positional_encoding.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        # Input representation
        embeds = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        embeds = embeds + self.positional_encoding[:, :seq_len, :]  # add positions
        embeds = embeds.transpose(0, 1)  # (seq_len, batch_size, embed_dim)
        # Attention and feed-forward layers
        out = self.encoder(embeds)
        # Output head
        out = out.transpose(0, 1)  # (batch_size, seq_len, embed_dim)
        out = self.fc_out(out[:, -1, :])  # classify from the last position only
        return out
# Model hyperparameters
vocab_size = 10000 # vocabulary size
embed_dim = 256 # embedding dimension
num_heads = 8 # number of attention heads
num_layers = 6 # number of Transformer layers
# Instantiate the model
model = TextTransformer(vocab_size, embed_dim, num_heads, num_layers)
# Randomly generate an input sequence of token ids
input_seq = torch.randint(0, vocab_size, (32, 100)) # (batch_size, seq_len)
# Forward pass
output = model(input_seq)
print(output.shape) # expected: (batch_size, vocab_size)
1. Soft Attention
这种类型的注意力机制会输出一个概率分布:每个输入元素都有一个对应的权重,这些权重的和为1。Soft attention通常是可微分的,因此可以直接通过梯度下降进行优化。
import torch
import torch.nn as nn
import torch.nn.functional as F
class SoftAttention(nn.Module):
    """Score each sequence element with a learned vector and normalise with softmax.

    Args:
        embed_dim: Feature size of every sequence element.
    """

    def __init__(self, embed_dim: int) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.randn(embed_dim, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return attention weights over the sequence axis.

        Args:
            x: Input of shape ``(batch_size, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len)``; each row sums to 1.
        """
        scores = torch.matmul(x, self.weight).squeeze(-1)  # (batch_size, seq_len)
        weights = F.softmax(scores, dim=-1)  # normalise scores into probabilities
        return weights
# Example usage
embed_dim = 128
soft_attn = SoftAttention(embed_dim)
input_seq = torch.randn(32, 50, embed_dim) # (batch_size, seq_len, embed_dim)
attention_weights = soft_attn(input_seq)
print("Soft Attention Weights:", attention_weights.sum(dim=1)) # each entry should be close to 1
2. Hard Attention
与soft attention不同,hard attention会随机或确定性地选择一个输入元素,并只关注这个元素。Hard attention通常不可微分,因此训练时可能需要使用强化学习或变分方法。下面的示例使用一个简单的均匀随机采样策略来选择输入元素。
import torch
class HardAttention(nn.Module):
    """Pick one sequence element per batch row uniformly at random.

    The module has no learnable parameters.

    Args:
        embed_dim: Accepted for interface symmetry with the other attention
            modules; not used.
    """

    def __init__(self, embed_dim: int) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one randomly selected element for each batch row.

        Args:
            x: Input of shape ``(batch_size, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch_size, embed_dim)``.
        """
        batch_size, seq_len = x.size(0), x.size(1)
        # Draw the index directly; taking the top-1 of uniform noise gives the
        # same uniform choice with more work.
        idx = torch.randint(0, seq_len, (batch_size,), device=x.device)
        rows = torch.arange(batch_size, device=x.device)
        return x[rows, idx]
# Example usage (reuses embed_dim and input_seq assigned earlier in the file)
hard_attn = HardAttention(embed_dim)
selected_elements = hard_attn(input_seq)
print("Hard Attention Selected Elements:", selected_elements.shape) # (batch_size, embed_dim)
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        embed_dim: Feature size of the input and of the Q/K/V projections.
    """

    def __init__(self, embed_dim: int) -> None:
        super().__init__()
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)

    def forward(self, x: torch.Tensor):
        """Attend every position of ``x`` to every other position.

        Args:
            x: Input of shape ``(batch_size, seq_len, embed_dim)``.

        Returns:
            A tuple ``(output, attention_weights)`` with shapes
            ``(batch_size, seq_len, embed_dim)`` and
            ``(batch_size, seq_len, seq_len)``.
        """
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)
        # Scale by the projected feature size rather than by a module-level
        # ``embed_dim`` global, which may hold a different value.
        scale = Q.size(-1) ** 0.5
        attention_scores = torch.matmul(Q, K.transpose(-1, -2)) / scale
        attention_weights = F.softmax(attention_scores, dim=-1)
        output = torch.matmul(attention_weights, V)
        return output, attention_weights
# Example usage (reuses embed_dim and input_seq assigned earlier in the file)
self_attn = SelfAttention(embed_dim)
output, weights = self_attn(input_seq)
print("Self Attention Output:", output.shape) # (batch_size, seq_len, embed_dim)
4. Multi-Head Attention
在Transformer模型中,为了捕捉不同子空间中的信息,会使用多头注意力机制,即并行地运行多个自注意力机制,然后将各个头的结果拼接合并。
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: Feature size of the input and output.
        num_heads: Number of heads; must divide ``embed_dim`` evenly.

    Raises:
        AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim: int, num_heads: int) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.fc_out = nn.Linear(embed_dim, embed_dim)

    def _split_heads(self, t: torch.Tensor, batch_size: int, seq_len: int) -> torch.Tensor:
        """Reshape ``(batch, seq, embed)`` into ``(batch, heads, seq, head_dim)``."""
        return t.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply multi-head self-attention.

        Args:
            x: Input of shape ``(batch_size, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, embed_dim)``.
        """
        batch_size, seq_len, embed_dim = x.size()
        Q = self._split_heads(self.query(x), batch_size, seq_len)
        K = self._split_heads(self.key(x), batch_size, seq_len)
        V = self._split_heads(self.value(x), batch_size, seq_len)
        # ``** 0.5`` avoids depending on the ``math`` module, which this file
        # never imports.
        attention_scores = torch.matmul(Q, K.transpose(-1, -2)) / (self.head_dim ** 0.5)
        attention_weights = F.softmax(attention_scores, dim=-1)
        # Bring the heads back next to each other before merging them.
        output = torch.matmul(attention_weights, V).transpose(1, 2).contiguous()
        output = output.view(batch_size, seq_len, embed_dim)
        output = self.fc_out(output)
        return output
# Example usage (reuses embed_dim and input_seq assigned earlier in the file)
num_heads = 8
multi_head_attn = MultiHeadAttention(embed_dim, num_heads)
multi_head_output = multi_head_attn(input_seq)
print("Multi-Head Attention Output:", multi_head_output.shape) # (batch_size, seq_len, embed_dim)
Soft Attention和Self-Attention可以直接用于梯度下降优化,而Hard Attention由于其不可微分的特性,可能需要特殊的训练技巧。Multi-Head Attention则通过并行处理捕捉更丰富的信息。
import torch
import torch.nn as nn
import torch.optim as optim
class Encoder(nn.Module):
    """LSTM encoder that embeds source tokens and returns the final LSTM states.

    Args:
        input_dim: Source vocabulary size.
        emb_dim: Embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return ``(hidden, cell)``.

        Args:
            src: Token ids of shape ``(seq_len, batch_size)``; the LSTM is
                built with its default sequence-first layout.

        Returns:
            The final hidden and cell states, each of shape
            ``(n_layers, batch_size, hid_dim)``. The per-step outputs of the
            LSTM are discarded.
        """
        embedded = self.dropout(self.embedding(src))
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell
class Attention(nn.Module):
    """Additive attention of one decoder state over a sequence of encoder outputs.

    Args:
        enc_hid_dim: Feature size of each encoder output vector.
        dec_hid_dim: Feature size of the decoder hidden state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        # score() concatenates one decoder state with one encoder output, so
        # the projection input is exactly enc_hid_dim + dec_hid_dim wide.
        self.attn = nn.Linear(enc_hid_dim + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: Decoder state of shape ``(batch, dec_hid_dim)``, or a
                stacked state ``(n_layers, batch, dec_hid_dim)``, in which
                case the last layer is used.
            encoder_outputs: Sequence-first tensor of shape
                ``(src_len, batch, enc_hid_dim)``.

        Returns:
            Weights of shape ``(batch, 1, src_len)`` that sum to 1 over
            ``src_len``. The singleton axis lets callers ``bmm`` the result
            with batch-first encoder outputs directly.
        """
        if hidden.dim() == 3:
            hidden = hidden[-1]
        src_len = encoder_outputs.size(0)
        # Repeat the decoder state along a new source axis: (batch, src_len, dec_hid_dim)
        hidden = hidden.unsqueeze(1).expand(-1, src_len, -1)
        encoder_outputs = encoder_outputs.transpose(0, 1)  # (batch, src_len, enc_hid_dim)
        attn_energies = self.score(hidden, encoder_outputs)  # (batch, src_len)
        return F.softmax(attn_energies, dim=-1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        """Return unnormalised scores of shape ``(batch, src_len)``.

        Both arguments are batch-first and share their first two axes.
        """
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], dim=2)))
        energy = self.v(energy).squeeze(2)
        return energy


class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Args:
        output_dim: Target vocabulary size.
        emb_dim: Embedding dimension.
        hid_dim: LSTM hidden size; also the expected encoder output size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The LSTM consumes the embedded token concatenated with the context.
        self.rnn = nn.LSTM(emb_dim + hid_dim, hid_dim, n_layers, dropout=dropout)
        self.attention = Attention(hid_dim, hid_dim)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one target step.

        Args:
            input: Token ids of shape ``(batch,)``; ``(1, batch)`` is also
                accepted.
            hidden: LSTM hidden state ``(n_layers, batch, hid_dim)``.
            cell: LSTM cell state ``(n_layers, batch, hid_dim)``.
            encoder_outputs: ``(src_len, batch, hid_dim)``.

        Returns:
            ``(out, hidden, cell)`` where ``out`` has shape
            ``(batch, output_dim)`` and the states keep their input shapes.
        """
        if input.dim() == 1:
            input = input.unsqueeze(0)  # (1, batch)
        embedded = self.dropout(self.embedding(input))  # (1, batch, emb_dim)
        attn_weights = self.attention(hidden, encoder_outputs)  # (batch, 1, src_len)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (batch, 1, hid_dim)
        context = context.transpose(0, 1)  # (1, batch, hid_dim), matches embedded
        rnn_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        output = output.squeeze(0)
        out = self.fc_out(output)
        return out, hidden, cell
# Assumed hyperparameters
input_dim = 1000  # source vocabulary size
output_dim = 1000  # target vocabulary size
emb_dim = 256  # embedding dimension
hid_dim = 512  # hidden dimension
n_layers = 2  # number of LSTM layers
dropout = 0.1  # dropout probability
# Instantiate the models
encoder = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)
decoder = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout)
# Dummy inputs
src = torch.randint(0, input_dim, (10, 32))  # (seq_len, batch_size)
# Decoder.forward adds the step axis itself, so pass one token id per sequence.
input = torch.randint(0, output_dim, (32,))  # (batch_size,)
# Forward pass
hidden, cell = encoder(src)
# The decoder attends over per-step encoder features, not raw token ids.
# Encoder.forward returns only the final states, so run its layers once more
# to obtain the outputs. With dropout active this pass is sampled separately
# from the one that produced hidden/cell.
encoder_outputs, _ = encoder.rnn(encoder.dropout(encoder.embedding(src)))  # (seq_len, batch_size, hid_dim)
output, hidden, cell = decoder(input, hidden, cell, encoder_outputs)
print("Translation Output:", output.shape)  # (batch_size, output_dim)
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
class AttentionCNN(nn.Module):
    """ResNet-18 feature extractor followed by a sigmoid feature gate and a classifier.

    The backbone's own classification layer is replaced by an identity so the
    network yields pooled features; a learned gate then re-weights each
    feature before the final linear classifier.
    """

    def __init__(self):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.cnn = models.resnet18(pretrained=True)
        feat_dim = self.cnn.fc.in_features  # width of the pooled backbone features
        # Drop the backbone classifier so forward() receives features, not logits.
        self.cnn.fc = nn.Identity()
        # Simple attention layer: one gate value in (0, 1) per feature.
        self.attention = nn.Linear(feat_dim, feat_dim)
        self.fc = nn.Linear(feat_dim, 1000)  # assumes 1000 classes

    def forward(self, x):
        """Return class logits of shape ``(batch_size, 1000)``.

        Args:
            x: Image batch of shape ``(batch_size, channels, height, width)``.
        """
        features = self.cnn(x)  # (batch_size, feat_dim)
        attention_weights = torch.sigmoid(self.attention(features))
        x = features * attention_weights  # element-wise gating keeps the shape
        x = self.fc(x)
        return x
# Instantiate the model
attention_cnn = AttentionCNN()
# Dummy input
input_image = torch.randn(32, 3, 224, 224) # (batch_size, channels, height, width)
# Forward pass
output = attention_cnn(input_image)
print("Image Recognition Output:", output.shape) # (batch_size, num_classes)
class SpeechRecognitionModel(nn.Module):
    """LSTM over acoustic frames, attention pooling, then a linear classifier.

    Args:
        input_dim: Feature size of each input frame.
        emb_dim: LSTM hidden size, which is also the attention feature size.
        hid_dim: Accepted for interface compatibility; not used.
        output_dim: Number of output classes.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, n_layers, dropout):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.rnn = nn.LSTM(input_dim, emb_dim, n_layers, dropout=dropout, batch_first=True)
        self.attention = Attention(emb_dim, emb_dim)
        self.fc_out = nn.Linear(emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)  # kept as an attribute; forward() does not apply it

    def forward(self, x):
        """Return logits of shape ``(batch_size, output_dim)``.

        Args:
            x: Input of shape ``(batch_size, seq_len, input_dim)``.
        """
        outputs, (hidden, _) = self.rnn(x)  # outputs: (batch_size, seq_len, emb_dim)
        # The Attention module in this file transposes its second argument
        # from sequence-first to batch-first, so hand it a sequence-first view.
        attn_weights = self.attention(hidden, outputs.transpose(0, 1))
        if attn_weights.dim() == 2:
            # bmm needs (batch_size, 1, seq_len)
            attn_weights = attn_weights.unsqueeze(1)
        context = torch.bmm(attn_weights, outputs)  # (batch_size, 1, emb_dim)
        output = self.fc_out(context.squeeze(1))
        return output
# Assumed parameters (emb_dim, hid_dim, n_layers and dropout keep the values
# assigned earlier in the file)
input_dim = 128 # feature dimension
output_dim = 1000 # vocabulary size
# Instantiate the model
speech_recognition = SpeechRecognitionModel(input_dim, emb_dim, hid_dim, output_dim, n_layers, dropout)
# Dummy input
speech_signal = torch.randn(32, 100, input_dim) # (batch_size, seq_len, input_dim)
# Forward pass
output = speech_recognition(speech_signal)
print("Speech Recognition Output:", output.shape) # (batch_size, output_dim)