从全连接层到卷积
企业级理解卷积
- 不稳定输入 稳定输出 求系统存量 - 信号系统
- 周围像素点如何产生影响 - 图像处理
- 一个像素点如何试探 - 图像识别
好处:
平移不变性和局部性
图像卷积
import torch
from torch import nn
from d2l import torch as d2l
def try_gpu(i=0):  #@save
    """Return the ``cuda:i`` device when it exists, otherwise the CPU device.

    Args:
        i: Index of the requested CUDA device.

    Returns:
        ``torch.device(f'cuda:{i}')`` if at least ``i + 1`` CUDA devices are
        reported by ``torch.cuda.device_count()``, else ``torch.device('cpu')``.
    """
    enough_devices = torch.cuda.device_count() >= i + 1
    if not enough_devices:
        return torch.device('cpu')
    return torch.device(f'cuda:{i}')
def corr2d(X, K):
    """Compute the 2-D cross-correlation of ``X`` with the kernel ``K``.

    Args:
        X: 2-D input tensor of shape ``(H, W)``.
        K: 2-D kernel tensor of shape ``(h, w)``.

    Returns:
        Tensor of shape ``(H - h + 1, W - w + 1)`` whose entry ``[i, j]`` is
        the sum of the element-wise product of ``K`` with the ``h x w`` window
        of ``X`` whose top-left corner is ``(i, j)``. The result lives on the
        same device as ``X``.
    """
    h, w = K.shape
    # Allocate the output on X's device so that GPU inputs yield a GPU result
    # that can be combined with other tensors on that device (e.g. a bias).
    Y = torch.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1), device=X.device)
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y
X = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]], device=try_gpu())
K = torch.tensor([[0.0, 1.0], [2.0, 3.0]], device=try_gpu())
corr2d(X, K)
tensor([[19., 25.],
[37., 43.]])
class Conv2D(nn.Module):
    """Single-channel 2-D convolutional layer implemented with ``corr2d``.

    The layer owns a learnable kernel ``weight`` of shape ``kernel_size`` and
    a learnable scalar ``bias``.
    """

    def __init__(self, kernel_size):
        """Create the kernel (random in [0, 1)) and the zero-initialised bias."""
        super().__init__()
        initial_kernel = torch.rand(kernel_size)
        initial_bias = torch.zeros(1)
        self.weight = nn.Parameter(initial_kernel)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, X):
        """Cross-correlate ``X`` with the kernel and add the bias."""
        correlated = corr2d(X, self.weight)
        return correlated + self.bias
# Build a 2-D convolutional layer with 1 input channel, 1 output channel,
# a kernel of shape (1, 2) and no bias.
conv2d = nn.Conv2d(1, 1, kernel_size=(1, 2), bias=False)

# Training data: a 6x8 image of ones with columns 2..5 zeroed, and the target
# produced by cross-correlating it with the edge-detection kernel [1, -1].
X = torch.ones((6, 8), )
X[:, 2:6] = 0
K = torch.tensor([[1.0, -1.0]])
Y = corr2d(X, K)

# nn.Conv2d works on 4-D input/output (batch size, channels, height, width);
# batch size and channel count are both 1 here.
X = X.reshape((1, 1, 6, 8), )
Y = Y.reshape((1, 1, 6, 7))
lr = 3e-2  # learning rate

for i in range(10):
    Y_hat = conv2d(X)
    l = (Y_hat - Y) ** 2  # element-wise squared error
    conv2d.zero_grad()
    l.sum().backward()
    # Manual gradient-descent step. Updating in place under no_grad keeps the
    # step out of the autograd graph without reaching through `.data`.
    with torch.no_grad():
        conv2d.weight.sub_(lr * conv2d.weight.grad)
    if (i + 1) % 2 == 0:
        print(f'epoch {i + 1}, loss {l.sum():.3f}')
# Notebook cell result: display the learned kernel.
conv2d.weight
epoch 2, loss 9.937
epoch 4, loss 2.727
epoch 6, loss 0.892
epoch 8, loss 0.327
epoch 10, loss 0.128
Parameter containing:
tensor([[ 1.0240, -0.9515]], requires_grad=True)
卷积操作转化为矩阵乘法
kernels-to-compute-second-order-derivative-of-digital-image
常用图像卷积核小结
因为多层堆叠的3*3卷积(例如三层)与单层7*7卷积的感受野一样大,但是3*3的参数更少(3×9=27 对 49)
填充和步幅
import torch
from torch import nn
def comp_conv2d(conv2d, X):
    """Apply ``conv2d`` to a 2-D tensor and return the result without batch dims.

    Args:
        conv2d: Callable layer that accepts a 4-D tensor
            ``(batch, channels, height, width)``.
        X: 2-D input tensor of shape ``(height, width)``.

    Returns:
        The layer output with the leading batch dimension removed and, when
        the layer has a single output channel, the channel dimension removed
        as well. Height and width are always kept, even when they equal 1.
    """
    # Prepend batch and channel dimensions of size 1.
    X = X.reshape((1, 1) + X.shape)
    Y = conv2d(X)
    # Squeeze only the two leading dims. A bare torch.squeeze(Y) would also
    # collapse a genuine size-1 height or width and misreport the output shape.
    return Y.squeeze(0).squeeze(0)
conv2d = nn.Conv2d(1, 1, kernel_size=3, padding=1)
X = torch.rand(size=(8, 8))
comp_conv2d(conv2d, X).shape
torch.Size([8, 8])
conv2d = nn.Conv2d(1, 1, kernel_size=(5, 3), padding=(2, 1))
comp_conv2d(conv2d, X).shape
torch.Size([8, 8])
Z = torch.rand(size=(8, 8))
conv2d = nn.Conv2d(1, 1, kernel_size=3, padding=1, stride=2)
comp_conv2d(conv2d, Z).shape
torch.Size([4, 4])
conv2d = nn.Conv2d(1, 1, kernel_size=(3, 5), padding=(0, 1), stride=(3, 4))
comp_conv2d(conv2d, X).shape
torch.Size([2, 2])
对于音频信号,步幅2说明什么?
答:步幅为2相当于每隔一个样本取一次,即把采样率减半;根据奈奎斯特采样定理,采样率至少要达到信号最高频率的2倍,所以降采样后能无失真表示的最高频率也随之减半
一般情况:
- 填充 - 为了保证图像大小不变
- 步幅 - 减少计算复杂度
- 核大小(最关键)- 一般套用经典网络(如ResNet)
多输入多输出通道
import torch
from d2l import torch as d2l
def corr2d_multi_in(X, K):
    """Cross-correlate a multi-channel input with a multi-channel kernel.

    ``X`` and ``K`` are iterated in lockstep along dimension 0 (the channel
    dimension); each channel pair is cross-correlated with ``d2l.corr2d`` and
    the per-channel results are added together.
    """
    total = 0
    for x, k in zip(X, K):
        total = total + d2l.corr2d(x, k)
    return total
X = torch.tensor([[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]],
[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]])
K = torch.tensor([[[0.0, 1.0], [2.0, 3.0]], [[1.0, 2.0], [3.0, 4.0]]])
corr2d_multi_in(X, K)
tensor([[ 56., 72.],
[104., 120.]])
def corr2d_multi_in_out(X, K):
    """Cross-correlate ``X`` with every output-channel kernel in ``K``.

    Dimension 0 of ``K`` indexes the output channels. Each slice is applied to
    the whole input with ``corr2d_multi_in`` and the results are stacked along
    a new leading dimension.
    """
    per_output_channel = []
    for k in K:
        per_output_channel.append(corr2d_multi_in(X, k))
    return torch.stack(per_output_channel, dim=0)
K = torch.stack((K, K + 1, K + 2), 0)
K.shape, corr2d_multi_in_out(X, K).shape
(torch.Size([3, 2, 2, 2]), torch.Size([3, 2, 2]))
多输出通道利用不同的卷积核学到不同的特征
多输入通道:各输入通道的卷积核分别识别模式,再把结果相加,从而组合输入中的模式
这里bias的形状感觉不太对:我认为偏置的个数应为输出通道数Co,因为多个输入通道的结果会先相加成一个输出通道,每个输出通道只需要一个偏置
3D卷积-分析视频
def conv2d_by_mul(X, K):
    """Compute the 2-D cross-correlation of ``X`` and ``K`` as a matrix product.

    Every ``h x w`` window of ``X`` is flattened into one row of a patch
    matrix, the kernel is flattened into a column vector, and their product is
    reshaped to the ``(H - h + 1, W - w + 1)`` output.
    """
    kernel_h, kernel_w = K.shape
    out_h = X.shape[0] - kernel_h + 1
    out_w = X.shape[1] - kernel_w + 1
    kernel_column = K.reshape(-1, 1)
    # One flattened window per output position, in row-major order.
    windows = [
        X[row:row + kernel_h, col:col + kernel_w].reshape(-1)
        for row in range(out_h)
        for col in range(out_w)
    ]
    patch_matrix = torch.stack(windows, 0)
    # The cross-correlation expressed as a single matrix multiplication.
    return (patch_matrix @ kernel_column).reshape(out_h, out_w)
汇聚层(Pooling)-池化
它具有双重目的:降低卷积层对位置的敏感性,同时对表示进行空间降采样。
import torch
from torch import nn
from d2l import torch as d2l
def pool2d(X, pool_size, mode="max"):
    """Apply 2-D pooling with stride 1 and no padding.

    Args:
        X: 2-D input tensor of shape ``(H, W)``.
        pool_size: Pair ``(p_h, p_w)`` giving the pooling window size.
        mode: ``"max"`` for max pooling or ``"avg"`` for average pooling.

    Returns:
        Tensor of shape ``(H - p_h + 1, W - p_w + 1)`` on the same device as
        ``X``; entry ``[i, j]`` is the maximum or mean of the window whose
        top-left corner is ``(i, j)``.

    Raises:
        ValueError: If ``mode`` is neither ``"max"`` nor ``"avg"``.
    """
    # Reject unknown modes up front; otherwise neither branch would run and
    # the function would silently return an all-zero tensor.
    if mode not in ("max", "avg"):
        raise ValueError(f"mode must be 'max' or 'avg', got {mode!r}")
    p_h, p_w = pool_size
    Y = torch.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1), device=X.device)
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            window = X[i:i + p_h, j:j + p_w]
            Y[i, j] = window.max() if mode == "max" else window.mean()
    return Y
X = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
pool2d(X, (2, 2)), pool2d(X, (2, 2), 'avg')
(tensor([[4., 5.],
[7., 8.]]),
tensor([[2., 3.],
[5., 6.]]))
# 默认情况下,深度学习框架中的步幅与汇聚窗口的大小相同。
X = torch.arange(16, dtype=torch.float32).reshape((1, 1, 4, 4))
pool2d = nn.MaxPool2d(3)
pool2d(X)
tensor([[[[10.]]]])
pool2d = nn.MaxPool2d(3, padding=1, stride=2)
pool2d(X)
tensor([[[[ 5., 7.],
[13., 15.]]]])
pool2d = nn.MaxPool2d((2, 3), stride=(2, 3), padding=(0, 1))
pool2d(X)
tensor([[[[ 5., 7.],
[13., 15.]]]])
X = torch.cat((X, X + 1), 1)
X.shape, nn.MaxPool2d(3, padding=1, stride=2)(X).shape
(torch.Size([1, 4, 4, 4]), torch.Size([1, 4, 2, 2]))
最小池化层:经过ReLU激活后最小值往往就是0,意义不大;如确有需要,可以先把输入乘以-1做最大池化,再把结果乘以-1还原,即可得到最小池化的效果
卷积神经网络(LeNet)
import torch
from torch import nn
from d2l import torch as d2l
import numpy as np
# LeNet-style network for 1x28x28 inputs; the shape comments give the
# per-sample output shape (channels * height * width) after each stage.
# NOTE(review): the recorded layer summary further down lists Sigmoid after
# the first convolution, whereas this definition uses ReLU - confirm which
# activation is intended.
net = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),
    nn.ReLU(),
    # 6 * 28 * 28
    nn.AvgPool2d(2, stride=2),
    # 6 * 14 * 14
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.Sigmoid(),
    # 16 * 10 * 10
    nn.AvgPool2d(2, stride=2),
    # 16 * 5 * 5
    nn.Flatten(),
    nn.Linear(in_features=16 * 5 * 5, out_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
def evaluate_accuracy_gpu(net, data_iter, device=None):  #@save
    """Compute the accuracy of ``net`` over ``data_iter`` on ``device``.

    Args:
        net: Model to evaluate. An ``nn.Module`` is switched to evaluation
            mode first.
        data_iter: Iterable yielding ``(X, y)`` batches; ``X`` is either a
            tensor or a list of tensors.
        device: Device to move each batch to. When falsy and ``net`` is an
            ``nn.Module``, the device of the module's first parameter is used.

    Returns:
        Number of correct predictions divided by the number of labels seen.
    """
    if isinstance(net, nn.Module):
        net.eval()  # evaluation mode
        if not device:
            device = next(iter(net.parameters())).device
    # Running totals: (correct predictions, predictions made).
    totals = d2l.Accumulator(2)
    with torch.no_grad():
        for X, y in data_iter:
            # List inputs are needed for BERT fine-tuning (covered later).
            X = [x.to(device) for x in X] if isinstance(X, list) else X.to(device)
            y = y.to(device)
            totals.add(d2l.accuracy(net(X), y), y.numel())
    return totals[0] / totals[1]
def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """Train ``net`` on ``device`` with SGD and cross-entropy loss (chapter 6).

    Weights of ``nn.Linear`` and ``nn.Conv2d`` layers are re-initialised with
    Xavier-uniform before training. Running training loss/accuracy and the
    per-epoch test accuracy are recorded with ``d2l.Animator``, and a summary
    is printed at the end.

    Args:
        net: Model to train; it is moved to ``device``.
        train_iter: Sized iterable yielding ``(X, y)`` training batches.
        test_iter: Iterable of test batches passed to ``evaluate_accuracy_gpu``.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for ``torch.optim.SGD``.
        device: Device to train on.
    """
    def init_weights(m):
        # Exact type match, so subclasses of Linear/Conv2d are left untouched.
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)  # move the model to the target device
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    # Record training metrics about five times per epoch. Clamp to at least 1
    # so that fewer than 5 batches does not cause a modulo-by-zero below.
    log_every = max(1, num_batches // 5)
    for epoch in range(num_epochs):
        # Sum of training loss, number of correct predictions, number of samples.
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)  # move the batch to the device
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                # The loss is a batch mean; scale by batch size to accumulate a sum.
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % log_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
# Push one random 1x1x28x28 sample through the network layer by layer and
# print the output shape after each layer.
X = torch.rand(size=(1, 1, 28, 28), dtype=torch.float32)
for layer in net:
    X = layer(X)
    layer_name = type(layer).__name__
    print(layer_name, 'output shape: \t', X.shape)
Conv2d output shape: torch.Size([1, 6, 28, 28])
Sigmoid output shape: torch.Size([1, 6, 28, 28])
AvgPool2d output shape: torch.Size([1, 6, 14, 14])
Conv2d output shape: torch.Size([1, 16, 10, 10])
Sigmoid output shape: torch.Size([1, 16, 10, 10])
AvgPool2d output shape: torch.Size([1, 16, 5, 5])
Flatten output shape: torch.Size([1, 400])
Linear output shape: torch.Size([1, 120])
Sigmoid output shape: torch.Size([1, 120])
Linear output shape: torch.Size([1, 84])
Sigmoid output shape: torch.Size([1, 84])
Linear output shape: torch.Size([1, 10])
lr, num_epochs = 0.9, 10
train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
loss 0.384, train acc 0.857, test acc 0.827
49027.0 examples/sec on cuda:0
通道数目慢慢增加-匹配的模式变多了
CNN Explainer-可视化卷积
可用他设计网络,因为pytorch需要自己算,不方便