This article demonstrates how adding Dropout regularization to a PyTorch model affects its performance in terms of loss and accuracy.
What is the significance of Dropout regularization in machine learning?
Dropout regularization is a machine-learning technique that randomly drops units (neurons) in a neural network during training, which approximates training many network architectures simultaneously. It is an important tool for reducing the risk of overfitting.
Integrating Dropout into a PyTorch Model
To integrate Dropout regularization into a PyTorch model, use the torch.nn.Dropout class.
It takes a single argument, the dropout rate: the probability that a neuron is switched off (i.e., excluded from that training step).
Dropout can be applied after any non-output layer, for example:
self.dropout = nn.Dropout(0.25)
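To see what this layer actually does, here is a minimal standalone sketch (independent of the model built later in this article). In training mode, nn.Dropout zeroes each element with probability p and scales the survivors by 1/(1-p) so activations keep the same expected value; in evaluation mode it is the identity.

import torch
from torch import nn

drop = nn.Dropout(p=0.25)
x = torch.ones(8)

drop.train()    # training mode: each element is zeroed with probability 0.25,
print(drop(x))  # and survivors are scaled by 1 / (1 - 0.25) = 4/3

drop.eval()     # evaluation mode: dropout is the identity
print(drop(x))  # tensor([1., 1., 1., 1., 1., 1., 1., 1.])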
Observing the Effect of Dropout on Model Performance
To study the effect of Dropout, we will train an image-classification model.
First we train a network that includes Dropout regularization, then the same network without it.
Both models are trained on the CIFAR-10 dataset for 8 epochs.
Step 1: Import the required libraries and dependencies
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
Step 2: Load the dataset and prepare the DataLoaders
BATCH_SIZE = 32

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root="./data", train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root="./data", train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE,
                                         shuffle=False, num_workers=2)

CLASS_NAMES = ("plane", "car", "bird", "cat",
               "deer", "dog", "frog", "horse", "ship", "truck")
Step 3: Load and visualize some of the data
def show_batch(image_batch, label_batch):
    plt.figure(figsize=(10, 10))
    for n in range(25):
        plt.subplot(5, 5, n + 1)
        img = image_batch[n] / 2 + 0.5  # unnormalize
        img = img.numpy()
        plt.imshow(np.transpose(img, (1, 2, 0)))
        plt.title(CLASS_NAMES[label_batch[n]])
        plt.axis("off")

sample_images, sample_labels = next(iter(trainloader))
show_batch(sample_images, sample_labels)
Output: a 5×5 grid of 25 sample CIFAR-10 images, each titled with its class name.
Step 4: Implement the network with Dropout regularization
class Net(nn.Module):
    def __init__(self, input_shape=(3, 32, 32)):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.conv3 = nn.Conv2d(64, 128, 3)
        self.pool = nn.MaxPool2d(2, 2)
        n_size = self._get_conv_output(input_shape)
        self.fc1 = nn.Linear(n_size, 512)
        self.fc2 = nn.Linear(512, 10)
        self.dropout = nn.Dropout(0.25)

    def _get_conv_output(self, shape):
        # Run a dummy forward pass to infer the flattened feature size
        batch_size = 1
        dummy_input = torch.rand(batch_size, *shape)
        output_feat = self._forward_features(dummy_input)
        n_size = output_feat.view(batch_size, -1).size(1)
        return n_size

    def _forward_features(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        return x

    def forward(self, x):
        x = self._forward_features(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
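A quick way to sanity-check the architecture (a minimal sketch, not part of the training code) is a dummy forward pass. Three conv (kernel 3) + pool (stride 2) stages shrink a 32×32 input to 2×2, so _get_conv_output returns 128 * 2 * 2 = 512, and the head maps to 10 class scores.

net = Net()
dummy = torch.rand(4, 3, 32, 32)  # a batch of 4 fake CIFAR-10 images
print(net(dummy).shape)           # torch.Size([4, 10])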
Step 5: Implement the training procedure
def train(model, device, train_loader, optimizer, criterion, epoch):
    # Switch the model to training mode. This is necessary for layers like
    # dropout and batchnorm, which behave differently in training and evaluation mode.
    model.train()

    train_loss = 0
    train_total = 0
    train_correct = 0

    # Loop over the data iterator, feed the inputs to the network, and adjust the weights.
    for data, target in train_loader:
        # Load the input features and labels from the training dataset
        data, target = data.to(device), target.to(device)

        # Reset the gradients to 0 for all learnable weight parameters
        optimizer.zero_grad()

        # Forward pass: predict the class each image belongs to (0-9 in this case)
        output = model(data)

        # Compute the loss
        loss = criterion(output, target)
        train_loss += loss.item()

        _, predictions = torch.max(output, 1)
        train_total += target.size(0)
        train_correct += (predictions == target).sum().item()

        # Backward pass: compute the gradients of the loss w.r.t. the model's parameters
        loss.backward()

        # Update the neural network weights
        optimizer.step()

    acc = round((train_correct / train_total) * 100, 2)
    print("Epoch [{}], Loss: {}, Accuracy: {}".format(epoch, train_loss / train_total, acc), end="")
Step 6: Implement the test function
def test(model, device, test_loader, criterion, classes):
    # Switch the model to evaluation mode. This is necessary for layers like
    # dropout and batchnorm, which behave differently in training and evaluation mode.
    model.eval()

    test_loss = 0
    test_total = 0
    test_correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            # Load the input features and labels from the test dataset
            data, target = data.to(device), target.to(device)

            # Predict the class each image belongs to (0-9 in this case)
            output = model(data)

            # Sum up the per-batch losses
            test_loss += criterion(output, target).item()

            _, predictions = torch.max(output, 1)
            test_total += target.size(0)
            test_correct += (predictions == target).sum().item()

    acc = round((test_correct / test_total) * 100, 2)
    print(" Test_loss: {}, Test_accuracy: {}".format(test_loss / test_total, acc))
Step 7: Initialize the network, the loss function, and the optimizer
net = Net().to(device)
print(net)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())
Output:
Net(
(conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
(conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
(conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
(pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(fc1): Linear(in_features=512, out_features=512, bias=True)
(fc2): Linear(in_features=512, out_features=10, bias=True)
(dropout): Dropout(p=0.25, inplace=False)
)
Step 8: Start training
for epoch in range(8):
    train(net, device, trainloader, optimizer, criterion, epoch)
    test(net, device, testloader, criterion, CLASS_NAMES)

print("Finished Training")
Output:
Epoch [0], Loss: 0.0461193907892704, Accuracy: 45.45 Test_loss: 0.036131812924146654, Test_accuracy: 58.58
Epoch [1], Loss: 0.03446852257728577, Accuracy: 60.85 Test_loss: 0.03089196290373802, Test_accuracy: 65.27
Epoch [2], Loss: 0.029333480607271194, Accuracy: 66.83 Test_loss: 0.027052838513255118, Test_accuracy: 70.41
Epoch [3], Loss: 0.02650276515007019, Accuracy: 70.21 Test_loss: 0.02630699208676815, Test_accuracy: 70.99
Epoch [4], Loss: 0.024451716771125794, Accuracy: 72.41 Test_loss: 0.024404651895165445, Test_accuracy: 73.03
Epoch [5], Loss: 0.022718262011408807, Accuracy: 74.35 Test_loss: 0.023125074282288553, Test_accuracy: 74.86
Epoch [6], Loss: 0.021408387248516084, Accuracy: 75.76 Test_loss: 0.023151200053095816, Test_accuracy: 74.43
Epoch [7], Loss: 0.02033562403023243, Accuracy: 76.91 Test_loss: 0.023537022879719736, Test_accuracy: 73.93
Finished Training
The Network Without Dropout Regularization
Next, we build the same network, but without the Dropout layers.
We train it on the same dataset for the same number of epochs, reusing the training and test functions above, and evaluate how Dropout affects network performance.
Step 1: Implement the network
class Net(nn.Module):
    def __init__(self, input_shape=(3, 32, 32)):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.conv3 = nn.Conv2d(64, 128, 3)
        self.pool = nn.MaxPool2d(2, 2)
        n_size = self._get_conv_output(input_shape)
        self.fc1 = nn.Linear(n_size, 512)
        self.fc2 = nn.Linear(512, 10)

    def _get_conv_output(self, shape):
        # Run a dummy forward pass to infer the flattened feature size
        batch_size = 1
        dummy_input = torch.rand(batch_size, *shape)
        output_feat = self._forward_features(dummy_input)
        n_size = output_feat.view(batch_size, -1).size(1)
        return n_size

    def _forward_features(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        return x

    def forward(self, x):
        x = self._forward_features(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
Step 2: Initialize the loss function and the optimizer
net = Net().to(device)
print(net)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())
Output:
Net(
(conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
(conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
(conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
(pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(fc1): Linear(in_features=512, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)
Step 3: Start training
for epoch in range(8):
    train(net, device, trainloader, optimizer, criterion, epoch)
    test(net, device, testloader, criterion, CLASS_NAMES)

print("Finished Training")
Output:
Epoch [0], Loss: 0.04425342482566833, Accuracy: 48.29 Test_loss: 0.03506121407747269, Test_accuracy: 59.93
Epoch [1], Loss: 0.0317487561249733, Accuracy: 63.97 Test_loss: 0.029791217082738877, Test_accuracy: 66.4
Epoch [2], Loss: 0.026000032302737237, Accuracy: 70.83 Test_loss: 0.027046055325865747, Test_accuracy: 69.97
Epoch [3], Loss: 0.022179243130385877, Accuracy: 75.11 Test_loss: 0.02481114484965801, Test_accuracy: 72.95
Epoch [4], Loss: 0.01933788091301918, Accuracy: 78.26 Test_loss: 0.024382170912623406, Test_accuracy: 73.55
Epoch [5], Loss: 0.016771901984512807, Accuracy: 81.04 Test_loss: 0.024696413831412793, Test_accuracy: 73.53
Epoch [6], Loss: 0.014588635778725148, Accuracy: 83.41 Test_loss: 0.025593858751654625, Test_accuracy: 73.94
Epoch [7], Loss: 0.01255791916936636, Accuracy: 85.94 Test_loss: 0.026889967443048952, Test_accuracy: 73.69
Finished Training

Comparing the two runs: without Dropout, training accuracy climbs to 85.94% while test accuracy stalls around 73.69%, a gap of more than 12 points, and test loss actually starts rising after epoch 4. With Dropout, the final gap is only about 3 points (76.91% vs. 73.93%). Dropout regularization clearly reduces overfitting, keeping training and test performance much closer together.