基于遗传算法特征选择及单层感知机模型的IMDB电影评论文本分类案例

1.数据载入及处理
2.感知机模型建立
3.模型训练
4.遗传算法进行特征选择
- 注意
5.联系我们

1.数据载入及处理

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from keras.datasets import imdb
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

# Vocabulary size: only the `max_features` most frequent words are kept.
max_features = 10000
# Every review is padded/truncated to exactly `maxlen` tokens.
maxlen = 200
# Number of reviews kept from each split to keep the experiment small.
num_samples = 2000
# Batch size used by every DataLoader below (the whole subset in one batch).
batch_size = 2000

# Load the IMDB dataset
print('Loading data...')
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

# Keep the first `num_samples` reviews of each split, then pad/truncate them.
# The labels MUST be sliced the same way: TensorDataset requires every tensor
# to have the same number of rows, and unsliced labels would also be paired
# with the wrong reviews.
print('Pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train[:num_samples], maxlen=maxlen)
input_test = sequence.pad_sequences(input_test[:num_samples], maxlen=maxlen)
y_train = y_train[:num_samples]
y_test = y_test[:num_samples]
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

# Convert integer sequences back to text
word_index = imdb.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}


def _decode_review(token_ids):
    """Turn one padded sequence of token ids back into a space-joined string.

    Ids are shifted by 3 before lookup (the Keras IMDB loader offsets real
    word indices by its default `index_from=3`); ids without a matching word,
    including the padding value, are rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(i - 3, '?') for i in token_ids)


decoded_review = _decode_review(input_train[0])

# Represent the text with a bag-of-words model; the vocabulary is fitted on
# the training reviews only and reused for the test reviews.
vectorizer = CountVectorizer(max_features=max_features)
X_train = vectorizer.fit_transform([_decode_review(review) for review in input_train])
X_test = vectorizer.transform([_decode_review(review) for review in input_test])

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_iter = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size)
test_iter = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size)

2.感知机模型建立

# Perceptron network definition
class Perceptron(nn.Module):
    """Single-layer perceptron: one linear unit followed by a sigmoid.

    Args:
        input_size: number of input features per sample.

    The linear layer is exposed as `fc` and the activation as `sigmoid`.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc = nn.Linear(in_features=input_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(fc(x)); the last dimension of the output has size 1."""
        return self.sigmoid(self.fc(x))

# Train the perceptron model
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over every batch yielded by `iterator`.

    Args:
        model: module whose output is squeezed along dimension 1 before the loss.
        iterator: iterable of (inputs, labels) batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        None. The model is left in training mode with updated parameters.
    """
    model.train()
    for features, targets in iterator:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        outputs = model(features)
        batch_loss = criterion(outputs.squeeze(1), targets)
        batch_loss.backward()
        optimizer.step()

# Evaluate the perceptron model
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss and the accuracy over `iterator`.

    Args:
        model: module whose output is squeezed along dimension 1.
        iterator: sized iterable of (inputs, labels) batches exposing `.dataset`.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean_loss, accuracy): the summed batch losses divided by the number
        of batches, and the number of rounded predictions equal to their
        label divided by len(iterator.dataset).
    """
    model.eval()
    running_loss = 0
    hits = 0
    with torch.no_grad():
        for features, targets in iterator:
            probs = model(features).squeeze(1)
            running_loss += criterion(probs, targets).item()
            # Rounding turns the sigmoid output into a hard 0/1 decision.
            hits += (probs.round() == targets).sum().item()
    mean_loss = running_loss / len(iterator)
    accuracy = hits / len(iterator.dataset)
    return mean_loss, accuracy

# Initialise the perceptron with one input per bag-of-words column
input_size = X_train_tensor.size(1)
model = Perceptron(input_size=input_size)

3.模型训练

# Loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train for a fixed number of epochs, recording the test accuracy after each.
N_EPOCHS = 10
eval_acc_list = []
for epoch in range(N_EPOCHS):
    train(model, train_iter, optimizer, criterion)
    eval_loss, eval_acc = evaluate(model, test_iter, criterion)
    eval_acc_list.append(eval_acc)
    print(f'Epoch: {epoch+1}, Test Loss: {eval_loss:.3f}, Test Acc: {eval_acc*100:.2f}%')

# Plot test accuracy against the (0-based) epoch index.
plt.plot(range(N_EPOCHS), eval_acc_list)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('Test Accuracy')
plt.show()

（图：上方代码绘制的测试集准确率随训练轮次变化的曲线）

4.遗传算法进行特征选择

# Randomly initialise the chromosomes
def initialize_population(population_size, num_genes, one_probability=None):
    """Create a random 0/1 population of shape (population_size, num_genes).

    Args:
        population_size: number of chromosomes.
        num_genes: number of genes (candidate features) per chromosome.
        one_probability: probability that a gene starts as 1. ``None``
            (default) draws 0 and 1 uniformly, exactly as before.

    Returns:
        Integer ndarray containing only 0s and 1s.

    Raises:
        ValueError: if `one_probability` is given and lies outside [0, 1].
    """
    shape = (population_size, num_genes)

    if one_probability is None:
        # Option 2 (default): each gene is 0 or 1 with equal probability.
        return np.random.choice([0, 1], size=shape)

    if not 0.0 <= one_probability <= 1.0:
        raise ValueError("one_probability must be between 0 and 1")

    # Option 1: biased start, e.g. one_probability=0.95 keeps about 95% of
    # the features in every initial chromosome.
    return np.random.choice(
        [0, 1], size=shape, p=[1.0 - one_probability, one_probability]
    )

# Fitness = accuracy of a classifier trained on the selected features
def calculate_fitness(population, model, criterion):
    """Score every chromosome by training `model` on its selected features.

    For each chromosome, the columns whose gene equals 1 are selected,
    `model.fc` is replaced by a fresh linear layer of matching input width,
    the model is trained for 10 epochs and its accuracy is recorded.

    Relies on the module-level `X_train_tensor`, `y_train_tensor`,
    `X_test_tensor`, `y_test_tensor` and `batch_size`, and on the sibling
    functions `train` and `evaluate`.

    NOTE(review): fitness is measured on the test split, so the selected
    features are tuned on the same data later reported as test accuracy.

    Args:
        population: iterable of 0/1 arrays, one per chromosome.
        model: model with a replaceable `fc` layer; it is mutated in place
            and ends up holding the layer trained for the last chromosome.
        criterion: loss passed through to `train` and `evaluate`.

    Returns:
        1-D ndarray with one accuracy per chromosome, in population order.
    """
    n_epochs = 10
    fitness = []
    for chromosome in population:  # each chromosome is a 0/1 sequence
        selected_features = np.where(chromosome == 1)[0]

        # Resize the input layer to the number of selected features.
        model.fc = nn.Linear(len(selected_features), 1)
        # The optimizer must be rebuilt: it holds references to the
        # parameters of the layer that was just replaced.
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        idx = torch.tensor(selected_features)
        train_iter = DataLoader(TensorDataset(X_train_tensor[:, idx], y_train_tensor), batch_size)
        test_iter = DataLoader(TensorDataset(X_test_tensor[:, idx], y_test_tensor), batch_size)

        # Train, then evaluate once: only the accuracy after the final epoch
        # is used, so evaluating after every epoch was wasted work.
        for _ in range(n_epochs):
            train(model, train_iter, optimizer, criterion)
        _, test_acc = evaluate(model, test_iter, criterion)
        model.train()  # leave the model in training mode, as before
        fitness.append(test_acc)
    return np.array(fitness)

# Selection
def selection(population, fitness):
    """Sample a new parent pool with probability proportional to fitness.

    Args:
        population: 2-D ndarray, one chromosome per row.
        fitness: one non-negative score per chromosome (e.g. accuracy).

    Returns:
        ndarray with the same shape as `population`; rows are copies of the
        sampled chromosomes (drawn with replacement), and adjacent rows are
        treated as a parent pair by `crossover`.
    """
    fitness = np.asarray(fitness, dtype=float)
    total = fitness.sum()
    # When every score is zero (or the total is NaN) the normalised
    # probabilities would be NaN and np.random.choice would raise; sample
    # uniformly in that case instead.
    probabilities = fitness / total if total > 0 else None

    # # Option 1: no random in selection, choose the top 2 as parents
    # probabilities_copy = probabilities.copy()
    # probabilities_copy.sort()
    # max_1 = probabilities_copy[-1]
    # max_2 = probabilities_copy[-2]
    # max_1_index = np.where(probabilities == max_1)
    # max_2_index = np.where(probabilities == max_2)
    # selected_indices = [max_1_index[0].tolist()[0], max_2_index[0].tolist()[0]] * 25

    # Option 2: random, fitness-proportional sampling
    selected_indices = np.random.choice(len(population), size=len(population), p=probabilities)

    return population[selected_indices]

# Crossover
def crossover(parents, crossover_rate):
    """Apply single-point crossover to adjacent parent pairs.

    Rows (0, 1), (2, 3), ... form the pairs. With probability
    `crossover_rate`, a pair swaps tails at a random cut point; otherwise
    both parents pass through unchanged.

    Args:
        parents: 2-D ndarray (or sequence of 1-D arrays) of chromosomes.
        crossover_rate: probability in [0, 1] that a pair is recombined.

    Returns:
        New ndarray with one child per parent. With an odd number of
        parents, the last one has no partner and is carried over unchanged
        (previously this raised IndexError).
    """
    children = []
    for i in range(0, len(parents) - 1, 2):
        parent1, parent2 = parents[i], parents[i + 1]
        # A cut point needs at least two genes; single-gene chromosomes are
        # passed through (np.random.randint(1, 1) would raise).
        if len(parent1) > 1 and np.random.rand() < crossover_rate:
            crossover_point = np.random.randint(1, len(parent1))
            child1 = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
            child2 = np.concatenate((parent2[:crossover_point], parent1[crossover_point:]))
        else:
            child1, child2 = parent1, parent2
        children.extend([child1, child2])

    if len(parents) % 2:
        children.append(parents[-1])

    # np.array copies the rows, so later in-place mutation of the children
    # never touches the parents.
    return np.array(children)

# Mutation
def mutation(children, mutation_rate):
    """Flip each gene independently with probability `mutation_rate`.

    Args:
        children: 2-D ndarray (or sequence of 1-D arrays) of 0/1 genes;
            it is modified in place.
        mutation_rate: per-gene flip probability.

    Returns:
        The same `children` object, after mutation.
    """
    for child in children:
        # One uniform draw per gene decides whether that gene flips.
        flip = np.random.rand(len(child)) < mutation_rate
        child[flip] = 1 - child[flip]  # key step: 0 -> 1 and 1 -> 0
    return children

# Main routine of the genetic algorithm
def genetic_algorithm(population_size, num_genes, generations, crossover_rate, mutation_rate, model, criterion):
    """Evolve 0/1 feature masks and return the fittest one that was evaluated.

    Each generation scores the current population with `calculate_fitness`,
    then builds the next one via `selection`, `crossover` and `mutation`.
    The best fitness of every generation is plotted at the end.

    Args:
        population_size: number of chromosomes per generation.
        num_genes: number of genes (candidate features) per chromosome.
        generations: number of generations to run; must be at least 1.
        crossover_rate: probability that a parent pair is recombined.
        mutation_rate: per-gene flip probability.
        model: model handed to `calculate_fitness` (mutated in place there).
        criterion: loss handed to `calculate_fitness`.

    Returns:
        1-D 0/1 ndarray: the chromosome with the highest fitness seen in any
        generation.

    Raises:
        ValueError: if `generations` is smaller than 1.
    """
    if generations < 1:
        raise ValueError("generations must be at least 1")

    # Initialise the chromosomes
    population = initialize_population(population_size, num_genes)

    fitness_list = []
    best_individual = None
    best_fitness = -np.inf

    for generation in range(generations):
        print('Generation', generation+1, ":")
        # One score per chromosome of the CURRENT population.
        fitness = calculate_fitness(population, model, criterion)

        # Record this generation's champion before the population is
        # replaced: `fitness` describes the current population only, so
        # indexing the offspring with its argmax would pick an unrelated
        # chromosome.
        champion_index = np.argmax(fitness)
        generation_best = population[champion_index].copy()
        if fitness[champion_index] > best_fitness:
            best_fitness = fitness[champion_index]
            best_individual = generation_best

        # Selection: adjacent rows of the result are parent pairs
        selected_population = selection(population, fitness)

        # Crossover
        children = crossover(selected_population, crossover_rate)

        # Mutation
        mutated_children = mutation(children, mutation_rate)

        # The offspring become the next generation
        population = mutated_children

        # Report the best solution of this generation
        fitness_list.append(fitness.max())
        print(f"Generation {generation + 1}, Best Individual: {generation_best}, Fitness: {fitness.max()}")

    plt.plot(range(generations), fitness_list)
    plt.title('Test Accuracy with feature selection via genetic algorithm')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.show()

    # Return the best solution found
    return best_individual

# Run the genetic algorithm on a fresh perceptron
model = Perceptron(input_size)
best_solution = genetic_algorithm(
    population_size=50,
    num_genes=input_size,
    generations=10,
    crossover_rate=0.8,
    mutation_rate=0.1,
    model=model,
    criterion=criterion,
)
print(f"Final Best Solution: {best_solution}")

# Interpret the best solution: indices of the genes set to 1
selected_features = np.nonzero(best_solution == 1)[0]
print(f"Selected Features: {selected_features}")
print("Shape of Selected Features = ",selected_features.shape)

（图：上方代码绘制的遗传算法各代最优个体的测试集准确率曲线）

注意

在本任务中，selection 函数的 Option 1（仅选择适应度最高的两个染色体作为父母）比 Option 2（按适应度比例在种群中随机选择）效果更好（10 轮之后，验证集精度 74% > 71%；此处的验证集即代码中的测试集）；
在本任务中，initialize_population 函数在初始化时让每个特征以较高的概率被选中（95%，Option 1），比等概率随机选择特征（50%，Option 2）效果更好；
每一次根据筛选出的输入特征维度修改模型结构（替换 model.fc）之后，都需要重新创建 optimizer：optimizer 在创建时通过 model.parameters() 绑定了当时的参数，替换 model.fc 之后，旧的 optimizer 仍然指向旧层的参数，新层将得不到更新。