基于 chinese-roberta-wwm-ext 微调训练 6 分类情感分析模型

news2025/7/7 10:06:19

一、模型和数据集介绍

1.1 预训练模型

chinese-roberta-wwm-ext 是基于 RoBERTa 架构开发的中文预训练模型，其中 wwm 代表 Whole Word Masking（全词掩码），即以整个词为单位进行掩码处理。通过这种方式，模型能够更好地理解上下文和语义关联，提高中文文本处理的准确性和效果。

与原始的 BERT 模型相比，chinese-roberta-wwm-ext 在训练数据规模和训练步数上做了一些调整，以进一步提升模型的性能和鲁棒性。并且在大规模无监督语料库上进行了预训练，使其具备强大的语言理解和生成能力。它能够广泛应用于各种自然语言处理任务，如文本分类、命名实体识别、情感分析等。我们可以使用这个模型作为基础，在不同的任务上进行微调和迁移学习，以实现更准确、高效的中文文本处理。

huggingface地址：https://huggingface.co/hfl/chinese-roberta-wwm-ext

进到 huggingface 中下载预训练模型：

在这里插入图片描述

1.2 数据集

数据集采用 SMP2020 微博情绪分类评测数据，进入下面链接下载数据集：

https://smp2020ewect.github.io/

在这里插入图片描述

下载后数据分了三种类型，训练集、测试集、验证集：

在这里插入图片描述

数据的格式为 JSON 格式，结构如下：

在这里插入图片描述

其中 label 为当前 content 的分类，一共分了 6 种类别，如下所示：

label	说明
fear	恐惧
neutral	无情绪
sad	悲伤
surprise	惊奇
angry	愤怒
happy	积极

二、模型微调训练

2.1 处理数据集构建 DataLoader

from transformers import BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
import numpy as np
import torch
import json
import os


class GenDateSet():
    """Build PyTorch ``DataLoader`` objects from JSON sentiment datasets.

    Each dataset file is read with ``json.load`` and iterated as a sequence
    of objects carrying a ``content`` (text) field and a ``label`` field.
    Labels are converted to integer class ids through ``label_dict``.
    """

    def __init__(self, tokenizer, train_file, val_file, label_dict, max_length=128, batch_size=10):
        """Store the tokenizer, dataset paths and batching options.

        Args:
            tokenizer: Callable invoked as ``tokenizer(text, padding=...,
                truncation=..., max_length=...)``; its result must provide
                ``input_ids``, ``token_type_ids`` and ``attention_mask``.
            train_file: Path of the training JSON file.
            val_file: Path of the validation JSON file.
            label_dict: Mapping from label name to integer class id.
            max_length: ``max_length`` forwarded to the tokenizer (used
                together with ``padding="max_length"`` and truncation).
            batch_size: Number of samples per batch of the DataLoader.
        """
        self.train_file = train_file
        self.val_file = val_file
        self.max_length = max_length
        self.batch_size = batch_size
        self.label_dict = label_dict
        self.tokenizer = tokenizer

    def gen_data(self, file):
        """Tokenize every record of ``file`` and wrap them in a DataLoader.

        Args:
            file: Path of a JSON dataset file.

        Returns:
            A ``DataLoader`` over a ``TensorDataset`` of
            ``(input_ids, token_type_ids, attention_mask, labels)``, sampled
            with ``RandomSampler``. Labels have shape ``(N, 1)``.

        Raises:
            FileNotFoundError: ``file`` does not exist.
            ValueError: ``file`` was parsed but contains no records.
            KeyError: A record's label is missing from ``label_dict``.
        """
        if not os.path.isfile(file):
            raise FileNotFoundError(f"数据集不存在: {file}")
        with open(file, encoding='utf8') as f:
            data = json.load(f)
        if not data:
            # The file exists but holds no records; report that instead of
            # reusing the "does not exist" message.
            raise ValueError(f"数据集为空: {file}")

        input_ids = []
        input_types = []
        input_masks = []
        labels = []

        # Tokenize the records one by one.
        for index, item in enumerate(data):
            text = item['content']
            label = item['label']
            tokens = self.tokenizer(text, padding="max_length", truncation=True, max_length=self.max_length)
            input_ids.append(tokens['input_ids'])
            input_types.append(tokens['token_type_ids'])
            input_masks.append(tokens['attention_mask'])
            try:
                class_id = self.label_dict[label]
            except KeyError:
                # Same exception type as before, but say which label and
                # which record caused it.
                raise KeyError(f"未知标签: {label!r}（第 {index} 条数据）") from None
            # Each label is wrapped in a list, giving a (N, 1) label tensor.
            labels.append([class_id])

            if index % 1000 == 0:
                print('处理', index, '条数据')

        # Build the TensorDataset.
        data_gen = TensorDataset(torch.LongTensor(np.array(input_ids)),
                                 torch.LongTensor(np.array(input_types)),
                                 torch.LongTensor(np.array(input_masks)),
                                 torch.LongTensor(np.array(labels)))
        # Shuffle the samples.
        sampler = RandomSampler(data_gen)
        # Build the DataLoader.
        return DataLoader(data_gen, sampler=sampler, batch_size=self.batch_size)

    def gen_train_data(self):
        """Return the DataLoader built from ``train_file``."""
        return self.gen_data(self.train_file)

    def gen_val_data(self):
        """Return the DataLoader built from ``val_file``."""
        return self.gen_data(self.val_file)

2.2 构建模型迭代训练

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn as nn
from tqdm import tqdm
from gen_datasets import GenDateSet

# Label name -> class id; ids follow the order in which the names are listed.
label_dict = {
    name: idx
    for idx, name in enumerate(('fear', 'neutral', 'sad', 'surprise', 'angry', 'happy'))
}

# Location of the pretrained model.
model_dir = r'D:\AIGC\model\chinese-roberta-wwm-ext'
# Training set. The smaller labelled test split
# ('data/usual_test_labeled.txt') can be substituted for a quick trial run.
train_file = 'data/usual_train.txt'
# Validation set.
val_file = 'data/virus_eval_labeled.txt'
# Where the trained model is stored.
save_model_path = './model/'
# Maximum sequence length.
max_length = 128
# Number of classes.
num_classes = 6
batch_size = 10
epoch = 10

def val(model, device, data):
    """Evaluate ``model`` on ``data`` and return the classification accuracy.

    The average cross-entropy loss over the batches is printed as well.

    Args:
        model: Model called as ``model(input_ids, token_type_ids=...,
            attention_mask=...)`` whose output is indexable by ``'logits'``.
        device: Device every batch is moved to before the forward pass.
        data: DataLoader yielding ``(input_id, types, masks, y)`` batches.

    Returns:
        The number of correctly predicted samples divided by
        ``len(data.dataset)``, or ``0.0`` when the dataset is empty.
    """
    total = len(data.dataset)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    # Remember the current mode so that evaluating in the middle of training
    # does not leave the model in eval mode afterwards.
    was_training = model.training
    model.eval()
    test_loss = 0.0
    correct = 0
    try:
        with torch.no_grad():
            for (input_id, types, masks, y) in tqdm(data):
                input_id, types, masks, y = input_id.to(device), types.to(device), masks.to(device), y.to(device)
                logits = model(input_id, token_type_ids=types, attention_mask=masks)['logits']
                # Flatten the labels to shape (batch,). A bare squeeze() would
                # also drop the batch dimension of a final batch holding one
                # sample, which cross_entropy does not accept for 2-D logits.
                target = y.reshape(-1)
                # .item() keeps the running loss a plain float.
                test_loss += nn.functional.cross_entropy(logits, target).item()
                pred = logits.argmax(dim=-1)
                correct += pred.eq(target).sum().item()
    finally:
        model.train(was_training)

    print('val loss = ', test_loss / len(data))
    return correct / total


def main():
    """Fine-tune the pretrained classifier and keep the best checkpoint.

    Loads the tokenizer and model from ``model_dir``, builds the training
    and validation loaders with ``GenDateSet``, trains for ``epoch`` epochs
    with Adam, and after each epoch saves the model and tokenizer to
    ``save_model_path`` whenever the validation accuracy improves.
    """
    # Load the tokenizer and the model.
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=num_classes)
    # Build the datasets.
    dateset = GenDateSet(tokenizer, train_file, val_file, label_dict, max_length, batch_size)
    train_data = dateset.gen_train_data()
    val_data = dateset.gen_val_data()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Optimizer.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    best_acc = 0.0
    for epoch_index in range(epoch):
        # val() calls model.eval(), so training mode has to be re-enabled at
        # the start of every epoch; enabling it once before the loop would
        # leave all epochs after the first running in eval mode.
        model.train()
        for batch_epoch, (input_id, types, masks, y) in enumerate(tqdm(train_data), start=1):
            input_id, types, masks, y = input_id.to(device), types.to(device), masks.to(device), y.to(device)
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            # Forward pass; passing labels makes the model return the loss.
            outputs = model(input_id, token_type_ids=types, attention_mask=masks, labels=y)
            loss = outputs.loss
            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()
            if batch_epoch % 10 == 0:
                print('Train Epoch:', epoch_index, ' , batch_epoch: ', batch_epoch, ' , loss = ', loss.item())

        # Measure accuracy on the validation set.
        acc = val(model, device, val_data)
        print('Train Epoch:', epoch_index, ' val acc = ', acc)
        # Keep only the best model seen so far.
        if best_acc < acc:
            model.save_pretrained(save_model_path)
            tokenizer.save_pretrained(save_model_path)
            best_acc = acc


if __name__ == '__main__':
    main()

运行之后可以看到训练进度：

在这里插入图片描述

训练中可以看到验证集的准确率，等待训练结束后，可以在 model 下看到保存的模型和 tokenizer 文件：

在这里插入图片描述

三、模型测试

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Class id -> display name of the emotion, numbered in listing order.
label_dict = dict(enumerate(('恐惧', '无情绪', '悲伤', '惊奇', '愤怒', '积极')))
# Directory the model and tokenizer are loaded from.
model_dir = "./model"
# Number of classes.
num_classes = 6
# Maximum sequence length passed to the tokenizer.
max_length = 128


def main():
    """Interactively classify the emotion of text typed on standard input.

    Loads the tokenizer and model from ``model_dir`` and then loops: reads a
    line, skips blank input, stops on ``q`` or end of input, and prints the
    ``label_dict`` entry of the highest-scoring class.
    """
    # Load the model and the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=num_classes)
    # Inference only: make the evaluation mode explicit.
    model.eval()

    while True:
        try:
            # Strip so that whitespace-only input is skipped and " q " quits.
            text = input("请输入内容: \n ").strip()
        except EOFError:
            # End of input (Ctrl-D / closed pipe): exit instead of crashing.
            break
        if not text:
            continue
        if text == "q":
            break

        # return_tensors="pt" yields tensors with a leading batch dimension
        # of 1, matching the tensors previously built by hand.
        encoded_input = tokenizer(text, padding="max_length", truncation=True,
                                  max_length=max_length, return_tensors="pt")
        # Forward pass; no gradients are needed for prediction.
        with torch.no_grad():
            y_ = model(encoded_input['input_ids'],
                       token_type_ids=encoded_input['token_type_ids'],
                       attention_mask=encoded_input['attention_mask'])
        pred = y_['logits'][0].argmax(dim=-1).item()
        print('预测结果：', label_dict[pred])


if __name__ == '__main__':
    main()