Contents
1. Dataset
2. Data Cleaning and Splitting
2.1 Installing Dependencies
2.2 Cleaning and Splitting
3. Downloading the BERT Model
4. Training and Testing the Model
This article implements sentiment classification based on BERT and BiLSTM. It draws on several blog posts; see the reference links at the end.
The source code has been uploaded to Gitee: bert-bilstm-in-Sentiment-classification.
1. Dataset
The dataset is a collection of Weibo comments posted during the COVID-19 outbreak, about 100,000 entries in total. It can be downloaded here: Weibo nCoV Data.
2. Data Cleaning and Splitting
2.1 Installing Dependencies
Install the following dependencies; if downloads are slow, switch pip to a domestic (China) mirror. An example install command follows the list.
pandas
scikit-learn
matplotlib
seaborn
torch
torchvision
datasets
transformers
pytorch_lightning
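For reference, a single pip command covering the list above (the Tsinghua mirror URL is just one common choice of domestic source; any mirror works):
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pandas scikit-learn matplotlib seaborn torch torchvision datasets transformers pytorch_lightning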
2.2 Cleaning and Splitting
The implementation is shown below.
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
# Read Data
df = pd.read_csv('data/nCoV_100k_train.labled.csv')
# Only need text and labels
df = df[['微博中文内容', '情感倾向']]
df = df.rename(columns={'微博中文内容': 'text', '情感倾向': 'label'})
print(df)
# Observing data balance
print(df.label.value_counts())
print(df.label.value_counts() / df.shape[0] * 100)
plt.figure(figsize=(8, 4))
sns.countplot(x='label', data=df)
plt.show()
# print(df_train[df_train.label > 5.0])
# print(df_train[(df_train.label < -1.1)])
# # discarding outliers
# df_train.drop(df_train[(df_train.label < -1.1) | (df_train.label > 5)].index, inplace=True, axis=0)
# df_train.reset_index(inplace=True, drop=True)
# print(df_train.label.value_counts())
# sns.countplot(x='label', data=df_train)
# plt.show()
df.drop(df[(df.label == '4') |
           (df.label == '-') |
           (df.label == '·') |
           (df.label == '-2') |
           (df.label == '10') |
           (df.label == '9')].index, inplace=True, axis=0)
df.reset_index(inplace=True, drop=True)
print(df.label.value_counts())
sns.countplot(x='label', data=df)
plt.show()
# checking for empty rows
print(df.isnull().sum())
# deleting empty row data
df.dropna(axis=0, how='any', inplace=True)
df.reset_index(inplace=True, drop=True)
print(df.isnull().sum())
# examining duplicate data
print(df.duplicated().sum())
print(df[df.duplicated()==True])
# deleting duplicate data
index = df[df.duplicated() == True].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)
print(df.duplicated().sum())
# We also need to address duplicate data where the text is the same but the label is different
print(df['text'].duplicated().sum())
print(df[df['text'].duplicated() == True])
# viewing examples
print(df[df['text'] == df.iloc[1473]['text']])
print(df[df['text'] == df.iloc[1814]['text']])
# removing data where the text is the same but the label is different
index = df[df['text'].duplicated() == True].index
df.drop(index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)
# checking
print(df['text'].duplicated().sum()) # 0
print(df)
# inspecting shapes and indices
print("======data-clean======")
print(df.tail())
print(df.shape)
# viewing the maximum length of text
print(df['text'].str.len().sort_values())
# Split dataset. 0.6/0.2/0.2
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.25)
print(train.shape)
print(test.shape)
print(val.shape)
train.to_csv('./data/clean/train.csv', index=None)
val.to_csv('./data/clean/val.csv', index=None)
test.to_csv('./data/clean/test.csv', index=None)
3. Downloading the BERT Model
Download the model files from Hugging Face at google-bert/bert-base-chinese. Files downloaded from the ModelScope community may fail at runtime with the following error.
safetensors_rust.SafetensorError: Error while deserializing header: HeaderTooLarge
So download the corresponding model files from Hugging Face instead.
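One way to fetch the files is with the huggingface_hub Python package (a minimal sketch; the local directory below is an assumption chosen to match the paths used by the training code):

from huggingface_hub import snapshot_download

# Download google-bert/bert-base-chinese into the directory the training code expects
snapshot_download(repo_id="google-bert/bert-base-chinese",
                  local_dir="./model/bert-base-chinese")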
4. Training and Testing the Model
The training and testing code is shown below.
# https://www.kaggle.com/code/isseyice/sentiment-classification-based-on-bert-and-lstm
# https://github.com/iceissey/issey_Kaggle/blob/main/Bert_BiLSTM/BiLSTM_lighting.py#L36
# https://www.cnblogs.com/chuanzhang053/p/17653381.html
import torch
import datasets
import pandas as pd
from datasets import load_dataset # hugging-face dataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import torch.optim as optim
from torch.nn.functional import one_hot
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torchmetrics.functional import accuracy, recall, precision, f1_score  # metrics used with Lightning
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
# Hyperparameters
batch_size = 16
epochs = 5
dropout = 0.1
rnn_hidden = 768
rnn_layer = 1
class_num = 3
lr = 0.001
PATH = './model'  # model checkpoint path (point this at the saved .ckpt file when testing a stored model)
# Tokenizer
token = BertTokenizer.from_pretrained('./model/bert-base-chinese')
# Custom dataset
class MydataSet(Dataset):
    def __init__(self, path, split):
        # self.dataset = load_dataset('csv', data_files=path, split=split)  # may raise: TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'
        self.df = pd.read_csv(path)
        self.dataset = datasets.Dataset.from_pandas(self.df)

    def __getitem__(self, item):
        text = self.dataset[item]['text']
        label = self.dataset[item]['label']
        return text, label

    def __len__(self):
        return len(self.dataset)
# Batch collate function
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]
    # Tokenize and encode
    data = token.batch_encode_plus(
        batch_text_or_text_pairs=sents,  # encode single sentences
        truncation=True,                 # truncate sentences longer than max_length
        padding='max_length',            # pad every sentence to max_length
        max_length=300,
        return_tensors='pt',             # return PyTorch tensors; other options are 'tf' and 'np', default is a Python list
        return_length=True,
    )
    # input_ids: the encoded token ids
    # attention_mask: 0 at padded positions, 1 elsewhere
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']  # (for sentence pairs) 0 for the first sentence and special tokens, 1 for the second sentence
    labels = torch.LongTensor(labels)  # labels for this batch
    # print(data['length'], data['length'].max())
    return input_ids, attention_mask, token_type_ids, labels
# Model: pretrained BERT upstream, a bidirectional LSTM for the downstream task, and a final fully connected layer
class BiLSTMClassifier(nn.Module):
    def __init__(self, drop, hidden_dim, output_dim):
        super(BiLSTMClassifier, self).__init__()
        self.drop = drop
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        # Load the Chinese BERT model
        self.embedding = BertModel.from_pretrained('./model/bert-base-chinese')
        # Freeze the upstream (BERT) parameters so they are not updated during training
        for param in self.embedding.parameters():
            param.requires_grad_(False)
        # Downstream LSTM layers and the fully connected layer
        self.lstm = nn.LSTM(input_size=768, hidden_size=self.hidden_dim, num_layers=2, batch_first=True,
                            bidirectional=True, dropout=self.drop)
        self.fc = nn.Linear(self.hidden_dim * 2, self.output_dim)
        # No activation is needed here: CrossEntropyLoss combines softmax, log and NLLLoss internally.

    def forward(self, input_ids, attention_mask, token_type_ids):
        embedded = self.embedding(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        embedded = embedded.last_hidden_state  # the embedding we need is embedded[0], i.e. last_hidden_state
        out, (h_n, c_n) = self.lstm(embedded)
        # Concatenate the final hidden states of the forward and backward LSTM directions
        output = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        output = self.fc(output)
        return output
# PyTorch Lightning module
class BiLSTMLighting(pl.LightningModule):
    def __init__(self, drop, hidden_dim, output_dim):
        super(BiLSTMLighting, self).__init__()
        self.model = BiLSTMClassifier(drop, hidden_dim, output_dim)  # set up model
        self.criterion = nn.CrossEntropyLoss()  # set up loss function
        self.train_dataset = MydataSet('./data/clean/train.csv', 'train')
        self.val_dataset = MydataSet('./data/clean/val.csv', 'train')
        self.test_dataset = MydataSet('./data/clean/test.csv', 'train')

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=lr)
        return optimizer

    def forward(self, input_ids, attention_mask, token_type_ids):  # forward(self, x)
        return self.model(input_ids, attention_mask, token_type_ids)

    def train_dataloader(self):
        train_loader = DataLoader(dataset=self.train_dataset, batch_size=batch_size, collate_fn=collate_fn,
                                  shuffle=True, num_workers=3)
        return train_loader

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, labels = batch  # x, y = batch
        y = one_hot(labels + 1, num_classes=3)  # shift labels {-1, 0, 1} to {0, 1, 2} before one-hot encoding
        # Convert the one-hot labels to float
        y = y.to(dtype=torch.float)
        # Forward pass
        y_hat = self.model(input_ids, attention_mask, token_type_ids)
        # y_hat = y_hat.squeeze()  # squeeze [128, 1, 3] to [128, 3]
        loss = self.criterion(y_hat, y)  # criterion(input, target)
        self.log('train_loss', loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)  # log the loss
        return loss  # the loss must be returned

    def val_dataloader(self):
        val_loader = DataLoader(dataset=self.val_dataset, batch_size=batch_size, collate_fn=collate_fn,
                                shuffle=False, num_workers=3)
        return val_loader

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, labels = batch
        y = one_hot(labels + 1, num_classes=3)
        y = y.to(dtype=torch.float)
        # Forward pass
        y_hat = self.model(input_ids, attention_mask, token_type_ids)
        # y_hat = y_hat.squeeze()
        loss = self.criterion(y_hat, y)
        self.log('val_loss', loss, prog_bar=False, logger=True, on_step=True, on_epoch=True)
        return loss

    def test_dataloader(self):
        test_loader = DataLoader(dataset=self.test_dataset, batch_size=batch_size, collate_fn=collate_fn,
                                 shuffle=False, num_workers=3)
        return test_loader

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, labels = batch
        target = labels + 1  # used below for accuracy and F1-score
        y = one_hot(target, num_classes=3)
        y = y.to(dtype=torch.float)
        # Forward pass
        y_hat = self.model(input_ids, attention_mask, token_type_ids)
        # y_hat = y_hat.squeeze()
        pred = torch.argmax(y_hat, dim=1)
        acc = (pred == target).float().mean()
        loss = self.criterion(y_hat, y)
        self.log('loss', loss)
        # task: Literal["binary", "multiclass", "multilabel"]
        # average=None: report a score per class; with an average mode, the scores are aggregated
        re = recall(pred, target, task="multiclass", num_classes=class_num, average=None)
        pre = precision(pred, target, task="multiclass", num_classes=class_num, average=None)
        f1 = f1_score(pred, target, task="multiclass", num_classes=class_num, average=None)

        def log_score(name, scores):
            for i, score_class in enumerate(scores):
                self.log(f"{name}_class{i}", score_class)

        log_score("recall", re)
        log_score("precision", pre)
        log_score("f1", f1)
        self.log('acc', accuracy(pred, target, task="multiclass", num_classes=class_num))
        self.log('avg_recall', recall(pred, target, task="multiclass", num_classes=class_num, average="weighted"))
        self.log('avg_precision', precision(pred, target, task="multiclass", num_classes=class_num, average="weighted"))
        self.log('avg_f1', f1_score(pred, target, task="multiclass", num_classes=class_num, average="weighted"))
# Training
def train():
    # Add a checkpoint callback so the best model is saved
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',                           # metric to monitor
        dirpath='./model/checkpoints/',               # path to save the model
        filename='model-{epoch:02d}-{val_loss:.2f}',  # name of the best model
        save_top_k=1,                                 # save only the best one
        mode='min'                                    # best = lowest monitored metric
    )
    # The Trainer also helps with debugging (fast runs, using only a small subset of data, sanity checks, etc.);
    # see https://lightning.ai/docs/pytorch/latest/debug/debugging_basic.html
    # devices="auto" picks the number of GPUs automatically
    trainer = Trainer(max_epochs=epochs, log_every_n_steps=10, accelerator='gpu', devices="auto",
                      fast_dev_run=False, callbacks=[checkpoint_callback])
    model = BiLSTMLighting(drop=dropout, hidden_dim=rnn_hidden, output_dim=class_num)
    trainer.fit(model)
    return model

# Testing
def test(model=None):
    # Load the parameters of the previously trained best model
    if model is None:
        model = BiLSTMLighting.load_from_checkpoint(checkpoint_path=PATH,
                                                    drop=dropout, hidden_dim=rnn_hidden, output_dim=class_num)
    trainer = Trainer(fast_dev_run=False)
    result = trainer.test(model)
    print(result)

if __name__ == '__main__':
    model = train()
    test(model)
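If you later want to evaluate a saved checkpoint without retraining, one option (a sketch; the exact filename depends on what ModelCheckpoint actually wrote under ./model/checkpoints/) is to load the .ckpt file directly and pass the model to test():

ckpt = './model/checkpoints/model-04-0.55.ckpt'  # hypothetical filename; use the file actually produced
model = BiLSTMLighting.load_from_checkpoint(checkpoint_path=ckpt,
                                            drop=dropout, hidden_dim=rnn_hidden, output_dim=class_num)
test(model)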
Run the following command.
python main.py
Output:
root@dsw-398300-bf64cb7b7-f28cl:/mnt/workspace/bert-bilstm-in-sentiment-classification# python main.py
2024-07-13 13:18:25.494250: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-13 13:18:25.933228: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-13 13:18:27.151707: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at ./model/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Missing logger folder: /mnt/workspace/bert-bilstm-in-sentiment-classification/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
-----------------------------------------------
0 | model | BiLSTMClassifier | 125 M
1 | criterion | CrossEntropyLoss | 0
-----------------------------------------------
23.6 M Trainable params
102 M Non-trainable params
125 M Total params
503.559 Total estimated model params size (MB)
Epoch 4: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4506/4506 [13:41<00:00, 5.48it/s, loss=0.52, v_num=0, train_loss_step=0.654, train_loss_epoch=0.566]`Trainer.fit` stopped: `max_epochs=5` reached.
Epoch 4: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4506/4506 [13:42<00:00, 5.48it/s, loss=0.52, v_num=0, train_loss_step=0.654, train_loss_epoch=0.566]
root@dsw-398300-bf64cb7b7-f28cl:/mnt/workspace/bert-bilstm-in-sentiment-classification# python main.py
2024-07-13 15:10:59.831283: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-13 15:10:59.868774: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-13 15:11:00.420343: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at ./model/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
-----------------------------------------------
0 | model | BiLSTMClassifier | 125 M
1 | criterion | CrossEntropyLoss | 0
-----------------------------------------------
23.6 M Trainable params
102 M Non-trainable params
125 M Total params
503.559 Total estimated model params size (MB)
Epoch 4: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4506/4506 [13:41<00:00, 5.48it/s, loss=0.544, v_num=1, train_loss_step=0.435, train_loss_epoch=0.568]`Trainer.fit` stopped: `max_epochs=5` reached.
Epoch 4: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4506/4506 [13:41<00:00, 5.48it/s, loss=0.544, v_num=1, train_loss_step=0.435, train_loss_epoch=0.568]
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1764: PossibleUserWarning: GPU available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='gpu', devices=1)`.
rank_zero_warn(
Testing DataLoader 0: 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 690/1127 [46:08<29:13, 4.01s/it]
Testing DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1127/1127 [1:17:28<00:00, 4.12s/it]
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Test metric ┃ DataLoader 0 ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ acc │ 0.7424814105033875 │
│ avg_f1 │ 0.7335338592529297 │
│ avg_precision │ 0.7671085596084595 │
│ avg_recall │ 0.7424814105033875 │
│ f1_class0 │ 0.5128293633460999 │
│ f1_class1 │ 0.7843108177185059 │
│ f1_class2 │ 0.6716415286064148 │
│ loss │ 0.5851244330406189 │
│ precision_class0 │ 0.637470543384552 │
│ precision_class1 │ 0.7558891773223877 │
│ precision_class2 │ 0.7077022790908813 │
│ recall_class0 │ 0.4741062521934509 │
│ recall_class1 │ 0.8350556492805481 │
│ recall_class2 │ 0.6949878334999084 │
└───────────────────────────┴───────────────────────────┘
[{'loss': 0.5851244330406189, 'recall_class0': 0.4741062521934509, 'recall_class1': 0.8350556492805481, 'recall_class2': 0.6949878334999084, 'precision_class0': 0.637470543384552, 'precision_class1': 0.7558891773223877, 'precision_class2': 0.7077022790908813, 'f1_class0': 0.5128293633460999, 'f1_class1': 0.7843108177185059, 'f1_class2': 0.6716415286064148, 'acc': 0.7424814105033875, 'avg_recall': 0.7424814105033875, 'avg_precision': 0.7671085596084595, 'avg_f1': 0.7335338592529297}]
root@dsw-398300-bf64cb7b7-f28cl:/mnt/workspace/bert-bilstm-in-sentiment-classification#
References:
[1] Implementing BERT+BiLSTM 3-class sentiment classification with Hugging Face (with dataset and source code), CSDN blog
[2] https://huggingface.co/google-bert/bert-base-chinese/tree/main
[3] [NLP in practice] Sentiment classification based on BERT and bidirectional LSTM (Part 1), CSDN blog
[4] [NLP in practice] Sentiment classification based on BERT and bidirectional LSTM (Part 2), CSDN blog
[5] https://github.com/iceissey/issey_Kaggle/tree/main/Bert_BiLSTM
[6] https://www.kaggle.com/code/isseyice/sentiment-classification-based-on-bert-and-lstm#Part-2:-Training-and-Evaluating-the-Model
[7] https://www.kaggle.com/datasets/liangqingyuan/chinese-text-multi-classification?resource=download
[8] LUGE (千言): a comprehensive collection of open-source Chinese datasets