bert 相似度任务训练，简单版本

news2025/7/13 1:56:29

任务

代码

train.py

predit.py

数据

任务

使用 bert-base-chinese 训练相似度任务，参考：微调BERT模型实现相似性判断 - 知乎

参考了上面这篇文章的代码，作者使用的是 BertForNextSentencePrediction 模型。BertForNextSentencePrediction 原本是为“下一句预测”（NSP）任务设计的：在 BERT 的原始预训练中，模型会接收一对句子，并预测第二个句子是否紧跟在第一个句子之后。因此使用这个模型时，标签（label）只能取 0 或 1，相当于一个二分类任务。

但其实在相似度任务中，我们每一条数据都是【text1\ttext2\tlabel】的形式，其中 label 代表相似度，可以给两个文本打分表示相似度，也可以映射为分类任务，0 代表不相似，1 代表相似，他这篇文章利用了这种思想，对新手还挺有用的。

现在我搞了一个招聘数据，里面有办公区域列，处理过了，每一行代表【地址1\t地址2\t相似度】

只要两个文本中有一个地址相似，我就把这对文本视为相似，标签为 1，否则标签为 0。

利用这份数据进行微调；训练过程中没有使用验证集，只在训练结束后用测试集看看效果。

代码

train.py

import json
import torch
from transformers import BertTokenizer, BertForNextSentencePrediction
from torch.utils.data import DataLoader, Dataset


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training hyperparameters.
# NOTE: `bacth_size` is misspelt, but the name is kept because the training
# code refers to it.
bacth_size = 32  # samples per batch fed to the DataLoader
epoch = 3  # number of passes over the training set
auto_save_batch = 5000  # save a checkpoint every this many batches
learning_rate = 2e-5  # learning rate passed to the optimizer


# Prepare the dataset
class MyDataset(Dataset):
    """Sentence-pair dataset read from a tab-separated text file.

    Every non-blank line of the file must have the form
    ``text1<TAB>text2<TAB>label`` where ``label`` is an integer.

    Attributes:
        texts: list of ``(text1, text2)`` tuples, one per sample.
        labels: list of integer labels, parallel to ``texts``.
        tokenizer: the ``BertTokenizer`` used to encode each pair.
    """

    def __init__(self, data_file_paths):
        """Load and parse the data file.

        Args:
            data_file_paths: path of the UTF-8 encoded data file.

        Raises:
            ValueError: if a non-blank line does not have exactly three
                tab-separated fields, or its label is not an integer.
        """
        self.texts = []
        self.labels = []
        # Use the tokenizer that ships with the local pretrained checkpoint.
        self.tokenizer = BertTokenizer.from_pretrained('../bert-base-chinese')
        with open(data_file_paths, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                # Blank lines (e.g. a trailing empty line) carry no sample.
                if not line.strip():
                    continue
                text1, text2, label = self._parse_line(line, line_no)
                self.texts.append((text1, text2))
                self.labels.append(label)

    @staticmethod
    def _parse_line(line, line_no=0):
        """Split one data line into ``(text1, text2, int_label)``.

        Args:
            line: raw line from the data file (may end with a newline).
            line_no: 1-based line number, used only in error messages.

        Raises:
            ValueError: if the line is malformed.
        """
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) != 3:
            raise ValueError(
                f"line {line_no}: expected 3 tab-separated fields, "
                f"got {len(fields)}"
            )
        text1, text2, label = fields
        try:
            return text1, text2, int(label)
        except ValueError as err:
            raise ValueError(
                f"line {line_no}: label {label!r} is not an integer"
            ) from err

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(encoded_text, label)`` for the sample at ``idx``.

        The pair is tokenized on access, padded/truncated to 128 tokens and
        returned as PyTorch tensors (``return_tensors='pt'``).
        """
        text1, text2 = self.texts[idx]
        label = self.labels[idx]
        encoded_text = self.tokenizer(
            text1,
            text2,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt',
        )
        return encoded_text, label


# Path of the training data file
train_dataset = MyDataset('../data/train.txt')

# Define the model.
# BertForNextSentencePrediction has a fixed two-way classification head, so
# the labels must be 0 or 1; `num_labels` is not passed because it does not
# change the size of that head.
model = BertForNextSentencePrediction.from_pretrained('../bert-base-chinese')
model.to(device)
# from_pretrained() returns the model in evaluation mode; switch to training
# mode so that dropout is active while fine-tuning.
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
train_loader = DataLoader(train_dataset, batch_size=bacth_size, shuffle=True)
trained_data = 0  # samples seen in the current epoch
batch_after_last_save = 0  # batches processed since the last checkpoint
total_batch = 0  # batches processed over the whole run
total_epoch = 0  # epochs completed so far

# The loop variable is deliberately not named `epoch`, so the `epoch`
# hyperparameter is not overwritten.
for _ in range(epoch):
    trained_data = 0
    for batch in train_loader:
        inputs, labels = batch
        # The dataset tokenizes each sample with return_tensors='pt', so the
        # collated tensors come out as (batch_size, 1, 128); drop the extra
        # middle dimension.
        for key in ('input_ids', 'token_type_ids', 'attention_mask'):
            inputs[key] = inputs[key].squeeze(1)
        # Move the batch to the training device
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        trained_data += len(labels)
        trained_process = float(trained_data) / len(train_dataset)
        batch_after_last_save += 1
        total_batch += 1
        # Save a checkpoint every `auto_save_batch` batches
        if batch_after_last_save >= auto_save_batch:
            batch_after_last_save = 0
            model.save_pretrained(f'../output/cn_equal_model_{total_epoch}_{total_batch}.pth')
            print("保存模型：cn_equal_model_{}_{}.pth".format(total_epoch, total_batch))
        print("训练进度：{:.2f}%, loss={:.4f}".format(trained_process * 100, loss.item()))
    total_epoch += 1
    # Always save a checkpoint at the end of each epoch
    model.save_pretrained(f'../output/cn_equal_model_{total_epoch}_{total_batch}.pth')
    print("保存模型：cn_equal_model_{}_{}.pth".format(total_epoch, total_batch))

训练完成后，输出目录下会生成每次保存的模型文件夹；最后输出的那个文件夹训练得最充分，一般效果最好（本文没有使用验证集，所以直接用它来做预测）：

predit.py

import torch
from transformers import BertTokenizer, BertForNextSentencePrediction


# Load the tokenizer and the fine-tuned checkpoint written by the training script.
tokenizer = BertTokenizer.from_pretrained('../bert-base-chinese')
model = BertForNextSentencePrediction.from_pretrained('../output/cn_equal_model_3_171.pth')

# Read the test samples up front; blank lines carry no sample and are skipped
# so they neither crash the unpacking below nor distort the accuracy.
with open('../data/test.txt', 'r', encoding='utf8') as f:
    lines = [line for line in f if line.strip()]

correct = 0
# Inference only: no gradients are needed.
with torch.no_grad():
    for i, line in enumerate(lines):
        # Each line has the form "text1<TAB>text2<TAB>label".
        text1, text2, label = line.rstrip('\r\n').split('\t')
        label = label.strip()
        encoded_text = tokenizer(text1, text2, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        outputs = model(**encoded_text)
        # The predicted class is the index of the largest logit.
        res = torch.argmax(outputs.logits, dim=1).item()
        print(text1, text2, label, res)
        if str(res) == label:
            correct += 1
        print(f'{i + 1}/{len(lines)}')

# Guard against an empty test file to avoid a division by zero.
accuracy = correct / len(lines) if lines else 0.0
print(f'acc:{accuracy}')