Preface
I've recently started following LLM-related topics, but I don't want the skills from the earlier NLP paradigm to go to waste.
This exercise suits me well. Thanks to the organizers for the topic selection; below is a quick pass through all of the check-in tasks.
Check-in Log
Task 1: Register for the competition, download the dataset, and load it
Competition link: https://challenge.xfyun.cn/topic/info?type=text-detector&ch=vWxQGFU
After registering and downloading the data locally, read the CSV files with pandas:
import pandas as pd
def load_data(data_dir='../data/'):
    '''Load the train/test CSVs extracted from "ChatGPT生成文本检测器公开数据.zip".
    '''
train = pd.read_csv(data_dir + 'train.csv',
sep=',', names=['name', 'label', 'content'])
test = pd.read_csv(data_dir + 'test.csv',
sep=',', names=['name', 'content'])
return train, test
train, test = load_data()
Printing the data in the notebook shows that the text has already been anonymized by mapping it into sequences of token ids. This means we cannot directly fine-tune an off-the-shelf pretrained model or embedding later on; we will need to pretrain on this corpus ourselves.
train
 | name | label | content |
---|---|---|---|
0 | name | label | content |
1 | 1 | 0 | [4509 3181 1253 2278 290 3562 2051 599 3125 ... |
2 | 2 | 1 | [ 263 1325 2563 4160 2196 169 3125 2563 2619 ... |
3 | 3 | 0 | [3635 177 3125 1251 3839 5212 2109 1171 1194 ... |
4 | 4 | 1 | [3037 266 246 3547 1253 2278 3125 649 697 ... |
... | ... | ... | ... |
13996 | 13996 | 0 | [5212 1759 1953 139 1953 3180 3187 5212 3414 ... |
13997 | 13997 | 1 | [ 699 778 2777 1333 224 3543 998 139 4411 ... |
13998 | 13998 | 1 | [ 506 211 139 3333 3293 286 4358 272 5212 ... |
13999 | 13999 | 1 | [1583 169 123 2969 998 5212 1759 266 1435 ... |
14000 | 14000 | 0 | [1759 266 4399 205 5212 1759 266 4399 205 ... |
14001 rows × 3 columns
test
 | name | content |
---|---|---|
0 | name | content |
1 | 14001 | [3125 2196 286 123 1539 1759 266 3549 649 ... |
2 | 14002 | [1109 2113 3122 213 3125 1294 5212 2338 2233 ... |
3 | 14003 | [ 236 3125 139 3037 5212 4294 1600 4550 3169 ... |
4 | 14004 | [ 13 13 13 0 0 0 245 1472 3125 ... |
... | ... | ... |
9996 | 23996 | [ 430 4829 302 3447 5212 2081 2176 1146 2321 ... |
9997 | 23997 | [ 0 109 3324 532 1294 199 1370 2176 1105 ... |
9998 | 23998 | [3406 1093 5212 3036 3635 3125 3406 1296 5212 ... |
9999 | 23999 | [ 0 0 3799 248 184 5195 4399 205 434 ... |
10000 | 24000 | [2471 4553 262 1759 2579 4555 1560 4549 3125 ... |
10001 rows × 2 columns
Task 2: Visualize the dataset's characters and examine the label and character distributions
For the train and test sets, compute some basic statistics and check whether the feature distributions are consistent.
Since the data is text, the main feature here is the distribution of text lengths.
import pandas as pd

def plot_data_len_distribution(data_lens,
                               bins=None,
                               columns=['count per bin']):
    """Plot bar and pie charts of a value (or binned interval) distribution."""
    if bins:
        cats = pd.cut(data_lens, bins)
        value_counts = pd.Series(cats).value_counts(sort=False)
    else:
        value_counts = pd.Series(data_lens).value_counts(sort=False)
    df = pd.DataFrame({columns[0]: value_counts})
    df.plot(kind='bar')
    df.plot.pie(subplots=True, autopct="%.2f%%")
train = train[1:]  # the first row repeats the header, drop it
test = test[1:]
train['label'] = train['label'].apply(lambda x: int(x))
# note: the test set has no label column, so there is nothing to convert there
train['content_len'] = train['content'].apply(lambda x: len(x[1:-1].split()))
test['content_len'] = test['content'].apply(lambda x: len(x[1:-1].split()))
plot_data_len_distribution(train['label'].tolist(),
                           columns=['label count'])
plot_data_len_distribution(train['content_len'].tolist(),
                           bins=[180, 200, 230, 250, 260, 270, 275, 280, 290, 300, 320, 350, 360, 500, 1000],
                           columns=['token count per bin'])
plot_data_len_distribution(test['content_len'].tolist(),
                           bins=[180, 200, 230, 250, 260, 270, 275, 280, 290, 300, 320, 350, 360, 500, 1000],
                           columns=['token count per bin'])
Label distribution in the training data: the ratio of label 0 to label 1 is roughly 6:1, as shown below.
The distribution of text lengths in the training data is shown below.
The text-length distribution of the test data matches the training set; lengths are all around 200 tokens.
Two main takeaways:
- the positive-to-negative sample ratio in the training set is roughly 1:6
- text lengths in both the training and test sets are all around 200 tokens (a quick numeric check is sketched below)
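Both observations can also be verified numerically. A minimal sketch, assuming the train/test DataFrames and the content_len column created above:
# label balance: roughly 6:1 for labels 0 vs 1
print(train['label'].value_counts())
# length statistics; the test distribution should match the training one
print(train['content_len'].describe())
print(test['content_len'].describe())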
Task 3: Extract text features with TF-IDF
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
train['content'] = train['content'].apply(lambda x: x[1:-1])
test['content'] = test['content'].apply(lambda x: x[1:-1])
train['label'] = train['label'].apply(lambda x: int(x))
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf = TfidfVectorizer(token_pattern=r'\S+|\n',  # tokens are already mapped to ids, so treat every whitespace-separated id as one token
                        analyzer='word',
                        ngram_range=(1, 3),
                        max_features=2000)
train_tfidf = tfidf.fit_transform(train['content'])
test_tfidf = tfidf.transform(test['content'])  # fit on train only, then transform test with the same vocabulary
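A quick, optional sanity check on the fitted vectorizer (get_feature_names_out requires scikit-learn 1.0+; older versions use get_feature_names instead):
print(len(tfidf.get_feature_names_out()))   # number of retained n-gram features (<= max_features)
print(train_tfidf.shape, test_tfidf.shape)  # sparse matrices of shape (n_samples, n_features)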
Task 4: Train and predict with TF-IDF features and a linear model
To compare different modeling choices, I split a further 10% off the training set as a held-out evaluation set.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
tfidf = TfidfVectorizer(token_pattern=r'\S+|\n',
                        analyzer='word',
                        ngram_range=(1, 3),  # n-gram range; tune for the best value
                        max_features=500)    # tune for the best value
# further split the training set into train/validation subsets
train_more, test_more = train_test_split(train,
                                         test_size=0.1,
                                         random_state=43)
train_more_tfidf = tfidf.fit_transform(train_more['content'])
test_more_tfidf = tfidf.transform(test_more['content'])  # transform only; do not refit on the held-out split
# Train a scikit-learn linear model (logistic regression) and predict on the held-out split.
lr_model = LogisticRegression()
lr_model.fit(
train_more_tfidf,
train_more['label']
)
test_more['predict'] = lr_model.predict(test_more_tfidf).astype(int)
# Evaluate the model: accuracy, precision, recall, F1, AUC.
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
pre = precision_score(test_more['label'], test_more['predict'])
acc = accuracy_score(test_more['label'], test_more['predict'])
rec = recall_score(test_more['label'], test_more['predict'])
f1 = f1_score(test_more['label'], test_more['predict'])
fpr, tpr, _ = roc_curve(test_more['label'], test_more['predict'])
auc_score = auc(fpr, tpr)  # avoid shadowing sklearn's auc function
print(" pre: %.2f\n acc: %.2f\n rec: %.2f\n f1: %.2f\n auc: %.2f" % (pre, acc, rec, f1, auc_score))
pre: 0.95
acc: 0.96
rec: 0.79
f1: 0.86
auc: 0.89
TF-IDF + LR submission
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(token_pattern=r'\S+|\n',
analyzer='word',
ngram_range=(1, 3),
max_features=500)
train_tfidf = tfidf.fit_transform(train['content'])
test_tfidf = tfidf.transform(test['content'])  # transform the test set with the vocabulary fitted on train
lr_model = LogisticRegression()
lr_model.fit(
train_tfidf,
train['label']
)
submit = pd.read_csv('./data/sample_submit.csv')
submit = submit.sort_values(by='name')
submit['label'] = lr_model.predict(test_tfidf).astype(int)
submit.to_csv('lr.csv', index=None)  # leaderboard score roughly 0.9223
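As an aside, the vectorizer and classifier can be wrapped in a single Pipeline and cross-validated, which keeps the TF-IDF fit inside each training fold and gives a more stable estimate than a single 10% split. A hedged sketch (standard scikit-learn usage, not part of the original solution):
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

pipe = make_pipeline(
    TfidfVectorizer(token_pattern=r'\S+|\n', analyzer='word',
                    ngram_range=(1, 3), max_features=500),
    LogisticRegression(max_iter=1000),
)
# 5-fold cross-validated F1 on the training data
scores = cross_val_score(pipe, train['content'], train['label'], cv=5, scoring='f1')
print(scores.mean(), scores.std())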
Task 5: Train and predict with TF-IDF features and XGBoost
Compared with Task 4, this simply swaps LR for XGBoost. It brings some improvement, though XGBoost has more hyperparameters to tune.
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(token_pattern=r'\S+|\n',
                        analyzer='word',
                        ngram_range=(1, 3),  # n-gram range; tune for the best value
                        max_features=500)    # tune for the best value
train['label'] = train['label'].apply(lambda x: int(x))
# further split the training set into train/validation subsets
train_more, test_more = train_test_split(train,
                                         test_size=0.1,
                                         random_state=43)
train_more_tfidf = tfidf.fit_transform(train_more['content'])
test_more_tfidf = tfidf.transform(test_more['content'])  # transform only; do not refit on the held-out split
# Train an XGBoost classifier and predict on the held-out split.
xgb_model = XGBClassifier(max_depth=8,
learning_rate=0.01,
n_estimators=2000,
objective='binary:logistic',
booster='gbtree',
n_jobs=-1,
nthread=None,
gamma=0.3,
min_child_weight=1,
max_delta_step=0,
subsample=0.8,
colsample_bytree=0.8,
colsample_bylevel=0.8,
reg_alpha=0.5,
reg_lambda=0.5,
                          scale_pos_weight=6,  # negative:positive is roughly 6:1, so up-weight the positive class
                          base_score=0.15,     # prior positive rate, roughly 2000/14000 ≈ 0.14
random_state=233,
seed=None)
xgb_model.fit(
train_more_tfidf,
train_more['label']
)
test_more['predict'] = xgb_model.predict(test_more_tfidf).astype(int)
# Evaluate the model: accuracy, precision, recall, F1, AUC.
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
pre = precision_score(test_more['label'], test_more['predict'])
acc = accuracy_score(test_more['label'], test_more['predict'])
rec = recall_score(test_more['label'], test_more['predict'])
f1 = f1_score(test_more['label'], test_more['predict'])
fpr, tpr, _ = roc_curve(test_more['label'], test_more['predict'])
auc_score = auc(fpr, tpr)  # avoid shadowing sklearn's auc function
print(" pre: %.2f\n acc: %.2f\n rec: %.2f\n f1: %.2f\n auc: %.2f" % (pre, acc, rec, f1, auc_score))
pre: 0.95
acc: 0.94
rec: 0.65
f1: 0.77
auc: 0.82
TF-IDF + XGBoost submission
from xgboost import XGBClassifier
tfidf = TfidfVectorizer(token_pattern=r'\S+|\n',
analyzer='word',
ngram_range=(1, 3),
max_features=500)
train_tfidf = tfidf.fit_transform(train['content'])
test_tfidf = tfidf.transform(test['content'])  # transform the test set with the vocabulary fitted on train
# Train XGBoost on the full training set and predict on the competition test set.
xgb_model = XGBClassifier(max_depth=8,
learning_rate=0.01,
n_estimators=2000,
objective='binary:logistic',
booster='gbtree',
n_jobs=-1,
nthread=None,
gamma=0.3,
min_child_weight=1,
max_delta_step=0,
subsample=0.8,
colsample_bytree=0.8,
colsample_bylevel=0.8,
reg_alpha=0.5,
reg_lambda=0.5,
                          scale_pos_weight=6,  # negative:positive is roughly 6:1, so up-weight the positive class
                          base_score=0.15,     # prior positive rate, roughly 2000/14000 ≈ 0.14
random_state=233,
seed=None)
xgb_model.fit(
train_tfidf,
train['label']
)
submit = pd.read_csv('./data/sample_submit.csv')
submit = submit.sort_values(by='name')
submit['label'] = xgb_model.predict(test_tfidf).astype(int)
submit.to_csv('xgb.csv', index=None)  # leaderboard score roughly 0.9737
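One possible refinement (a sketch, not part of the submitted solution): hold out part of the training data and let early stopping pick the number of trees instead of hand-tuning n_estimators. Depending on your xgboost version, early_stopping_rounds goes to the constructor (>=1.6) or to fit():
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

X_tr, X_va, y_tr, y_va = train_test_split(train_tfidf, train['label'],
                                           test_size=0.1, random_state=43)
es_model = XGBClassifier(max_depth=8, learning_rate=0.05, n_estimators=2000,
                         subsample=0.8, colsample_bytree=0.8,
                         scale_pos_weight=6, eval_metric='auc',
                         early_stopping_rounds=50)
es_model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
print(es_model.best_iteration)  # number of boosting rounds actually needed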
Task 6: Learn to train FastText and Word2Vec word vectors
This step walks through training word vectors with gensim's word2vec.
# Reference: https://radimrehurek.com/gensim/models/word2vec.html
# Prepare the corpus from this competition's data
train, test = load_data()
train = train[1:]  # the first row holds the column names
test = test[1:]
match_texts = []
for data in train["content"].tolist():
    for line in data[1:-1].split("\n"):
        match_texts.append(line.split())  # split() drops the empty strings left by repeated spaces
for data in test["content"].tolist():
    for line in data[1:-1].split("\n"):
        match_texts.append(line.split())
print("The competition corpus contains %d sentences in total" % len(match_texts))
# Use gensim's FastText class with the desired parameters (vector size, window, epochs, etc.) to build an embedding model; training it would follow the same build_vocab()/train() pattern as Word2Vec below.
from gensim.models import FastText
fasttext_model = FastText(vector_size=100,
window=5,
epochs=100,
min_count=2)
# Use gensim's Word2Vec class with the desired parameters (vector size, window, epochs, etc.) to build the embedding model.
from gensim.models.word2vec import Word2Vec
w2v_model = Word2Vec(vector_size=100,  # embedding dimension
                     window=5,         # context window size
                     epochs=100,       # number of training epochs
                     min_count=2,      # minimum word frequency
                     sg=1,             # 1 = skip-gram, 0 = CBOW
                     workers=8)
# Build the vocabulary with Word2Vec's build_vocab()
w2v_model.build_vocab(corpus_iterable=match_texts)
# Train the embeddings with Word2Vec's train()
w2v_model.train(corpus_iterable=match_texts,
                total_examples=len(match_texts),
                epochs=100)
w2v_model.save("../tmp/match_w2v_model.bin")
w2v_embed = w2v_model.wv
w2v_embed.save("../tmp/match_w2v_embed.bin")
import gensim
import numpy as np
import pandas as pd

def add_special_token(embed_path="", save_path=""):
    print("Loading [%s] embedding." % (embed_path))
    wv_from_text = gensim.models.KeyedVectors.load(embed_path)
    vocab = wv_from_text.key_to_index
    print("Vocabulary Size: %s" % len(vocab.keys()))
    word_vocab = dict()
    word_vocab["PAD"] = 0
    word_vocab["UNK"] = 1
    for key in vocab.keys():
        word_vocab[key] = len(word_vocab.keys())
    pd.to_pickle(word_vocab, save_path + "vocab.pkl")
    word_embed = wv_from_text.vectors
    embed_dimen = word_embed.shape[-1]
    unk_embed = np.random.randn(1, embed_dimen)
    pad_embed = np.zeros(shape=(1, embed_dimen), dtype=float)  # np.float was removed in recent NumPy versions
    extral_embed = np.concatenate((pad_embed, unk_embed), axis=0)
    word_embed = np.concatenate((extral_embed, word_embed), axis=0)
    np.save(save_path + "match_embed.npy", word_embed)
    print("Saved embedding with [PAD] and [UNK] rows added, shape: {}".format(word_embed.shape))
    word2idx = {word: idx for idx, word in enumerate(word_vocab)}
    idx2word = {idx: word for idx, word in enumerate(word_vocab)}
    pd.to_pickle(word2idx, save_path + "word2idx.pkl")
    pd.to_pickle(idx2word, save_path + "idx2word.pkl")
add_special_token("../tmp/match_w2v_embed.bin", "../tmp/")
Loading [../tmp/match_w2v_embed.bin] embedding.
Vocabulary Size: 4718
Saved embedding with [PAD] and [UNK] rows added, shape: (4720, 100)
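A small consistency check on the saved artifacts (a sketch; the paths follow add_special_token above, and the first training row is used only as an example):
import numpy as np
import pandas as pd

vocab = pd.read_pickle("../tmp/vocab.pkl")
embed = np.load("../tmp/match_embed.npy")
assert len(vocab) == embed.shape[0]           # PAD/UNK rows are included
sample = train['content'].iloc[0][1:-1].split()
ids = [vocab.get(tok, vocab["UNK"]) for tok in sample[:10]]
print(ids)                                    # row indices into the embedding matrix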
Tasks 7, 8, 11: Shared Training Module
For Task 7 (TextCNN), Task 8 (BiLSTM), and Task 11 (BERT fine-tuning) I use PyTorch Lightning to separate the model encoders from the training logic, so the three models share one training module. The implementation is as follows:
#%%writefile ../src/pl_train_model.py
import torch
import transformers
import pytorch_lightning as pl
torch.set_default_tensor_type(torch.DoubleTensor)
import torch.nn.functional as F
from torchmetrics import Precision
from torchmetrics import Accuracy
from torchmetrics import F1Score
from torchmetrics import AUROC
from model_network import TextCNN
from model_network import BiLSTM
from model_network import Bert
from model_network import roberta_base_AdamW_LLRD
class ClassifyTrainModel(pl.LightningModule):
def __init__(self, hparams={}):
super().__init__()
        self.save_hyperparameters(hparams)  # store the hyperparameters on the module; they are available later via self.hparams
if self.hparams.encode_model == "TextCNN":
self.encode = TextCNN(embed_path="../tmp/match_embed.npy")
elif self.hparams.encode_model == "BiLSTM":
self.encode = BiLSTM(embed_path="../tmp/match_embed.npy")
        elif self.hparams.encode_model == "Bert":
            self.encode = Bert(model_name=self.hparams.pretrain_model,
                               freeze_layers=self.hparams.freeze_layers,
                               output_dim=self.hparams.output_dim)
else:
print("Error")
self.epochs = self.hparams.epochs
self.num_steps = self.hparams.num_steps
self.learning_rate = self.hparams.learning_rate
        self.loss = torch.nn.CrossEntropyLoss(weight=torch.DoubleTensor([1.0, 2.0]))  # positive:negative is roughly 1:6 in the training set, so weight the positive class higher
self.metric_pre = Precision(task="binary", num_classes=self.hparams.output_dim)#.to(device)
self.metric_acc = Accuracy(task="binary", num_classes=self.hparams.output_dim)#.to(device)
self.metric_f1 = F1Score(task="binary", num_classes=self.hparams.output_dim)#.to(device)
self.metric_auc = AUROC(task="binary", num_classes=self.hparams.output_dim)#.to(device)
def forward(self, inputs):
if self.hparams.encode_model == "TextCNN":
return self.encode(inputs["input_ids"])
elif self.hparams.encode_model == "BiLSTM":
return self.encode(inputs["input_ids"])
        elif self.hparams.encode_model == "Bert":
            return self.encode(inputs["input_ids"],
                               inputs["token_type_ids"],
                               inputs["attention_mask"])
def configure_optimizers(self):
        optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad,  # optimize only the trainable parameters
                                             self.parameters()),
                                      lr=self.learning_rate)
scheduler = transformers.get_scheduler(
self.hparams.lr_scheduler_type,
optimizer = optimizer,
num_warmup_steps = int(self.num_steps*0.1),
num_training_steps = self.num_steps
)
optim_dict = {"optimizer": optimizer,
"lr_scheduler": scheduler}
return optim_dict
def training_step(self, batch, batch_idx):
logits = self(batch)
labels = batch["target"]
if isinstance(labels, list):
labels = torch.tensor(labels)
loss = self.loss(logits, labels)
#loss = torch.nan_to_num(loss)
self.log("train_step_loss", loss, on_step=True, on_epoch=False,
prog_bar=True, logger=True, sync_dist=True)
return {"loss": loss}
    def validation_step(self, batch, batch_idx):  # PyTorch Lightning expects this hook name
logits = self(batch)
labels = batch["target"]
if isinstance(labels, list):
labels = torch.tensor(labels)
loss = self.loss(logits, labels)
loss = torch.nan_to_num(loss)
preds = torch.argmax(logits, dim=1)
return {"loss": loss.detach(),
"preds": preds.detach(),
"labels": labels}
    def validation_epoch_end(self, outputs):  # PyTorch Lightning expects this hook name
loss = 0.0
preds, labels = [], []
for output in outputs:
loss += output["loss"]
preds.append(output["preds"])
labels.append(output["labels"])
        valid_loss = loss / len(outputs)
preds = torch.cat(preds)
labels = torch.cat(labels)
valid_pre = self.metric_pre(preds, labels)
valid_acc = self.metric_acc(preds, labels)
valid_f1 = self.metric_f1(preds, labels)
valid_auc = self.metric_auc(preds, labels)
metrics = {"valid_loss": valid_loss, "valid_pre": valid_pre,
"valid_acc": valid_acc, "valid_f1": valid_f1,
"valid_auc": valid_auc}
self.log_dict(metrics, on_step=False, on_epoch=True,
prog_bar=True, logger=True, sync_dist=True)
return metrics
def test_step(self, batch, batch_idx):
logits = self(batch)
labels = batch["target"]
if isinstance(labels, list):
labels = torch.tensor(labels)
loss = self.loss(logits, labels)
preds = torch.argmax(logits, dim=1)
return {"loss": loss.detach(),
"preds": preds.detach(),
"labels": labels}
def test_epoch_end(self, outputs):
loss = 0.0
preds, labels = [], []
for output in outputs:
loss += output["loss"]
preds.append(output["preds"])
labels.append(output["labels"])
loss /= len(outputs)
preds = torch.cat(preds)
labels = torch.cat(labels)
test_pre = self.metric_pre(preds, labels)
test_acc = self.metric_acc(preds, labels)
test_f1 = self.metric_f1(preds, labels)
test_auc = self.metric_auc(preds, labels)
metrics = {"test_loss": loss, "test_pre": test_pre,
"test_acc": test_acc, "test_f1": test_f1,
"test_auc": test_auc}
self.log_dict(metrics, on_step=False, on_epoch=True,
prog_bar=True, logger=True, sync_dist=True)
return metrics
def predict_step(self, batch, batch_idx):
logits = self(batch)
preds = torch.argmax(logits, dim=1)
return preds.detach()
    @staticmethod  # no instance attributes are needed here, so a static method is sufficient
def add_model_args(parent_parser):
parent_parser.add_argument("--output_dim", type=int, default=128)
parent_parser.add_argument("--num_step", type=int, default=-1)
parent_parser.add_argument("--learning_rate", type=float, default=0.0004365)
parent_parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
parent_parser.add_argument("--freeze_layers", type=list, default=[])
parent_parser.add_argument("--pretrain_model", type=str, default="hfl/chinese-roberta-wwm-ext")
return parent_parser
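The static helper can be wired straight into an argparse-based launcher. A minimal sketch, assuming it runs in the same session where ClassifyTrainModel is defined; the argument values are purely illustrative:
import argparse

parser = argparse.ArgumentParser()
parser = ClassifyTrainModel.add_model_args(parser)
# parse an illustrative argument list instead of sys.argv
args = parser.parse_args(["--learning_rate", "1e-4"])
print(args.learning_rate, args.lr_scheduler_type)  # 0.0001 cosine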
For these three tasks, TextCNN and BiLSTM take inputs mapped through the pretrained word vectors, while BERT is tokenized with the tokenizer that ships with the transformers library.
So only two Dataset classes are needed to handle the data; the logic is as follows:
%%writefile ../src/model_dataset.py
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
class EmbedDataset(Dataset):
    def __init__(self, vocab_path, df, max_len=512,
                 padding_index=0, unk_index=1):
        self.padding_index = padding_index
        self.unk_index = unk_index
self.vocab = pd.read_pickle(vocab_path)
self.df = df
self.comment_text = self.df.content
self.max_len = max_len
if "label" in self.df:
self.targets = self.df['label'].tolist()
def __len__(self):
return len(self.comment_text)
    def get_id(self, word):
        # map out-of-vocabulary words to the UNK index
        return self.vocab.get(word, self.unk_index)
    def __getitem__(self, index):
        text = self.comment_text[index].strip("[]").split()  # drop the surrounding brackets of the raw content string
        text = text[:self.max_len]
        input_ids = list(map(lambda x: self.get_id(x), text))
        if len(input_ids) < self.max_len:
            input_ids += [self.padding_index] * (self.max_len - len(input_ids))  # pad up to max_len
        ans = {"input_ids": torch.tensor(input_ids, dtype=torch.long)}  # tensor so the default collate builds a [batch, max_len] LongTensor
        if "label" in self.df:
            ans["target"] = self.targets[index]
        return ans
class BertDataset(Dataset):
def __init__(self, df, tokenizer, max_len=512):
self.tokenizer = tokenizer
self.df = df
self.comment_text = self.df.content
self.max_len = max_len
if "label" in self.df:
self.targets = self.df['label'].tolist()
def __len__(self):
return len(self.comment_text)
def __getitem__(self, index):
comment_text = self.comment_text[index]
encode_dict = self.tokenizer.encode_plus(
comment_text,
add_special_tokens=True, # add token [CLS], [SEP]
truncation=True, # whether to truncate long sentences
max_length=self.max_len, # Maximum sentence length
padding="max_length", # Insufficient length Fill to maximum lengt
return_attention_mask=True, # Whether to return attention mask
return_tensors="pt", # Return tensor type
)
input_ids = encode_dict["input_ids"]
token_type_ids = encode_dict["token_type_ids"]
attention_mask = encode_dict['attention_mask']
ans = {"input_ids": input_ids.squeeze(0), # (1, max_len) -> (max_len)
"token_type_ids": token_type_ids.squeeze(0), # (1, max_len) -> (max_len)
"attention_mask": attention_mask.squeeze(0)} # (1, max_len) -> (max_len)
if "label" in self.df:
ans["target"] = self.targets[index]
return ans
Overwriting ../src/model_dataset.py
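A minimal usage check for EmbedDataset (a sketch: it assumes the vocab pickle from Task 6 and any DataFrame with a content column, here the train frame with a reset index):
from model_dataset import EmbedDataset

sample_df = train.reset_index(drop=True)
ds = EmbedDataset("../tmp/vocab.pkl", sample_df, max_len=512)
item = ds[0]
print(len(item["input_ids"]), item.get("target"))  # input padded to max_len, plus the label if present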
Task 7: Build and train a TextCNN model with the Word2Vec embeddings
With the training framework and data-handling logic written above, only the network definitions remain. The TextCNN network is coded as follows:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
class Mish(nn.Module):  # the Mish activation function
def __init__(self):
super(Mish, self).__init__()
def forward(self,x):
x = x * (torch.tanh(F.softplus(x)))
return x
class TextCNN(nn.Module):
def __init__(self, embed_path, n_filters=100,
filter_sizes=[2, 3, 4], output_dim=2, dropout=0.1, pad_idx=0):
super().__init__()
embedding = np.load(embed_path)
vocab_size, embedding_dim = embedding.shape
self.embedding = nn.Embedding(vocab_size, embedding_dim,
padding_idx=pad_idx)
self.embedding.weight.data.copy_(torch.from_numpy(embedding))
self.embedding.weight.requires_grad = False
self.convs = nn.ModuleList([nn.Conv2d(in_channels=1,
out_channels=n_filters,
kernel_size=(fs, embedding_dim))
for fs in filter_sizes])
self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
self.dropout = nn.Dropout(dropout)
self.mish = Mish()
def forward(self, inputs):
embedded = self.embedding(inputs) # [batch size, sen len, embed dim]
print(embedded.shape)
embedded = embedded.unsqueeze(1) # [batch size, 1, sen len, embed dim]
print(embedded.shape)
conved = [self.mish(conv(embedded)).squeeze(3) for conv in self.convs]
pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
cat = self.dropout(torch.cat(pooled, dim=1)) # [batch size, n_filters * len(filter_sizes)]
return self.fc(cat)
text_cnn = TextCNN(embed_path="../tmp/match_embed.npy")
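A quick shape check before wiring it into the trainer (a sketch; the dummy batch of random token ids stands in for real data, and 4720 matches the embedding rows saved in Task 6):
import torch

dummy = torch.randint(0, 4720, (4, 200))  # (batch, seq_len) of token ids
print(text_cnn(dummy).shape)              # expected: torch.Size([4, 2])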
The TextCNN training procedure is as follows:
import argparse
import torch
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_tensor_type(torch.DoubleTensor)
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from model_dataset import EmbedDataset
from pl_train_model import ClassifyTrainModel
hparams = {'encode_model': 'TextCNN',  # must match the name checked in ClassifyTrainModel
'use_swa': True,
'max_len': 512,
'output_dim': 2,
'num_works': 10,
'batch_size': 8,
'accumulate_grad_batches': 16,
'pin_memory': True,
'epochs': 10,
'num_steps': -1,
'learning_rate': 1e-5,
'auto_lr_find': False,
           'lr_scheduler_type': 'cosine',  # must be a scheduler name transformers.get_scheduler accepts
'freeze_layers': [],
'precision': 16,
'train_monitor': 'valid_f1',
'train_mode': 'max',
'save_dir': './models/textcnn_models/',
'log_dir': './logs/',
'accelerator': 'gpu',
'devices': 1}
hparams = argparse.Namespace(**hparams)
train, _ = load_data()
train = train[1:]
train['label'] = train['label'].apply(lambda x: int(x))
train_more, test_more = train_test_split(train,
test_size=0.1,
random_state=43)
train_more = train_more.reset_index()
test_more = test_more.reset_index()
train_dataset = EmbedDataset("../tmp/vocab.pkl", train_more, max_len=hparams.max_len)  # EmbedDataset takes the saved vocab path, not a tokenizer
train_dataloader = DataLoader(train_dataset,
shuffle=True,
num_workers=hparams.num_works,
batch_size=hparams.batch_size,
pin_memory=True)
hparams.num_steps = len(train_dataloader) * hparams.epochs
test_dataset = EmbedDataset("../tmp/vocab.pkl", test_more, max_len=hparams.max_len)
test_dataloader = DataLoader(test_dataset,
shuffle=True,
num_workers=hparams.num_works,
batch_size=hparams.batch_size,
pin_memory=True)
pl.seed_everything(1234)  # set a global random seed
train_model = ClassifyTrainModel(hparams)
ckpt_callback = pl.callbacks.ModelCheckpoint(
monitor=hparams.train_monitor,
dirpath=hparams.save_dir,
filename="%s-{epochs:03d}-{%s:.3f}" % (hparams.encode_model,
hparams.train_monitor),
mode=hparams.train_mode)
callbacks = [ckpt_callback]
callbacks.append(pl.callbacks.StochasticWeightAveraging(swa_lrs=0.05))
logger = TensorBoardLogger(save_dir=hparams.log_dir, name="TrainModel")
trainer = pl.Trainer.from_argparse_args(
hparams,
max_epochs=hparams.epochs,
logger=logger,
callbacks=callbacks)
print("hparams.auto_lr_find=", hparams.auto_lr_find)
if hparams.auto_lr_find:
lr_finder = trainer.tuner.lr_find(model=train_model,
train_dataloaders=train_dataloader,
min_lr=1e-08,
max_lr=1e-1,
num_training=100,
mode="exponential", # 寻找策略:指数型exponential 和线数型 linear
early_stop_threshold=4.0)
fig = lr_finder.plot(suggest=True)
fig.show()
lr = lr_finder.suggestion()
print("suggest lr=", lr)
del train_model
hparams.learning_rate = lr
    train_model = ClassifyTrainModel(hparams)  # rebuild the shared training module with the suggested learning rate
print("Search best learning_rate: [%f]" % train_model.learning_rate)
trainer.fit(train_model, train_dataloader)
print("Test Data Result:")
result = trainer.test(train_model, test_dataloader)
print("Result:", result)
Task 8: Build and train a BiLSTM model with the Word2Vec embeddings
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
class BiLSTM(nn.Module):
def __init__(self, embed_path, n_hidden=100,
output_dim=2, num_layers=2, dropout=0.1, pad_idx=0):
super().__init__()
embedding = np.load(embed_path)
vocab_size, embedding_dim = embedding.shape
self.embedding = nn.Embedding(vocab_size, embedding_dim,
padding_idx=pad_idx)
self.embedding.weight.data.copy_(torch.from_numpy(embedding))
self.embedding.weight.requires_grad = False
self.bilstm = nn.LSTM(input_size=embedding_dim, hidden_size=n_hidden,
bidirectional=True, batch_first=True, num_layers=num_layers)
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(2*n_hidden, output_dim)
def forward(self, inputs):
embed = self.embedding(inputs) # [batch size, sen len, embed dim]
print(embed.shape)
_, (h_n, _) = self.bilstm(embed) # bilstm output : [batch_size, seq_len, hidden_dim * 2]
output = torch.cat((h_n[-1], h_n[-2]), dim=-1)
dropout = self.dropout(output)
return self.fc(dropout)
bilstm = BiLSTM(embed_path="../tmp/match_embed.npy")
bilstm.embedding
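The same kind of shape check works for the BiLSTM (a sketch with a dummy batch of token ids):
import torch

dummy = torch.randint(0, 4720, (4, 200))  # (batch, seq_len) of token ids
print(bilstm(dummy).shape)                # expected: torch.Size([4, 2])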
import argparse
import torch
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_tensor_type(torch.DoubleTensor)
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from model_dataset import EmbedDataset
from pl_train_model import ClassifyTrainModel
hparams = {'encode_model': 'BiLSTM',  # must match the name checked in ClassifyTrainModel
'use_swa': True,
'max_len': 512,
'output_dim': 2,
'num_works': 10,
'batch_size': 8,
'accumulate_grad_batches': 16,
'pin_memory': True,
'epochs': 10,
'num_steps': -1,
'learning_rate': 1e-5,
'auto_lr_find': False,
           'lr_scheduler_type': 'cosine',  # must be a scheduler name transformers.get_scheduler accepts
'freeze_layers': [],
'precision': 16,
'train_monitor': 'valid_f1',
'train_mode': 'max',
           'save_dir': './models/bilstm_models/',  # keep BiLSTM checkpoints separate from the TextCNN run
'log_dir': './logs/',
'accelerator': 'gpu',
'devices': 1}
hparams = argparse.Namespace(**hparams)
train, _ = load_data()
train = train[1:]
train['label'] = train['label'].apply(lambda x: int(x))
train_more, test_more = train_test_split(train,
test_size=0.1,
random_state=43)
train_more = train_more.reset_index()
test_more = test_more.reset_index()
train_dataset = EmbedDataset("../tmp/vocab.pkl", train_more, max_len=hparams.max_len)  # EmbedDataset takes the saved vocab path, not a tokenizer
train_dataloader = DataLoader(train_dataset,
shuffle=True,
num_workers=hparams.num_works,
batch_size=hparams.batch_size,
pin_memory=True)
hparams.num_steps = len(train_dataloader) * hparams.epochs
test_dataset = EmbedDataset("../tmp/vocab.pkl", test_more, max_len=hparams.max_len)
test_dataloader = DataLoader(test_dataset,
shuffle=True,
num_workers=hparams.num_works,
batch_size=hparams.batch_size,
pin_memory=True)
pl.seed_everything(1234)  # set a global random seed
train_model = ClassifyTrainModel(hparams)
ckpt_callback = pl.callbacks.ModelCheckpoint(
monitor=hparams.train_monitor,
dirpath=hparams.save_dir,
filename="%s-{epochs:03d}-{%s:.3f}" % (hparams.encode_model,
hparams.train_monitor),
mode=hparams.train_mode)
callbacks = [ckpt_callback]
callbacks.append(pl.callbacks.StochasticWeightAveraging(swa_lrs=0.05))
logger = TensorBoardLogger(save_dir=hparams.log_dir, name="TrainModel")
trainer = pl.Trainer.from_argparse_args(
hparams,
max_epochs=hparams.epochs,
logger=logger,
callbacks=callbacks)
print("hparams.auto_lr_find=", hparams.auto_lr_find)
if hparams.auto_lr_find:
lr_finder = trainer.tuner.lr_find(model=train_model,
train_dataloaders=train_dataloader,
min_lr=1e-08,
max_lr=1e-1,
num_training=100,
mode="exponential", # 寻找策略:指数型exponential 和线数型 linear
early_stop_threshold=4.0)
fig = lr_finder.plot(suggest=True)
fig.show()
lr = lr_finder.suggestion()
print("suggest lr=", lr)
del train_model
hparams.learning_rate = lr
    train_model = ClassifyTrainModel(hparams)  # rebuild the shared training module with the suggested learning rate
print("Search best learning_rate: [%f]" % train_model.learning_rate)
trainer.fit(train_model, train_dataloader)
print("Test Data Result:")
result = trainer.test(train_model, test_dataloader)
print("Result:", result)
Task 9: Learn BERT basics and basic usage of the transformers library
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertModel
# Download the hfl/chinese-macbert-base BERT model locally via the huggingface transformers library
model_name = "hfl/chinese-macbert-base"
model_save = "./pretrain_models/hfl/chinese-macbert-base/"
# Load from the local cache_dir and initialize the config, tokenizer, and model
config = BertConfig.from_pretrained(model_name, cache_dir=model_save)
config.output_hidden_states = True  # expose the hidden states of every layer
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=model_save)
model = BertModel.from_pretrained(model_name, cache_dir=model_save)
# Encode some input text
sens = ["Hugging Face是什么,提供了哪些内容?",
"近年来人工智能在自然语言处理领域取得了巨大的进展。其中一项引人注目的技术是生成模型,如OpenAI的GPT-3.5。"]
print("tokenizer.tokenize:", tokenizer.tokenize(sens[0]))
input_ids = tokenizer(sens,
add_special_tokens=True,
return_attention_mask=True,
return_tensors="pt",
padding=True,
truncation="longest_first",
max_length=128)
print("input ids:", input_ids)
print("input ids shape:", input_ids["input_ids"].shape)
# Use the BERT outputs for feature extraction
# hidden_states contains the input embedding layer plus every encoder layer's output, each of shape [batch, seq_len, hidden_size]
# pass the tensors by keyword: positionally, BertModel.forward expects attention_mask before token_type_ids
hidden_states = model(input_ids=input_ids["input_ids"],
                      token_type_ids=input_ids["token_type_ids"],
                      attention_mask=input_ids["attention_mask"],
                      output_hidden_states=True).hidden_states
# Common BERT pooling strategies for turning hidden states into a sentence-level feature
def bert_pooling(hidden_states, pooling="last_cls"):
if pooling == "last_cls":
return hidden_states[-1][:, 0, :]
elif pooling == "first_last_avg":
return (hidden_states[1] + hidden_states[-1]).mean(dim=1)
elif pooling == "last_avg":
return (hidden_states[-1]).mean(dim=1)
elif pooling == 'last2avg':
return (hidden_states[-1] + hidden_states[-2]).mean(dim=1)
    else:
        raise Exception("unknown pooling {}".format(pooling))  # plain function, so no self here
# Use the sum of the first and last hidden layers followed by mean pooling
output = bert_pooling(hidden_states, pooling="first_last_avg")
print("bert output shape:", output.shape)  # [2, 768]
Task 10: Pretrain BERT on the competition dataset
Generate the pretraining data
# Build the pretraining corpus from the competition data
all_text = []
token = set()
for data in train["content"].tolist():
for line in data[1:-1].split("\n "):
all_text.append(line)
for word in line.split(" "):
token.add(word)
for data in test["content"].tolist():
for line in data[1:-1].split("\n "):
all_text.append(line)
for word in line.split(" "):
token.add(word)
with open("./data/all_data_txt.txt", "w") as file:
for line in all_text:
file.write(line + "\n")
len(token)
5230
Rebuild the tokenizer and generate vocab.txt
# code adapted from https://blog.csdn.net/qq_26593695/article/details/115338593
from transformers import BertTokenizer
import tokenizers
# Create the WordPiece tokenizer trainer
bwpt = tokenizers.BertWordPieceTokenizer()
filepath = "./data/all_data_txt.txt"  # corpus file
# Train the tokenizer
bwpt.train(
    files=[filepath],
    vocab_size=len(token),  # the preset vocabulary size is not critical here
    min_frequency=1,
    limit_alphabet=1000
)
# Save the trained vocabulary
bwpt.save_model('./models/pretrain_models/')
# output: ['./models/pretrain_models/vocab.txt']
# Load the tokenizer we just trained
tokenizer = BertTokenizer(vocab_file='./models/pretrain_models/vocab.txt')
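A quick check that the rebuilt vocabulary really covers the token-id "words" of the corpus (a sketch; if the vocabulary is complete, tokens come back whole rather than split into ## pieces):
print("tokenizer vocab size:", tokenizer.vocab_size)
print(tokenizer.tokenize(all_text[0])[:10])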
from transformers import (
CONFIG_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING,
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
HfArgumentParser,
Trainer,
TrainingArguments,
set_seed,
LineByLineTextDataset)
# Custom configuration overrides
config_kwargs = {
    "cache_dir": None,
    "revision": 'main',
    "use_auth_token": None,
    "hidden_size": 512,
    "num_attention_heads": 4,
    "hidden_dropout_prob": 0.2,
    "vocab_size": len(token)  # set the vocabulary size ourselves
}
# Load the model configuration (left commented out; we load a pretrained checkpoint below instead)
#config = AutoConfig.from_pretrained('', **config_kwargs)
# Load the pretrained model
model_name = "sijunhe/nezha-cn-base"
model_save = "../models/pretrain_models/sijunhe/nezha-cn-base/"
model = AutoModelForMaskedLM.from_pretrained(
model_name,
cache_dir=model_save,
revision='main'
)
model.resize_token_embeddings(len(tokenizer))
#output:Embedding(5230, 768, padding_idx=1)
# Load the data through the LineByLineTextDataset interface; block size 512, file_path is the corpus file written above
train_dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                      file_path=filepath,
                                      block_size=512)
# DataCollator for the MLM objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)
# Training arguments
pretrain_batch_size = 128
num_train_epochs = 100
training_args = TrainingArguments(
output_dir='./data/pre_train/',
resume_from_checkpoint=True,
overwrite_output_dir=True,
num_train_epochs=num_train_epochs,
learning_rate=8e-5,
lr_scheduler_type="cosine",
warmup_ratio=0.1,
weight_decay=0.01,
per_device_train_batch_size=pretrain_batch_size,
gradient_accumulation_steps=8,
logging_steps=100,
save_strategy='steps',
    save_steps=1000,       # save a checkpoint every 1000 steps
    save_total_limit=10)   # keep only the 10 most recent checkpoints
# Set up the Trainer
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset)
# Start training
trainer.train(True)   # resume from a previously saved checkpoint
#trainer.train()      # use this on the first run, when there is no checkpoint yet
trainer.save_model('./data/pre_train/')
Some weights of the model checkpoint at sijunhe/nezha-cn-base were not used when initializing NezhaForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing NezhaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NezhaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Embedding(5230, 768)
Task 11: Fine-tune BERT on the competition dataset
Model Network
%%writefile ../src/model_network.py
import os
import torch
import torch.nn.functional as F
from torch import nn
import transformers
from transformers import BertModel
class Bert(nn.Module):
def __init__(self, model_name="hfl/chinese-roberta-wwm-ext/",
pooling="last_avg",
whitening=False,
output_dim=128,
hidden_size=768,
model_dir="../models/pretrain_models/",
freeze_layers=[]):
super().__init__()
self.model_name = model_name
if os.path.exists(model_dir+model_name):
model_name = model_dir + model_name
self.sen_bert = BertModel.from_pretrained(model_name)
self.freeze_layers = freeze_layers
if len(self.freeze_layers) > 0:
self.freeze_params()
self.pooling = pooling
self.Linear = nn.Linear(hidden_size, output_dim)
self.whitening = whitening
def freeze_params(self):
for name, param in self.sen_bert.named_parameters():
for freeze_name in self.freeze_layers:
if freeze_name in name:
param.requires_grad = False
break
def bert_pooling(self, hidden_states):
if self.pooling == "last_cls":
return hidden_states[-1][:, 0, :]
elif self.pooling == "first_last_avg":
return (hidden_states[1] + hidden_states[-1]).mean(dim=1)
elif self.pooling == "last_avg":
return (hidden_states[-1]).mean(dim=1)
elif self.pooling == 'last2avg':
return (hidden_states[-1] + hidden_states[-2]).mean(dim=1)
else:
raise Exception("unknown pooling {}".format(self.pooling))
    def bert_whitening(self, output):  # not implemented for now
pass
def forward(self, input_ids, token_type_ids, attention_mask):
        # pass by keyword: positionally, BertModel.forward expects attention_mask before token_type_ids
        hidden_states = self.sen_bert(input_ids=input_ids,
                                      token_type_ids=token_type_ids,
                                      attention_mask=attention_mask,
                                      output_hidden_states=True).hidden_states
if self.whitening:
pass
#return self.bert_whitening(output)
else:
output_embed = self.bert_pooling(hidden_states) # [batch, hidden_size]
output_embed = self.Linear(output_embed) # [batch, output_dim]
return output_embed
Overwriting ../src/model_network.py
The BERT fine-tuning logic:
import argparse
import torch
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_tensor_type(torch.DoubleTensor)
import tokenizers
from transformers import BertTokenizer
tokenizer = BertTokenizer(vocab_file='../models/pretrain_models/vocab.txt')
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from model_dataset import BertDataset  # the tokenizer-based dataset defined earlier
from pl_train_model import ClassifyTrainModel
hparams = {'encode_model': 'Bert',  # ClassifyTrainModel selects the encoder by this name
           'use_swa': True,
'max_len': 512,
'output_dim': 2,
'num_works': 10,
'batch_size': 8,
'accumulate_grad_batches': 16,
'pin_memory': True,
'epochs': 10,
'num_steps': -1,
'pretrain_model': 'checkpoint-35000',
'learning_rate': 1e-5,
'auto_lr_find': False,
'lr_scheduler_type': 'cosine',
'freeze_layers': [],
'precision': 16,
'train_monitor': 'train_step_loss',
'train_mode': 'min',
'save_dir': './models/finetune_models/',
'log_dir': './logs/',
'accelerator': 'gpu',
'devices': 1}
hparams = argparse.Namespace(**hparams)
train, _ = load_data()
train = train[1:]
train['label'] = train['label'].apply(lambda x: int(x))
train_more, test_more = train_test_split(train,
test_size=0.1,
random_state=43)
train_more = train_more.reset_index()
test_more = test_more.reset_index()
train_dataset = BertDataset(train_more, tokenizer, max_len=hparams.max_len)
train_dataloader = DataLoader(train_dataset,
shuffle=True,
num_workers=hparams.num_works,
batch_size=hparams.batch_size,
pin_memory=True)
hparams.num_steps = len(train_dataloader) * hparams.epochs
test_dataset = BertDataset(test_more, tokenizer, max_len=hparams.max_len)
test_dataloader = DataLoader(test_dataset,
shuffle=True,
num_workers=hparams.num_works,
batch_size=hparams.batch_size,
pin_memory=True)
pl.seed_everything(1234)  # set a global random seed
train_model = ClassifyTrainModel(hparams)
ckpt_callback = pl.callbacks.ModelCheckpoint(
monitor=hparams.train_monitor,
dirpath=hparams.save_dir,
filename="finetune-{epochs:03d}-{%s:.3f}" % hparams.train_monitor,
mode=hparams.train_mode)
callbacks = [ckpt_callback]
callbacks.append(pl.callbacks.StochasticWeightAveraging(swa_lrs=0.05))
logger = TensorBoardLogger(save_dir=hparams.log_dir, name="TrainModel")
trainer = pl.Trainer.from_argparse_args(
hparams,
max_epochs=hparams.epochs,
logger=logger,
callbacks=callbacks)
print("hparams.auto_lr_find=", hparams.auto_lr_find)
if hparams.auto_lr_find:
lr_finder = trainer.tuner.lr_find(model=train_model,
train_dataloaders=train_dataloader,
min_lr=1e-08,
max_lr=1e-1,
num_training=100,
mode="exponential", # 寻找策略:指数型exponential 和线数型 linear
early_stop_threshold=4.0)
fig = lr_finder.plot(suggest=True)
fig.show()
lr = lr_finder.suggestion()
print("suggest lr=", lr)
del train_model
hparams.learning_rate = lr
    train_model = ClassifyTrainModel(hparams)
    # equivalent to:
    # trainer.tune(train_model, data_module)
print("Search best learning_rate: [%f]" % train_model.learning_rate)
trainer.fit(train_model, train_dataloader)
print("Test Data Result:")
result = trainer.test(train_model, test_dataloader)  # trainer.test returns a list of metric dicts
print("Result:", result)