Table of Contents
- 1. Import the Required Packages
- 2. Load the Dataset
- 3. Data Preprocessing
- 4. Create the Model
- 5. Create the Evaluation Function
- 6. Configure the Training Arguments
- 7. Create the Trainer
- 8. Model Training
- 9. Model Prediction
1. Import the Required Packages
- DataCollatorForTokenClassification is the data collator for token-level classification tasks
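If these libraries are not installed yet, a typical setup looks like this (an assumption about your environment; versions are unpinned, adjust as needed):
!pip install transformers datasets evaluate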
import evaluate
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          TrainingArguments, Trainer, DataCollatorForTokenClassification)
2. Load the Dataset
# With internet access, load directly via load_dataset; cache_dir caches the data at the given location
ner_datasets = load_dataset("peoples_daily_ner", cache_dir="./data")
# Without internet access, load the dataset from disk instead
# from datasets import DatasetDict
# ner_datasets = DatasetDict.load_from_disk("ner_data")
ner_datasets
'''
DatasetDict({
train: Dataset({
features: ['id', 'tokens', 'ner_tags'],
num_rows: 20865
})
validation: Dataset({
features: ['id', 'tokens', 'ner_tags'],
num_rows: 2319
})
test: Dataset({
features: ['id', 'tokens', 'ner_tags'],
num_rows: 4637
})
})
'''
- Inspect a single example
ner_datasets["train"][0]
'''
{'id': '0',
'tokens': ['海',
'钓',
'比',
'赛',
'地',
'点',
'在',
'厦',
'门',
'与',
'金',
'门',
'之',
'间',
'的',
'海',
'域',
'。'],
'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}
'''
- Inspect the feature schema
ner_datasets["train"].features
'''
{'id': Value(dtype='string', id=None),
'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}
'''
- Get the label names
label_list = ner_datasets["train"].features["ner_tags"].feature.names
label_list
'''
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
'''
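With label_list in hand, the integer tags of the first example can be rendered as strings (a quick illustrative check, not in the original):
print([label_list[t] for t in ner_datasets["train"][0]["ner_tags"]])
# 厦/门 and 金/门 appear as B-LOC/I-LOC; everything else is O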
3. Data Preprocessing
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
tokenizer(ner_datasets["train"][0]["tokens"], is_split_into_words=True)
# For input that is already split into words, set is_split_into_words=True
'''
{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
'''
Otherwise, each element of the list is treated as a separate sentence:
tokenizer(ner_datasets["train"][0]["tokens"], is_split_into_words=False)
'''
{'input_ids': [[101, 3862, 102], [101, 7157, 102], [101, 3683, 102], [101, 6612, 102], [101, 1765, 102], [101, 4157, 102], [101, 1762, 102], [101, 1336, 102], [101, 7305, 102], [101, 680, 102], [101, 7032, 102], [101, 7305, 102], [101, 722, 102], [101, 7313, 102], [101, 4638, 102], [101, 3862, 102], [101, 1818, 102], [101, 511, 102]], 'token_type_ids': [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]}
'''
- An attention_mask value of 1 means the position is attended to (i.e., not masked)
- With subword tokenization (see the English example below), a single word can be split into several tokens/ids, so labels cannot simply be carried over one-to-one
res = tokenizer("interesting word")
res
'''
{'input_ids': [101, 10673, 12865, 12921, 8181, 8681, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
'''
- Solution: word_ids(), which maps each token back to its word index for label alignment
res.word_ids()
'''
[None, 0, 0, 0, 0, 1, None] # None marks special tokens such as [CLS] and [SEP]
'''
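To see the mapping concretely, you can zip the sub-tokens with their word ids (an illustrative check, not in the original):
for token, word_id in zip(tokenizer.convert_ids_to_tokens(res["input_ids"]), res.word_ids()):
    print(token, word_id)  # each sub-token with its word index; None marks [CLS]/[SEP]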
- Align labels one-to-one with tokens
# Use word_ids to map word-level labels onto sub-tokens
def process_function(examples):
    tokenized_examples = tokenizer(examples["tokens"], max_length=128, truncation=True, is_split_into_words=True)
    # Align labels one-to-one with tokens
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_examples.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # -100 is ignored by the loss (CrossEntropyLoss ignore_index)
            else:
                label_ids.append(label[word_id])
        labels.append(label_ids)
    tokenized_examples["labels"] = labels
    return tokenized_examples
tokenized_datasets = ner_datasets.map(process_function, batched=True)
tokenized_datasets
'''
DatasetDict({
train: Dataset({
features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 20865
})
validation: Dataset({
features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 2319
})
test: Dataset({
features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
num_rows: 4637
})
})
'''
print(tokenized_datasets["train"][0])
'''
{'id': '0',
'tokens': ['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。'],
'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100]}
'''
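A quick spot-check that the labels line up with the sub-tokens (illustrative, not in the original):
example = tokenized_datasets["train"][0]
for token, label_id in zip(tokenizer.convert_ids_to_tokens(example["input_ids"]), example["labels"]):
    print(token, "special" if label_id == -100 else label_list[label_id])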
4. Create the Model
- For any task that is not binary classification, remember to specify num_labels; otherwise the classification head defaults to 2 labels and training fails with a CUDA device-side assert as soon as a label id falls outside that range
# num_labels must match the number of NER tags
model = AutoModelForTokenClassification.from_pretrained("hfl/chinese-macbert-base",
num_labels=len(label_list))
model.config.num_labels
'''
7
'''
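Alternatively, the label mappings can be passed directly to from_pretrained, so the config carries readable label names from the start and the manual id2label patch in the pipeline section below becomes unnecessary (an equivalent variant, not the original code):
model = AutoModelForTokenClassification.from_pretrained(
    "hfl/chinese-macbert-base",
    num_labels=len(label_list),
    id2label={idx: label for idx, label in enumerate(label_list)},
    label2id={label: idx for idx, label in enumerate(label_list)},
)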
5. Create the Evaluation Function
# seqeval = evaluate.load("seqeval_metric.py")  # load from a local script instead
!pip install seqeval
seqeval = evaluate.load("seqeval")
seqeval
'''
Args:
predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
references: List of List of reference labels (Ground truth (correct) target values)
suffix: True if the IOB prefix is after type, False otherwise. default: False
scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
default: None
mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
If you want to only count exact matches, pass mode="strict". default: None.
sample_weight: Array-like of shape (n_samples,), weights for individual samples. default: None
zero_division: Which value to substitute as a metric value when encountering zero division. Should be on of 0, 1,
"warn". "warn" acts as 0, but the warning is raised.
Returns:
'scores': dict. Summary of the scores for overall and per type
Overall:
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1': F1 score, also known as balanced F-score or F-measure,
Per type:
'precision': precision,
'recall': recall,
'f1': F1 score, also known as balanced F-score or F-measure
Examples:
>>> predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> seqeval = evaluate.load("seqeval")
>>> results = seqeval.compute(predictions=predictions, references=references)
>>> print(list(results.keys()))
['MISC', 'PER', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']
>>> print(results["overall_f1"])
0.5
>>> print(results["PER"]["f1"])
1.0
""", stored examples: 0)
'''
import numpy as np
def eval_metric(pred):
    # The returned predictions are raw logits
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=-1)
    # Convert ids back to the original string labels, skipping the -100 positions
    true_predictions = [
        [label_list[p] for p, l in zip(prediction, label) if l != -100]  # per sequence
        for prediction, label in zip(predictions, labels)  # per batch
    ]
    true_labels = [
        [label_list[l] for p, l in zip(prediction, label) if l != -100]  # per sequence
        for prediction, label in zip(predictions, labels)  # per batch
    ]
    result = seqeval.compute(predictions=true_predictions, references=true_labels, mode="strict", scheme="IOB2")
    return {
        "f1": result["overall_f1"]
    }
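A minimal sanity check of eval_metric with hand-crafted logits (illustrative; the shapes and values are made up):
fake_logits = np.zeros((1, 5, len(label_list)))
fake_logits[0, 1, 5] = 10.0  # predict B-LOC at position 1
fake_logits[0, 2, 6] = 10.0  # predict I-LOC at position 2
fake_labels = np.array([[-100, 5, 6, 0, -100]])  # -100 marks special tokens
print(eval_metric((fake_logits, fake_labels)))  # expect {'f1': 1.0}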
6. Configure the Training Arguments
Further reading on the Trainer and TrainingArguments (Chinese-language posts):
- huggingface transformers使用指南之二——方便的trainer
- 详解Hugging Face Transformers的TrainingArguments (若石之上, CSDN)
- LLM大模型之Trainer以及训练参数
!pip install accelerate # Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`
args = TrainingArguments(
    output_dir="models_for_ner",     # directory for checkpoints and run logs
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    evaluation_strategy="epoch",     # evaluation strategy; keep it consistent with the save strategy
    save_strategy="epoch",
    metric_for_best_model="f1",      # metric for selecting the best model: the "f1" key returned by eval_metric
    load_best_model_at_end=True,     # reload the best checkpoint when training finishes
    logging_steps=50,                # log every 50 steps
    num_train_epochs=3
)
# With evaluation_strategy="steps", you should also specify eval_steps;
# if omitted, eval_steps defaults to logging_steps
args = TrainingArguments(
    output_dir="models_for_ner",     # directory for checkpoints and run logs
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    evaluation_strategy="steps",     # evaluation strategy; keep it consistent with the save strategy
    save_strategy="steps",
    metric_for_best_model="f1",      # metric for selecting the best model: the "f1" key returned by eval_metric
    load_best_model_at_end=True,     # reload the best checkpoint when training finishes
    logging_steps=50,                # log every 50 steps
    num_train_epochs=1
)
- The second (step-based) configuration is the one used below:
7. Create the Trainer
- If you only have a training set but still want evaluation during training, simply set eval_dataset=tokenized_datasets["train"]
trainer = Trainer(
model=model,
args=args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
compute_metrics=eval_metric,
data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)
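To see what the collator does, you can batch two tokenized examples by hand: input_ids are padded with the tokenizer's pad token and labels with -100 (a quick illustrative check, not in the original):
collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
features = [
    {k: tokenized_datasets["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
    for i in range(2)
]
batch = collator(features)
print(batch["labels"])  # the shorter sequence's labels end in -100 padding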
8. Model Training
trainer.train()
- Evaluate the model
trainer.evaluate() # defaults to the eval_dataset specified when creating the Trainer
# You can also evaluate on a different split
trainer.evaluate(eval_dataset=tokenized_datasets["test"])
9. Model Prediction
res = trainer.predict(tokenized_datasets["test"])
res.predictions.argmax(axis=-1)
'''
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]])
'''
res.predictions.argmax(axis=-1)[0]
'''
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 5, 5, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
'''
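To read one example's predictions as tag strings, mask out the -100 positions using the label_ids carried in the prediction output (illustrative, not in the original):
pred_ids = res.predictions.argmax(axis=-1)[0]
tags = [label_list[p] for p, l in zip(pred_ids, res.label_ids[0]) if l != -100]
print(tags)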
from transformers import pipeline
# For pipeline inference, id2label must be set on the model config
model.config.id2label = {idx: label for idx, label in enumerate(label_list)}
model.config
'''
BertConfig {
"_name_or_path": "hfl/chinese-macbert-base",
"architectures": [
"BertForTokenClassification"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"directionality": "bidi",
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"id2label": {
"0": "O",
"1": "B-PER",
"2": "I-PER",
"3": "B-ORG",
"4": "I-ORG",
"5": "B-LOC",
"6": "I-LOC"
},
"initializer_range": 0.02,
"intermediate_size": 3072,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1,
"LABEL_2": 2,
"LABEL_3": 3,
"LABEL_4": 4,
"LABEL_5": 5,
"LABEL_6": 6
},
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.35.0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 21128
}
'''
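Note that label2id in the config above still holds the default LABEL_0-style names; for a fully consistent config you can set it as well (a small addition, not in the original):
model.config.label2id = {label: idx for idx, label in enumerate(label_list)}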
# If the model was trained on GPU, specify the device for inference
# For NER, set aggregation_strategy="simple" to get merged entity spans instead of per-token results
ner_pipe = pipeline("token-classification",
model=model,
tokenizer=tokenizer,
device=0,
aggregation_strategy="simple")
res = ner_pipe("小明在北京上班")
res
'''
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[{'entity_group': 'PER',
'score': 0.44049227,
'word': '明',
'start': 1,
'end': 2},
{'entity_group': 'LOC',
'score': 0.9994525,
'word': '北 京',
'start': 3,
'end': 5}]
'''
# The model_max_length argument removes the warning "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation."
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base",
                                          model_max_length=512)
# Without aggregation_strategy, the pipeline returns per-token results:
ner_pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0)
res = ner_pipe("小明在北京上班")
res
'''
[{'entity': 'I-PER',
'score': 0.44049227,
'index': 2,
'word': '明',
'start': 1,
'end': 2},
{'entity': 'B-LOC',
'score': 0.99940526,
'index': 4,
'word': '北',
'start': 3,
'end': 4},
{'entity': 'I-LOC',
'score': 0.9994997,
'index': 5,
'word': '京',
'start': 4,
'end': 5}]
'''
# Extract the actual entity strings via the start/end offsets
# (run this on the aggregated `res` from the aggregation_strategy="simple" pipeline above,
# since the per-token results use the "entity" key instead of "entity_group")
ner_result = {}
x = "小明在北京上班"
for r in res:
    if r["entity_group"] not in ner_result:
        ner_result[r["entity_group"]] = []
    ner_result[r["entity_group"]].append(x[r["start"]: r["end"]])
ner_result
'''
{'PER': ['明'], 'LOC': ['北京']}
'''
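As a possible final step, you can persist the fine-tuned model and tokenizer and rebuild the pipeline from disk (a sketch; the path "models_for_ner/best" is an arbitrary choice):
trainer.save_model("models_for_ner/best")
tokenizer.save_pretrained("models_for_ner/best")
ner_pipe = pipeline("token-classification", model="models_for_ner/best", aggregation_strategy="simple")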