1. Data quality
Steps of data preparation:
What is tokenizing?
Tokenizing converts text into the numbers (token ids) that represent it, usually based on how frequently character sequences occur in the training corpus. Note that encoding and decoding must use the same tokenizer; a trained model normally comes with its own dedicated, matching tokenizer.
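As a quick illustration of why the encoder and decoder must be the same tokenizer, the sketch below (assuming both tokenizers can be downloaded from the Hugging Face Hub) encodes a sentence with the pythia-70m tokenizer and then decodes the same ids with an unrelated GPT-2 tokenizer; the mismatched decode comes out garbled.
from transformers import AutoTokenizer
pythia_tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
gpt2_tok = AutoTokenizer.from_pretrained("gpt2")  # a different, incompatible vocabulary
ids = pythia_tok("Hi, how are you?")["input_ids"]
print(pythia_tok.decode(ids))  # round-trips back to the original sentence
print(gpt2_tok.decode(ids))    # same ids, wrong tokenizer: garbled text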
2. Prepare the data
2.1 Set up the environment
import pandas as pd
import datasets
from pprint import pprint
from transformers import AutoTokenizer
2.2 Tokenize text
# Load the tokenizer that matches the pretrained model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
text = "Hi, how are you?"
encoded_text = tokenizer(text)["input_ids"]
print(encoded_text)
Output:
[12764, 13, 849, 403, 368, 32]
Try decoding the encoded ids back into text:
decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into text: ", decoded_text)
Output:
Decoded tokens back into text: Hi, how are you?
The original text is recovered. Now try encoding and decoding several texts at once.
list_texts = ["Hi, how are you?", "I'm good", "Yes"]
encoded_texts = tokenizer(list_texts)
print("Encoded several texts: ", encoded_texts["input_ids"])
Output:
Encoded several texts: [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]]
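Notice that the three encoded lists have different lengths (6, 3, and 1 tokens here), which is exactly what padding and truncation in the next section deal with. A quick check:
print([len(ids) for ids in encoded_texts["input_ids"]])  # [6, 3, 1]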
2.3 Padding or truncation
For the model, it is important that every sequence in a batch has the same length, because the computation runs on fixed-size tensors. Padding is the strategy for handling variable-length encoded text: you must tell the tokenizer which token id to pad with. Here we use 0, which for this tokenizer is also the end-of-sequence token, so when the batch is run through the tokenizer with padding enabled, the shorter strings are filled with zeros on the right.
tokenizer.pad_token = tokenizer.eos_token
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])
Output:
Using padding: [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]
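Alongside input_ids, the tokenizer also returns an attention_mask that marks padded positions with 0 so the model can ignore them. A quick look at the batch just encoded:
print("Attention mask: ", encoded_texts_longest["attention_mask"])
# 1 marks real tokens, 0 marks the zeros padded onto the right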
Truncation can also be used to enforce a fixed maximum length:
encoded_texts_truncation = tokenizer(list_texts, max_length=3, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])
Output:
Using truncation: [[12764, 13, 849], [42, 1353, 1175], [4374]]
As the output shows, tokens are cut off from the right-hand side. Truncation can also be done from the left instead:
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(list_texts, max_length=3, truncation=True)
print("Using left-side truncation: ", encoded_texts_truncation_left["input_ids"])
Output:
Using left-side truncation: [[403, 368, 32], [42, 1353, 1175], [4374]]
Truncation and padding can be combined so that every sequence in the batch ends up with the same length:
encoded_texts_both = tokenizer(list_texts, max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])
Output:
Using both padding and truncation: [[403, 368, 32], [42, 1353, 1175], [4374, 0, 0]]
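If every sequence should be padded to exactly max_length rather than to the longest sequence in the batch, the tokenizer also accepts padding="max_length". A small sketch with an illustrative max_length of 6:
encoded_texts_fixed = tokenizer(list_texts, max_length=6, truncation=True, padding="max_length")
print("Fixed-length padding: ", encoded_texts_fixed["input_ids"])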
2.4 Prepare the instruction dataset
import pandas as pd
filename = "lamini_docs.jsonl"
instruction_dataset_df = pd.read_json(filename, lines=True)
examples = instruction_dataset_df.to_dict()
# Use whichever pair of columns the dataset provides
if "question" in examples and "answer" in examples:
    text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
    text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
    text = examples["input"][0] + examples["output"][0]
else:
    text = examples["text"][0]
# Prompt template used to wrap each question
prompt_template = """### Question:
{question}

### Answer:"""
num_examples = len(examples["question"])
finetuning_dataset = []
for i in range(num_examples):
    question = examples["question"][i]
    answer = examples["answer"][i]
    text_with_prompt_template = prompt_template.format(question=question)
    finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})
from pprint import pprint
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])
Output:
One datapoint in the finetuning dataset:
{'answer': 'Lamini has documentation on Getting Started, Authentication, '
'Question Answer Model, Python Library, Batching, Error Handling, '
'Advanced topics, and class documentation on LLM Engine available '
'at https://lamini-ai.github.io/.',
'question': '### Question:\n'
'What are the different types of documents available in the '
'repository (e.g., installation guide, API documentation, '
"developer's guide)?\n"
'\n'
'### Answer:'}
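A common variant is to fold the answer into the same template and keep a single text field per example, which is convenient when the training loop only consumes concatenated text. The sketch below is illustrative; the names prompt_template_qa and finetuning_dataset_text_only are not part of the code above.
prompt_template_qa = """### Question:
{question}

### Answer:
{answer}"""
finetuning_dataset_text_only = [
    {"text": prompt_template_qa.format(question=row["question"], answer=row["answer"])}
    for row in instruction_dataset_df.to_dict(orient="records")
]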
2.5 Tokenize a single example
text = finetuning_dataset[0]["question"] + finetuning_dataset[0]["answer"]
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    padding=True
)
print(tokenized_inputs["input_ids"])
Output:
[[ 4118 19782 27 187 1276 403 253 1027 3510 273 7177 2130
275 253 18491 313 70 15 72 904 12692 7102 13 8990
10097 13 13722 434 7102 6177 187 187 4118 37741 27 45
4988 74 556 10097 327 27669 11075 264 13 5271 23058 13
19782 37741 10031 13 13814 11397 13 378 16464 13 11759 10535
1981 13 21798 12989 13 285 966 10097 327 21708 46 10797
2130 387 5987 1358 77 4988 74 14 2284 15 7280 15
900 14206]]
Cap the sequence length at a fixed maximum:
max_length = 2048
max_length = min(
    tokenized_inputs["input_ids"].shape[1],
    max_length,
)
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=max_length
)
tokenized_inputs["input_ids"]
Output:
array([[ 4118, 19782, 27, 187, 1276, 403, 253, 1027, 3510,
273, 7177, 2130, 275, 253, 18491, 313, 70, 15,
72, 904, 12692, 7102, 13, 8990, 10097, 13, 13722,
434, 7102, 6177, 187, 187, 4118, 37741, 27, 45,
4988, 74, 556, 10097, 327, 27669, 11075, 264, 13,
5271, 23058, 13, 19782, 37741, 10031, 13, 13814, 11397,
13, 378, 16464, 13, 11759, 10535, 1981, 13, 21798,
12989, 13, 285, 966, 10097, 327, 21708, 46, 10797,
2130, 387, 5987, 1358, 77, 4988, 74, 14, 2284,
15, 7280, 15, 900, 14206]])
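To double-check that nothing was lost in tokenization, the ids can be decoded back into text (a sketch reusing the tokenizer loaded earlier):
print(tokenizer.decode(tokenized_inputs["input_ids"][0]))
# should print the templated question followed by the answer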
2.6 Tokenize the instruction dataset
def tokenize_function(examples):
    # Concatenate the prompt and the response into a single training text
    if "question" in examples and "answer" in examples:
        text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
        text = examples["input"][0] + examples["output"][0]
    else:
        text = examples["text"][0]
    # First pass: pad with the end-of-sequence token and measure the length
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )
    # Second pass: truncate (from the left) to at most 2048 tokens
    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )
    return tokenized_inputs
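Before mapping this over the whole dataset, the function can be sanity-checked on a single hand-made example (a sketch; the example text below is made up):
sample = {
    "question": ["### Question:\nWhat is Lamini?\n\n### Answer:"],
    "answer": ["Lamini is a library for finetuning LLMs."],
}
print(tokenize_function(sample)["input_ids"].shape)  # (1, number_of_tokens)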
Tokenize the whole dataset:
finetuning_dataset_loaded = datasets.load_dataset("json", data_files=filename, split="train")
tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
print(tokenized_dataset)
Output:
Dataset({
features: ['question', 'answer', 'input_ids', 'attention_mask'],
num_rows: 1400
})
Add a labels column (for causal language modeling, the labels are simply the input_ids):
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])
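The labels column is what the training loss is computed against: in causal language modeling the model predicts the next token and shifts the labels internally, so labels can simply equal input_ids. A minimal sketch of how the loss would be computed (assuming torch is installed and pythia-70m fits in memory):
import torch
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
example = tokenized_dataset[0]
input_ids = torch.tensor([example["input_ids"]])
labels = torch.tensor([example["labels"]])
outputs = model(input_ids=input_ids, labels=labels)
print(outputs.loss)  # cross-entropy of next-token prediction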
2.7 Split the dataset
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)
Output:
DatasetDict({
train: Dataset({
features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
num_rows: 1260
})
test: Dataset({
features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
num_rows: 140
})
})
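To reuse this split later, it can be saved locally or pushed to the Hugging Face Hub (a sketch; the repository name is a placeholder and push_to_hub requires being logged in):
split_dataset.save_to_disk("lamini_docs_tokenized")        # reload later with datasets.load_from_disk
# split_dataset.push_to_hub("your-username/lamini_docs")   # optional: share it on the Hub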
2.8 Test loading your own dataset
finetuning_dataset_path = "lamini/lamini_docs"
finetuning_dataset = datasets.load_dataset(finetuning_dataset_path)
print(finetuning_dataset)
Output:
DatasetDict({
train: Dataset({
features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
num_rows: 1260
})
test: Dataset({
features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
num_rows: 140
})
})
Try the taylor_swift dataset:
taylor_swift_dataset = "lamini/taylor_swift"
bts_dataset = "lamini/bts"
open_llms = "lamini/open_llms"
dataset_swiftie = datasets.load_dataset(taylor_swift_dataset)
print(dataset_swiftie["train"][1])
Output:
{'question': 'What is the most popular Taylor Swift song among millennials? How does this song relate to the millennial generation? What is the significance of this song in the millennial culture?', 'answer': 'Taylor Swift\'s "Shake It Off" is the most popular song among millennials. This song relates to the millennial generation as it is an anthem of self-acceptance and embracing one\'s individuality. The song\'s message of not letting others bring you down and to just dance it off resonates with the millennial culture, which is often characterized by a strong sense of individuality and a rejection of societal norms. Additionally, the song\'s upbeat and catchy melody makes it a perfect fit for the millennial generation, which is known for its love of pop music.', 'input_ids': [1276, 310, 253, 954, 4633, 11276, 24619, 4498, 2190, 24933, 8075, 32, 1359, 1057, 436, 4498, 14588, 281, 253, 24933, 451, 5978, 32, 1737, 310, 253, 8453, 273, 436, 4498, 275, 253, 24933, 451, 4466, 32, 37979, 24619, 434, 346, 2809, 640, 733, 5566, 3, 310, 253, 954, 4633, 4498, 2190, 24933, 8075, 15, 831, 4498, 7033, 281, 253, 24933, 451, 5978, 347, 352, 310, 271, 49689, 273, 1881, 14, 14764, 593, 285, 41859, 581, 434, 2060, 414, 15, 380, 4498, 434, 3935, 273, 417, 13872, 2571, 3324, 368, 1066, 285, 281, 816, 11012, 352, 745, 8146, 684, 342, 253, 24933, 451, 4466, 13, 534, 310, 2223, 7943, 407, 247, 2266, 3282, 273, 2060, 414, 285, 247, 18235, 273, 38058, 22429, 15, 9157, 13, 253, 4498, 434, 598, 19505, 285, 5834, 90, 40641, 2789, 352, 247, 3962, 4944, 323, 253, 24933, 451, 5978, 13, 534, 310, 1929, 323, 697, 2389, 273, 1684, 3440, 15], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1276, 310, 253, 954, 4633, 11276, 24619, 4498, 2190, 24933, 8075, 32, 1359, 1057, 436, 4498, 14588, 281, 253, 24933, 451, 5978, 32, 1737, 310, 253, 8453, 273, 436, 4498, 275, 253, 24933, 451, 4466, 32, 37979, 24619, 434, 346, 2809, 640, 733, 5566, 3, 310, 253, 954, 4633, 4498, 2190, 24933, 8075, 15, 831, 4498, 7033, 281, 253, 24933, 451, 5978, 347, 352, 310, 271, 49689, 273, 1881, 14, 14764, 593, 285, 41859, 581, 434, 2060, 414, 15, 380, 4498, 434, 3935, 273, 417, 13872, 2571, 3324, 368, 1066, 285, 281, 816, 11012, 352, 745, 8146, 684, 342, 253, 24933, 451, 4466, 13, 534, 310, 2223, 7943, 407, 247, 2266, 3282, 273, 2060, 414, 285, 247, 18235, 273, 38058, 22429, 15, 9157, 13, 253, 4498, 434, 598, 19505, 285, 5834, 90, 40641, 2789, 352, 247, 3962, 4944, 323, 253, 24933, 451, 5978, 13, 534, 310, 1929, 323, 697, 2389, 273, 1684, 3440, 15]}
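Since these prebuilt datasets already contain input_ids, you can sanity-check that they were produced with the same tokenizer by decoding an example (a sketch reusing the pythia tokenizer loaded above):
print(tokenizer.decode(dataset_swiftie["train"][1]["input_ids"]))
# should reproduce the concatenated question and answer shown above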
3. Summary
This section walked through the data preparation needed before finetuning a large language model. The key pieces are the model's matching tokenizer, the padding/truncation strategy, and the train/test split. When building your own dataset, follow the same steps; the quality of the data itself is up to you.