This article shows how to build and pretrain a Transformer model from scratch with the Hugging Face libraries. The model is called KantaiBERT.
#@title Step 1: Loading the Dataset
#1. Load kant.txt using the Colab file manager, or
#2. Download the file from GitHub:
!curl -L https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-2nd-Edition/master/Chapter04/kant.txt --output "kant.txt"
#@title Step 2: April 2023 Update: Installing Hugging Face Transformers
'''
# We won't need TensorFlow here
!pip uninstall -y tensorflow
# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.9.1
# tokenizers version at notebook update --- 0.7.0
'''
#@title Step 3: Training a Tokenizer
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
paths = [str(x) for x in Path(".").glob("**/*.txt")]
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
"<s>",
"<pad>",
"</s>",
"<unk>",
"<mask>",
])
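The exact vocabulary that comes out depends on the corpus; as a quick optional check, the learned size can be read back from the tokenizer (it is capped at the requested 52,000 entries):
# Optional check: size of the learned vocabulary (at most the requested 52,000)
print(tokenizer.get_vocab_size())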
#@title Step 4: Saving the files to disk
import os
# Keep the tokenizer files in the same relative directory used by the later steps
token_dir = './KantaiBERT'
if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model(token_dir)
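save_model() writes the two files that define a byte-level BPE tokenizer, vocab.json and merges.txt; listing the directory confirms they were created:
# Expect vocab.json and merges.txt in the target directory
print(os.listdir(token_dir))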
#@title Step 5: Loading the Trained Tokenizer Files
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
tokenizer = ByteLevelBPETokenizer(
"./KantaiBERT/vocab.json",
"./KantaiBERT/merges.txt",
)
print(tokenizer.encode("The Critique of Pure Reason.").tokens)
# ['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.']
print(tokenizer.encode("The Critique of Pure Reason."))
# Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
tokenizer._tokenizer.post_processor = BertProcessing(
("</s>", tokenizer.token_to_id("</s>")),
("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
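With the post-processor and truncation in place, encodings should now be wrapped in the RoBERTa-style special tokens; re-running the earlier example makes this visible:
# The same sentence, now framed by <s> ... </s>
print(tokenizer.encode("The Critique of Pure Reason.").tokens)
# expected: ['<s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.', '</s>']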
#@title Step 6: Checking that PyTorch Sees CUDA
import torch
torch.cuda.is_available()
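The Trainer moves the model to the GPU automatically when one is visible; for manual experiments it can still be handy to keep an explicit device handle:
# Fall back to CPU if no CUDA device is visible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)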
#@title Step 7: Defining the configuration of the Model
from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
print(config)
# RobertaConfig {
# "attention_probs_dropout_prob": 0.1,
# "bos_token_id": 0,
# "classifier_dropout": null,
# "eos_token_id": 2,
# "hidden_act": "gelu",
# "hidden_dropout_prob": 0.1,
# "hidden_size": 768,
# "initializer_range": 0.02,
# "intermediate_size": 3072,
# "layer_norm_eps": 1e-12,
# "max_position_embeddings": 514,
# "model_type": "roberta",
# "num_attention_heads": 12,
# "num_hidden_layers": 6,
# "pad_token_id": 1,
# "position_embedding_type": "absolute",
# "transformers_version": "4.45.2",
# "type_vocab_size": 1,
# "use_cache": true,
# "vocab_size": 52000
#}
#@title Step 8: Re-creating the Tokenizer in Transformers
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", max_length=512)
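As a short sanity check, the re-created tokenizer should map <s> to id 0, <pad> to 1, and </s> to 2, matching the bos_token_id, pad_token_id, and eos_token_id in the configuration above:
# The ids follow the order of the special tokens passed to the tokenizer trainer in Step 3
enc = tokenizer("The Critique of Pure Reason.")
print(enc["input_ids"])  # should start with 0 (<s>) and end with 2 (</s>)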
#@title Step 9: Initializing a Model From Scratch
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config)
print(model)
print(model.num_parameters())
# => 84,095,008 parameters
#@title Exploring the Parameters
LP=list(model.parameters())
lp=len(LP)
print(lp)
for p in range(0, lp):
    print(LP[p])
#@title Counting the parameters
np = 0
for p in range(0, lp):  # number of tensors
    PL2 = True
    try:
        L2 = len(LP[p][0])  # check if 2D
    except:
        L2 = 1  # not 2D but 1D
        PL2 = False
    L1 = len(LP[p])
    L3 = L1 * L2
    np += L3  # number of parameters per tensor
    if PL2 == True:
        print(p, L1, L2, L3)  # displaying the sizes of the parameters
    if PL2 == False:
        print(p, L1, L3)  # displaying the sizes of the parameters
print(np)  # total number of parameters
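The loop above reconstructs the count from tensor shapes; a one-line cross-check with numel() should reproduce the figure reported by model.num_parameters():
# Sum the element count of every parameter tensor directly
total = sum(p.numel() for p in model.parameters())
print(total)  # should equal model.num_parameters()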
#@title Step 10: Building the Dataset
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./kant.txt",
    block_size=128,
)
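LineByLineTextDataset turns every non-empty line of kant.txt into one training example, so its length tells you how many samples a single epoch will cover:
# Number of non-empty lines that became training examples
print(len(dataset))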
#@title Step 11: Defining a Data Collator
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
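To see the masking in action, the collator can be applied to a single encoded sample (an optional illustration with a made-up sentence): roughly 15% of the input ids are replaced, and labels keeps the original ids at the masked positions with -100 everywhere else.
# Illustrative only: which positions get masked is random on every call
sample = tokenizer("Human reason is by nature architectonic.", truncation=True, max_length=128)
batch = data_collator([sample])
print(batch["input_ids"][0])
print(batch["labels"][0])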
#@title Step 12: Initializing the Trainer
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
output_dir="./KantaiBERT",
overwrite_output_dir=True,
num_train_epochs=1,
per_device_train_batch_size=64,
save_steps=10_000,
save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
#@title Step 13: Pre-training the Model
trainer.train()
#@title Step 14: Saving the Final Model (+ tokenizer + config) to disk
trainer.save_model("./KantaiBERT")
#@title Step 15: Language Modeling with the FillMaskPipeline
from transformers import pipeline
fill_mask = pipeline(
"fill-mask",
model="./KantaiBERT",
tokenizer="./KantaiBERT"
)
fill_mask("Human thinking involves human <mask>.")