Contents
- 1. Introduction
- 1.1 Model Types
- 1.2 Model Head
- 2. Loading a Model
- 3. Calling a Model
- 3.1 Calling a Model without a Model Head
- 3.2 Calling a Model with a Model Head
- 4. Fine-Tuning in Practice
- 4.1 Imports
- 4.2 Loading the Data
- 4.3 Creating the Dataset
- 4.4 Splitting the Dataset
- 4.5 Creating the DataLoaders
- 4.6 Creating the Model and Optimizer
- 4.7 Training the Model
- 4.8 Evaluating the Model
- 4.9 Prediction
The content of this post and the ones that follow is based on the Bilibili uploader 你可是处女座啊.
1. Introduction
1.1 Model Types
- Encoder models: autoencoding models. They use only the encoder and have bidirectional attention, i.e. when computing each token's representation the full context (both left and right) is visible.
- Decoder models: autoregressive models. They use only the decoder and have unidirectional attention, i.e. when computing each token's representation only the preceding context is visible, never what follows (see the mask sketch after this list).
- Encoder-decoder models: sequence-to-sequence models. The encoder uses bidirectional attention while the decoder uses unidirectional attention.
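To make the two attention patterns concrete, here is a minimal PyTorch sketch (my own illustration, not from the video) of a bidirectional mask versus a causal mask:
import torch

seq_len = 5
# Encoder-style (bidirectional): every position may attend to every position
bidirectional_mask = torch.ones(seq_len, seq_len, dtype=torch.bool)
# Decoder-style (causal): position i may only attend to positions j <= i
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
print(causal_mask)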
1.2 Model Head
A Model Head is the task-specific layer attached on top of the base Transformer, for example a classification layer over the hidden states; the AutoModelFor* classes in Transformers pair the same backbone with different heads (see 3.2 below).
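Roughly speaking (a minimal sketch, assuming a BERT-style backbone with hidden size 768; the class name is made up for illustration), a sequence-classification head is a linear layer applied to the [CLS] hidden state:
import torch.nn as nn

class ToyClassificationHead(nn.Module):
    # Hypothetical head for illustration: a linear layer over the [CLS] hidden state
    def __init__(self, hidden_size=768, num_labels=2):
        super().__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, last_hidden_state):
        cls_vector = last_hidden_state[:, 0]  # [batch, hidden_size], the [CLS] position
        return self.classifier(cls_vector)    # [batch, num_labels] logits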
2. Loading a Model
from transformers import AutoConfig, AutoModel, AutoTokenizer
# Online loading
model = AutoModel.from_pretrained('hfl/rbt3')
# Downloading the model manually
# !git clone "https://huggingface.co/hfl/rbt3"
!git lfs clone "https://huggingface.co/hfl/rbt3" --include="*.bin"
# Offline loading from the local clone
model = AutoModel.from_pretrained('rbt3')
Model parameters:
model.config
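AutoConfig (imported above) loads the same configuration without instantiating any weights, which is handy for a quick inspection; a small sketch:
from transformers import AutoConfig

config = AutoConfig.from_pretrained('hfl/rbt3')
config.num_hidden_layers, config.hidden_size  # rbt3 is the 3-layer variant, hidden size 768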
3. Calling a Model
sen = '弱小的我也有大梦想'
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')
inputs = tokenizer(sen, return_tensors='pt')
inputs
3.1 Calling a Model without a Model Head
model = AutoModel.from_pretrained('hfl/rbt3', output_attentions=True)  # also return attention weights
output = model(**inputs)
output
output.last_hidden_state.size()  # [batch_size, seq_len, hidden_size]
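Without a head the model returns raw hidden states; a common use (a short sketch, not part of the original) is to take the [CLS] vector as a sentence embedding:
cls_embedding = output.last_hidden_state[:, 0]  # hidden state at the [CLS] position
cls_embedding.shape  # torch.Size([1, 768])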
3.2 Calling a Model with a Model Head
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification
# num_labels sets the size of the classification head, which is freshly (randomly) initialized
clz_model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3', num_labels=10)
clz_model(**inputs)
clz_model.config.id2label
clz_model.config.num_labels
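If the label names are known up front, they can be passed at load time so the config is populated immediately (a sketch; the English label names here are placeholders):
clz_model = AutoModelForSequenceClassification.from_pretrained(
    'hfl/rbt3',
    num_labels=2,
    id2label={0: 'negative', 1: 'positive'},   # placeholder label names
    label2id={'negative': 0, 'positive': 1},
)
clz_model.config.id2label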
4. Fine-Tuning in Practice
4.1 Imports
# Text classification in practice
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
4.2 Loading the Data
# Method 1: pandas
data = pd.read_csv('./datasets/ChnSentiCorp_htl_all.csv')
data.head()
data = data.dropna()  # drop rows with missing reviews
data
# Method 2: Hugging Face datasets
dataset = load_dataset('csv', data_files='datasets/ChnSentiCorp_htl_all.csv', split='train')
dataset = dataset.filter(lambda x: x['review'] is not None)
dataset
4.3 Creating the Dataset
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.data = pd.read_csv('./datasets/ChnSentiCorp_htl_all.csv')
        self.data = self.data.dropna()

    def __getitem__(self, index):
        return self.data.iloc[index]['review'], self.data.iloc[index]['label']

    def __len__(self):
        return len(self.data)

dataset = MyDataset()
for i in range(5):
    print(dataset[i])
4.4 Splitting the Dataset
from torch.utils.data import random_split
# Method 1: random_split on the PyTorch dataset
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])
len(trainset), len(validset)
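random_split draws a fresh random permutation each run; for a reproducible split you can pass a seeded generator (my own addition):
import torch
trainset, validset = random_split(dataset, lengths=[0.9, 0.1],
                                  generator=torch.Generator().manual_seed(42))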
# Method 2: train_test_split on the Hugging Face dataset from 4.2 (MyDataset has no such method)
dataset = dataset.train_test_split(test_size=0.1)
dataset
4.5 Creating the DataLoaders
from transformers import AutoTokenizer
import torch

# Method 1: pad every batch to a fixed max_length with a custom collate function
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')

def collate_func(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(item[0])
        labels.append(item[1])
    inputs = tokenizer(texts, max_length=128, padding='max_length', truncation=True,
                       return_tensors='pt')
    inputs['labels'] = torch.tensor(labels)
    return inputs

from torch.utils.data import DataLoader
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collate_func)  # validation loader over validset
import torch
# Method 2: tokenize up front with datasets.map and pad dynamically with DataCollatorWithPadding
tokenizer = AutoTokenizer.from_pretrained('rbt3')

def process_function(examples):
    tokenized_examples = tokenizer(examples['review'], max_length=128, truncation=True)
    tokenized_examples['labels'] = examples['label']
    return tokenized_examples

tokenized_datasets = dataset.map(process_function, batched=True, remove_columns=dataset['train'].column_names)
tokenized_datasets
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
trainset, validset = tokenized_datasets['train'], tokenized_datasets['test']
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))
validloader = DataLoader(validset, batch_size=32, shuffle=False, collate_fn=DataCollatorWithPadding(tokenizer))
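A quick sanity check (not in the original) is to pull one batch and look at the tensor shapes:
batch = next(iter(trainloader))
{k: v.shape for k, v in batch.items()}  # input_ids / token_type_ids / attention_mask / labels, padded to the longest sequence in the batch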
4.6 Creating the Model and Optimizer
from torch.optim import Adam
from transformers import AutoModelForSequenceClassification

# Method 1: a plain PyTorch model plus optimizer
model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3')
if torch.cuda.is_available():
    model.to('cuda')
opt = Adam(model.parameters(), lr=2e-5)

# Method 2: a model for the Trainer
model = AutoModelForSequenceClassification.from_pretrained('rbt3')
########## Creating the evaluation function
import evaluate

acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)  # merge the two metric dicts into one
    return acc
4.7 Training the Model
import evaluate
clf_metrics = evaluate.combine(['accuracy', 'f1'])

def train(epoch=3, log_step=100):
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {k: v.to('cuda') for k, v in batch.items()}
            opt.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            opt.step()
            global_step += 1
            if global_step % log_step == 0:
                print(f'epoch:{ep},global_step:{global_step},loss:{loss.item()}')
        acc = evaluate()
        # print(f'ep:{ep},acc:{acc}')
        print(f'ep:{ep},{acc}')

# Note: this function shadows the imported `evaluate` module, so clf_metrics
# must be created (above) before the name is redefined.
def evaluate():
    model.eval()
    acc_num = 0  # only used by the commented-out manual-accuracy version below
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.to('cuda') for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1)
            clf_metrics.add_batch(predictions=pred.long(), references=batch['labels'].long())
    return clf_metrics.compute()
    # acc_num += (pred.long() == batch['labels'].long()).float().sum()
    # return acc_num / len(validset)
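With both functions defined, the manual loop (Method 1) is kicked off by a single call, assuming the Method 1 model and optimizer from 4.6:
train(epoch=3, log_step=100)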
# Creating the TrainingArguments
train_args = TrainingArguments(output_dir='./checkpoint',       # output directory
                               per_device_eval_batch_size=8,    # eval batch size
                               per_device_train_batch_size=8,   # train batch size
                               logging_steps=10,                # log every 10 steps
                               eval_strategy="epoch",           # evaluation strategy: "epoch" or "steps"
                               save_steps=100,                  # save every 100 steps (only used with save_strategy="steps")
                               save_strategy='epoch',           # checkpoint strategy
                               save_total_limit=2,              # keep at most 2 checkpoints
                               learning_rate=2e-5,              # learning rate
                               weight_decay=1e-5,               # weight decay
                               metric_for_best_model="f1",      # metric used to pick the best model
                               load_best_model_at_end=True)     # reload the best checkpoint at the end
train_args
# Creating the Trainer
from transformers import DataCollatorWithPadding
trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_datasets['train'],
                  eval_dataset=tokenized_datasets['test'],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)
trainer.train()
4.8 Evaluating the Model
trainer.evaluate(tokenized_datasets["test"])
4.9 Prediction
trainer.predict(tokenized_datasets["test"])
sen = '我觉得这家酒店不错,饭很好吃'
id2label = {0: '差评!', 1: '好评!'}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt')
    inputs = {k: v.cuda() for k, v in inputs.items()}  # assumes the model is on GPU
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f'Input: {sen}\nModel prediction: {pred.item()}')
print(pred)
from transformers import pipeline
model.config.id2label = id2label  # so the pipeline reports '差评!'/'好评!' instead of LABEL_0/LABEL_1
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer, device=0)
pipe(sen)