I. The Hugging Face PEFT Fine-Tuning Framework
1. Definition
PEFT is a Python library that provides a range of parameter-efficient fine-tuning methods for large pre-trained models.
The traditional fine-tuning paradigm updates all of a model's parameters for every downstream task. With today's enormous parameter counts, this approach has become prohibitively expensive and impractical. PEFT takes a more efficient route: it trains only a small number of extra parameters, for example prompt parameters (Prompt Tuning) or reparameterization methods such as low-rank adaptation (LoRA), which drastically reduces the number of trainable parameters during fine-tuning.
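To make this concrete, the minimal sketch below wraps a small causal language model with a LoRA adapter via PEFT and prints how few parameters actually become trainable. The `gpt2` checkpoint is only a small stand-in used for illustration; any causal LM supported by PEFT behaves the same way.

```python
# A minimal sketch of the PEFT/LoRA workflow: wrap a base model with a LoRA
# adapter and compare the trainable parameter count against the full model.
# "gpt2" is just an illustrative stand-in model.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("gpt2")

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # causal language modeling task
    r=8,                           # rank of the low-rank update matrices
    lora_alpha=32,                 # scaling factor applied to the LoRA update
    lora_dropout=0.1,              # dropout on the LoRA branch
)

peft_model = get_peft_model(base_model, lora_config)
# Prints the trainable vs. total parameter counts, typically well under 1% trainable
peft_model.print_trainable_parameters()
```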
II. Qwen1.5-7B-Chat Model Training / Inference / Serving
1. Environment Setup
```
datasets==2.21.0
transformers==4.37.0
torch==1.13.0
accelerate==0.30.1
peft==0.4.0
numpy==1.26.4
Jinja2==3.1.4
```
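As a quick sanity check that the installed versions match the pins above, a small snippet like the following can be used; it only reads installed package metadata and is not specific to this project:

```python
# Print the installed versions of the pinned packages so they can be
# compared against the requirements list above.
from importlib.metadata import version, PackageNotFoundError

for pkg in ["datasets", "transformers", "torch", "accelerate", "peft", "numpy", "Jinja2"]:
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")
```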
2. Persona Customization Data
```json
[
  {
    "instruction": "你是谁?",
    "input": "",
    "output": "我是一个语言模型,我叫小飞同学,可以为您做很多事情。请问您有什么问题需要我帮助吗?"
  },
  {
    "instruction": "你是什么?",
    "input": "",
    "output": "我是一个语言模型,我叫小飞同学,可以为您做很多事情。请问您有什么问题需要我帮助吗?"
  },
  {
    "instruction": "请问您是?",
    "input": "",
    "output": "我是一个语言模型,我叫小飞同学,可以为您做很多事情。请问您有什么问题需要我帮助吗?"
  },
  {
    "instruction": "你叫什么?",
    "input": "",
    "output": "我是一个语言模型,我叫小飞同学,可以为您做很多事情。请问您有什么问题需要我帮助吗?"
  },
  {
    "instruction": "你的身份是?",
    "input": "",
    "output": "我是一个语言模型,我叫小飞同学,可以为您做很多事情。请问您有什么问题需要我帮助吗?"
  }
]
```
3. Model Training
```python
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, TaskType, get_peft_model

# Load the persona JSON file and convert it into a Hugging Face Dataset
df = pd.read_json('./train.json')
ds = Dataset.from_pandas(df)

model_path = './huggingface/model/Qwen1.5-7B-Chat'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)

def process_func(example):
    MAX_LENGTH = 384
    # Build the ChatML-formatted prompt (system + user) and the assistant response
    instruction = tokenizer(
        f"<|im_start|>system\n现在你要扮演人工智能智能客服助手--小飞同学<|im_end|>\n"
        f"<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # Mask the prompt tokens with -100 so the loss is only computed on the response
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate overly long samples
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

tokenized_id = ds.map(process_func, remove_columns=ds.column_names)

model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16)
model.enable_input_require_grads()  # required when gradient checkpointing is enabled

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # training mode
    r=8,                   # LoRA rank
    lora_alpha=32,         # LoRA alpha; see the LoRA paper for details
    lora_dropout=0.1,      # dropout ratio
)
model = get_peft_model(model, config)

args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=10,
    save_steps=50,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
trainer.train()
```
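Optionally, once training is finished, the LoRA adapter can be merged back into the base model so that deployment does not depend on PEFT at runtime. The sketch below assumes the paths used in this article; `./output/qwen1.5-7b-chat-merged` is a hypothetical output directory:

```python
# Optional: merge the trained LoRA adapter back into the base model so the
# result can be loaded later as a plain transformers checkpoint.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

model_path = './huggingface/model/Qwen1.5-7B-Chat'
lora_path = './output/checkpoint-50'                 # trained adapter checkpoint
merged_path = './output/qwen1.5-7b-chat-merged'      # hypothetical output directory

base = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
merged = PeftModel.from_pretrained(base, lora_path).merge_and_unload()

merged.save_pretrained(merged_path)
AutoTokenizer.from_pretrained(model_path).save_pretrained(merged_path)
```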
Screenshot of the model output directory:
4. Model Inference
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, LoraConfig, TaskType

model_path = './huggingface/model/Qwen1.5-7B-Chat'
lora_path = './output/checkpoint-50'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the base model
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16)

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=True,   # inference mode
    r=8,                   # LoRA rank
    lora_alpha=32,         # LoRA alpha; see the LoRA paper for details
    lora_dropout=0.1,      # dropout ratio
)

# Attach the trained LoRA weights
model = PeftModel.from_pretrained(model, model_id=lora_path, config=config)

prompt = "你是星火大模型吗?"
messages = [
    {"role": "system", "content": "现在你要扮演人工智能智能客服助手--小飞同学"},
    {"role": "user", "content": prompt},
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to('cuda')

generated_ids = model.generate(input_ids=model_inputs.input_ids, max_new_tokens=512)
# Strip the prompt tokens so only the newly generated reply is decoded
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
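As a quick spot check of the persona, the model and tokenizer loaded above can be reused to ask a few of the training questions and confirm that the answers identify as 小飞同学. This is just a convenience wrapper around the same generate call:

```python
# Spot-check the persona by reusing the model/tokenizer loaded in the
# inference script above on a few of the training questions.
def chat(question: str) -> str:
    messages = [
        {"role": "system", "content": "现在你要扮演人工智能智能客服助手--小飞同学"},
        {"role": "user", "content": question},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to('cuda')
    output_ids = model.generate(input_ids=inputs.input_ids, max_new_tokens=512)
    # Drop the prompt tokens before decoding so only the reply is returned
    reply_ids = output_ids[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

for q in ["你是谁?", "你叫什么?", "你的身份是?"]:
    print(q, "->", chat(q))
```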
Screenshot of the model inference log:
5. SSE Model Service Based on FastAPI
```python
import json
from threading import Thread

import torch
import uvicorn
from fastapi import FastAPI
from sse_starlette.sse import EventSourceResponse
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import LoraConfig, TaskType, PeftModel

# transformers is the Hugging Face library used to load Transformer-architecture models
app = FastAPI()

def load_model():
    model_path = './huggingface/model/Qwen1.5-7B-Chat'
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Load the base model (add attn_implementation="flash_attention_2" to enable FlashAttention)
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16)
    # Attach the trained LoRA weights
    lora_path = './output/checkpoint-50'
    config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        inference_mode=True,   # inference mode
        r=8,                   # LoRA rank
        lora_alpha=32,         # LoRA alpha; see the LoRA paper for details
        lora_dropout=0.1,      # dropout ratio
    )
    model = PeftModel.from_pretrained(model, model_id=lora_path, config=config)
    return tokenizer, model

tokenizer, model = load_model()

def infer_model(tokenizer, model):
    prompt = "你是星火大模型吗?"
    messages = [
        {"role": "system", "content": "现在你要扮演人工智能智能客服助手--小飞同学"},
        {"role": "user", "content": prompt},
    ]
    # Build the model input from the chat template
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to('cuda')
    # TextIteratorStreamer yields decoded text pieces as they are generated
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation in a background thread so the main thread can stream the output
    generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=512)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for res in streamer:
        yield json.dumps({"data": res}, ensure_ascii=False)

@app.get('/predict')
async def predict():
    return EventSourceResponse(infer_model(tokenizer, model))

if __name__ == '__main__':
    # Add reload=True while debugging; remove it for production
    uvicorn.run(app, host="0.0.0.0", port=6605, log_level="info")
```
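The question is hard-coded inside `infer_model` above. As an optional extension (not part of the original service), the endpoint sketched below lets the client pass its own question as a query parameter while streaming the answer the same way; it assumes the same `app`, `tokenizer`, `model`, and imports as the service above:

```python
# Optional extension: GET /predict_query?query=... streams the answer to a
# client-supplied question instead of the hard-coded prompt.
@app.get('/predict_query')
async def predict_query(query: str = "你是谁?"):
    def gen():
        messages = [
            {"role": "system", "content": "现在你要扮演人工智能智能客服助手--小飞同学"},
            {"role": "user", "content": query},
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text], return_tensors="pt").to('cuda')
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        Thread(target=model.generate,
               kwargs=dict(model_inputs, streamer=streamer, max_new_tokens=512)).start()
        for piece in streamer:
            yield json.dumps({"data": piece}, ensure_ascii=False)
    return EventSourceResponse(gen())
```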
Client invocation example:
```python
import json
import requests

def listen_sse(url):
    # Send a GET request to the SSE endpoint and keep the connection open
    with requests.get(url, stream=True, timeout=20) as response:
        try:
            # Make sure the request succeeded
            response.raise_for_status()
            # Read the response line by line
            result = ""
            for line in response.iter_lines():
                if line:
                    event_data = line.decode('utf-8')
                    if event_data.startswith('data:'):
                        # Drop the 'data:' prefix to get the JSON payload
                        payload = event_data[len('data:'):]
                        line_data = json.loads(payload)
                        result += line_data["data"]
                        print(result)
        except requests.exceptions.HTTPError as err:
            print(f"HTTP error: {err}")
        except Exception as err:
            print(f"An error occurred: {err}")

sse_url = 'http://127.0.0.1:6605/predict'
listen_sse(sse_url)
```
Screenshot of the streaming output from the service: