笔记
作业
原7b模型问题耗时: 4.5s
lmdeploy推理耗时: 0.43s
不知道是否因为没有正确的输出
lmdeploy kv-cache推理耗时:2.9s
- 推理时新增 past_key_values 参数,该参数就会以追加方式保存每一轮的K V值。kvcache变量内容为((k,v), (k,v), …, (k,v)),即有 nlayersn_{layers}n_{layers} 个 k,v 组成的一个元组,其中 k 和 v 的维度均为 [b, n_head, s, head_dims]。这里可以顺带计算出每轮推理对应的 cache 数据量为 2∗b∗s∗h∗nlayers2bshn_{layers}2bshn_{layers} ,这里 sss 值等于当前轮次值。以GPT3-175B为例,假设以 float16 来保存 KV cache,senquence长度为100,batchsize=1,则 KV cache占用显存为 2×100×12288×96×2 Byte= 472MB。推理输出的token直接作为下一轮的输入,不再拼接,因为上文信息已经在 kvcache 中
VLLM
from transformers import AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams
class vLLMWrapper(object):
def __init__(
self,
model_dir,
tensor_parallel_size=1,
gpu_memory_utilization=0.9,
dtype="float16",
quantization=None,
):
self.generation_config = GenerationConfig.from_pretrained(
model_dir, trust_remote_code=True
)
self.tokenizer = AutoTokenizer.from_pretrained(
model_dir, trust_remote_code=True
)
# self.tokenizer.eos_token_id = self.generation_config.eos_token_id
self.stop_words_ids = [
# self.tokenizer.im_start_id,
# self.tokenizer.im_end_id,
self.tokenizer.eos_token_id,
]
os.environ["VLLM_USE_MODELSCOPE"] = "True"
self.model = LLM(
model=model_dir,
tokenizer=model_dir,
tensor_parallel_size=tensor_parallel_size, # tp
trust_remote_code=True,
quantization=quantization,
gpu_memory_utilization=gpu_memory_utilization, # 0.6
dtype=dtype,
)
def generate(self, query, history=None, system=None, extra_stop_words_ids=None):
if isinstance(inputs, str):
inputs = [inputs]
if history is None:
history = []
else:
history = copy.deepcopy(history)
sampling_params = SamplingParams(
temperature=1.0, top_p=0.5, max_tokens=512, stop=self.stop_words_ids
)
response = self.model.generate(
prompt_token_ids=[prompt_tokens],
sampling_params=sampling_params,
use_tqdm=False,
)
response = [resp.outputs[0].text for resp in response]
response_token_ids = remove_stop_words(
req_output.outputs[0].token_ids, stop_words_ids
)
response = self.tokenizer.decode(response_token_ids)
history.append((query, response))
return response, history