# Dataset used (使用的数据集): ceval-exam
import requests
from datasets import load_dataset, concatenate_datasets
import re
from tqdm import tqdm
import re, time, tiktoken, ollama
from ollama import ChatResponse
from ollama import Options
def llm(model, query, temperature=0.6, stream=False, encoding=None, max_tokens=None):
    """Send one multiple-choice question to an Ollama model and return its answer text.

    Parameters
    ----------
    model : str
        Ollama model name (e.g. "qwen2.5:3b-instruct-q8_0").
    query : str
        Fully formatted question (stem + options) sent as the user message.
    temperature : float
        Sampling temperature forwarded to Ollama.
    stream : bool
        If True, consume the streamed response chunk by chunk so generation
        can be cut short once ``max_tokens`` is exceeded.
    encoding : tiktoken.Encoding | None
        Tokenizer used only to count tokens of a streamed reply. Defaults to
        the "gpt-4" encoding, resolved lazily on first use — the original
        default ran ``tiktoken.encoding_for_model`` at import time, which is
        slow and can fail offline.
    max_tokens : int | None
        Soft cap on streamed output length; ``None`` means unlimited.

    Returns
    -------
    str
        Model reply with any complete ``<think>...</think>`` block removed
        and surrounding whitespace stripped.
    """
    options = Options(
        temperature=temperature,
        num_gpu=0,  # num_gpu=0 forces CPU-only inference
        # num_thread=32,
        # num_ctx=4096,  # context window size
    )
    response = ollama.chat(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "你是一个做题专家。请完成下列单项选择题。\n\n## output format\n只能输出一个选项编号字母,不要有解析等其他任何内容。",
            },
            {
                "role": "user",
                "content": query,
            },
        ],
        options=options,
        stream=stream,
        keep_alive=0,
    )
    if stream:
        chunks = ""
        # Accumulate streamed chunks, cutting generation off once the
        # token budget is exhausted.
        for chunk in response:
            chunks += chunk["message"]["content"]
            if max_tokens is not None:
                if encoding is None:
                    # Lazy default: resolve the tokenizer only when actually
                    # needed instead of at module import.
                    encoding = tiktoken.encoding_for_model("gpt-4")
                if len(encoding.encode(chunks)) > max_tokens:
                    break
        response = chunks
    else:
        response = response["message"]["content"]
    # Strip chain-of-thought from reasoning models (e.g. deepseek-r1).
    # NOTE: a truncated streamed reply may contain <think> without a closing
    # </think>; in that case the block is deliberately left in place.
    if '<think>' in response and '</think>' in response:
        response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    return response.strip()
# C-Eval subject identifiers, as accepted by load_dataset("ceval/data", name=...).
# NOTE: task_chinese_name_list below is index-parallel to this list — keep the
# two lists the same length and in the same order.
task_list = [
    "computer_network",
    "operating_system",
    "computer_architecture",
    "college_programming",
    "college_physics",
    "college_chemistry",
    "advanced_mathematics",
    "probability_and_statistics",
    "discrete_mathematics",
    "electrical_engineer",
    "metrology_engineer",
    "high_school_mathematics",
    "high_school_physics",
    "high_school_chemistry",
    "high_school_biology",
    "middle_school_mathematics",
    "middle_school_biology",
    "middle_school_physics",
    "middle_school_chemistry",
    "veterinary_medicine",
    "college_economics",
    "business_administration",
    "marxism",
    "mao_zedong_thought",
    "education_science",
    "teacher_qualification",
    "high_school_politics",
    "high_school_geography",
    "middle_school_politics",
    "middle_school_geography",
    "modern_chinese_history",
    "ideological_and_moral_cultivation",
    "logic",
    "law",
    "chinese_language_and_literature",
    "art_studies",
    "professional_tour_guide",
    "legal_professional",
    "high_school_chinese",
    "high_school_history",
    "middle_school_history",
    "civil_servant",
    "sports_science",
    "plant_protection",
    "basic_medicine",
    "clinical_medicine",
    "urban_and_rural_planner",
    "accountant",
    "fire_engineer",
    "environmental_impact_assessment_engineer",
    "tax_accountant",
    "physician",
]
# Chinese display names for the subjects above; used only for log/report
# output. Index-parallel with task_list.
task_chinese_name_list = [
    "计算机网络",
    "操作系统",
    "计算机架构",
    "大学编程",
    "大学物理",
    "大学化学",
    "高等数学",
    "概率与统计",
    "离散数学",
    "电气工程师",
    "计量工程师",
    "高中数学",
    "高中物理",
    "高中化学",
    "高中生物学",
    "中学数学",
    "中学生物学",
    "中学物理",
    "中学化学",
    "兽医学",
    "大学经济学",
    "工商管理",
    "马克思主义",
    "毛泽东思想",
    "教育科学",
    "教师资格",
    "高中政治",
    "高中地理",
    "中学政治",
    "中学地理",
    "现代中国史",
    "思想道德修养",
    "逻辑",
    "法律",
    "汉语与文学",
    "艺术研究",
    "专业旅游指南",
    "法律专业",
    "高中汉语",
    "高中历史",
    "中学历史",
    "公务员",
    "体育科学",
    "植物保护",
    "基础医学",
    "临床医学",
    "城市与农村规划",
    "会计",
    "消防工程师",
    "环境影响评估工程师",
    "税务会计",
    "医生",
]
def test_split(model_name, start_index=26):
    """Evaluate an Ollama model on C-Eval subjects and record accuracies.

    For every subject from ``start_index`` onward, loads the dev+val splits
    of the local ``ceval/data`` dataset, asks the model each question via
    :func:`llm`, grades on the first A-D letter found in the reply, and
    appends per-subject and overall accuracy to ``<model_name>.txt``.

    Parameters
    ----------
    model_name : str
        Ollama model identifier; ':' and '/' are replaced with '_' to build
        the results filename.
    start_index : int
        First index in ``task_list`` to evaluate. Defaults to 26 to preserve
        the original hardcoded resume point; pass 0 for a full run.
    """
    encoding = tiktoken.encoding_for_model("gpt-4")
    # ':' and '/' are not safe in filenames on all platforms.
    model_name_write = model_name.replace(":", "_").replace("/", "_")
    sum_total = 0
    sum_correct = 0
    for i in range(start_index, len(task_list)):
        try:
            dataset_tmp = load_dataset(r"ceval/data", name=task_list[i])
            # dev + val are the splits whose answers are available.
            dataset = concatenate_datasets(
                [dataset_tmp["dev"], dataset_tmp["val"]]
            )
            print(f"\nNo.{i}: {task_list[i]}({task_chinese_name_list[i]})数据集加载完成, len(dataset)={len(dataset)}")
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt
            print(f"\nNo.{i}: {task_list[i]}({task_chinese_name_list[i]})数据集加载失败")
            continue
        correct = 0
        total = len(dataset)
        for item in tqdm(dataset, desc=f"No.{i}: Processing"):
            try:
                # Assemble the full single-choice prompt.
                user_prompt = f"{item['question']}\nA. {item['A']}\nB. {item['B']}\nC. {item['C']}\nD. {item['D']}\n答案:"
                model_answer = llm(model_name, user_prompt, stream=True, encoding=encoding, max_tokens=4096)
                # Grade on the first A-D letter appearing in the reply.
                match = re.search(r"[A-D]", model_answer.upper())
                extracted = match.group(0) if match else None
                if extracted and extracted == item["answer"]:
                    correct += 1
            except Exception:  # best-effort per item, but let Ctrl-C through
                print("\nerror.")
        sum_total += total
        sum_correct += correct
        accuracy = correct / total if total else 0.0  # guard: empty subject split
        print(f"No.{i}: {task_list[i]}({task_chinese_name_list[i]})数据集准确率: {correct}/{total} = {accuracy:.2%}")
        with open(f"{model_name_write}.txt", "a", encoding="utf-8") as f:
            f.write(f"No.{i}: {task_list[i]}({task_chinese_name_list[i]})数据集准确率: {correct}/{total} = {accuracy:.2%}\n\n")
    # guard: every dataset load may have failed, leaving sum_total == 0
    overall = sum_correct / sum_total if sum_total else 0.0
    with open(f"{model_name_write}.txt", "a", encoding="utf-8") as f:
        f.write(f"总准确率: {sum_correct}/{sum_total} = {overall:.2%}\n\n")
    print(f"总准确率: {sum_correct}/{sum_total} = {overall:.2%}")
# Models previously evaluated with this script (kept for reference):
# huihui_ai/qwen2.5-abliterate:7b-instruct-q4_K_M
# qwen2.5:3b-instruct-q8_0
# qwen2.5:7b-instruct-q5_K_M
# deepseek-r1-7b:latest
if __name__ == "__main__":
    # Guarded entry point: importing this module no longer launches the
    # full benchmark as a side effect.
    # test_split(model_name="qwen2.5:3b-instruct-q8_0")
    # test_split(model_name="qwen2.5:7b-instruct-q5_K_M")
    # test_split(model_name="huihui_ai/qwen2.5-abliterate:7b-instruct-q4_K_M")
    # test_split(model_name="qwen2.5:1.5b")
    # test_split(model_name="qwen2.5:1.5b-instruct-fp16")
    # test_split(model_name="qwen2.5:3b")
    # test_split(model_name="gemma3:4b")
    # test_split(model_name="qwen2.5:7b")
    # test_split(model_name="gemma3:4b-it-q8_0")
    # test_split(model_name="qwen2.5:0.5b-instruct-fp16")
    # test_split(model_name="qwen2.5:0.5b")
    test_split(model_name="deepseek-r1:1.5b")
    # test_split(model_name="deepseek-r1:1.5b-qwen-distill-fp16")
    # test_split(model_name="deepseek-r1:7b")