C-Eval evaluation of the Wanyv-50M model with xiaothink
Install xiaothink from PyPI:
pip install xiaothink==1.0.2
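To confirm the installation, you can try importing the inference entry point used by the evaluation script below (a minimal sketch; it assumes QianyanModel is exported by xiaothink.llm.inference.test_formal, as the wildcard import in the script suggests):

# Quick import check for xiaothink's inference module
from xiaothink.llm.inference.test_formal import QianyanModel
print(QianyanModel)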
Download the model:
万语-50M
Run the evaluation (after changing the model path, the script below can be run directly; results are saved in the output folder):
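The script reads the official C-Eval data from the working directory; the paths hard-coded in the code assume this layout:

subject_mapping.json
data/val/<subject>_val.csv
data/dev/<subject>_dev.csv
data/test/<subject>_test.csv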
import os
import json
import time
from collections import Counter

import pandas as pd
from tqdm import tqdm

from xiaothink.llm.inference.test_formal import *

# Load the Wanyv-50M checkpoint (edit ckpt_dir to point at your local copy of the model).
model = QianyanModel(MT=40.231,
                     ckpt_dir=r'path\to\wanyv\model\ckpt_test_40_2_3_1_formal_open')


def chat_x(inp, temp=0.3):
    # Single-turn chat; generation stops at the first "。".
    return model.chat_SingleTurn(inp, temp=temp, loop=True, stop='。')
def pre(question: str, options_str: str) -> str:
    """Ask the model one multiple-choice question and return a single letter A-D."""
    question = question.replace('答案:', '')
    options_str = options_str.replace('答案:', '')

    # If the option letters are not already embedded in the question, append the options explicitly.
    if 'A' not in question:
        prompt_template = '''题目:{question}\n{options_str}\n让我们首先一步步思考,最后在回答末尾给出一个字母作为你的答案(A或B或C或D)'''
        prompt_template2 = '''题目:{question}\n选项:{options_str}\n给出答案'''
        prompt_template3 = '''{question}\n{options_str}\n'''
        prompt_template4 = '''{question}\n{options_str}\n给出你的选择'''
        prompt_template5 = '''题目:{question}\n{options_str}\n答案:'''
    else:
        prompt_template = '''题目:{question}\n让我们首先一步步思考,最后在回答末尾给出一个字母作为你的答案(A或B或C或D)'''
        prompt_template2 = '''题目:{question}\n给出答案'''
        prompt_template3 = '''{question}\n'''
        prompt_template4 = '''{question}\n给出你的选择'''
        prompt_template5 = '''题目:{question}\n答案:'''

    ansd = {}
    # Query the model once; increase the range to sample several times and majority-vote.
    answers = []
    for _ in range(1):
        response = chat_x(prompt_template.format(question=question, options_str=options_str))
        # Take the first option letter that appears in the response.
        for option in 'ABCD':
            if option in response:
                answers.append(option)
                ansd[option] = response
                break
        else:
            print('AI选项检查:', repr(response))
            answers.append('A')  # Default to 'A' if no option letter is found
            ansd['A'] = ''

    # Count occurrences of each answer and keep the most frequent one
    # (ties are broken alphabetically).
    answer_counts = Counter(answers)
    most_common_answers = answer_counts.most_common()
    highest_frequency = most_common_answers[0][1]
    most_frequent_answers = [answer for answer, count in most_common_answers if count == highest_frequency]
    final_answer = min(most_frequent_answers)

    # Log every prompt variant together with the chosen response as JSON lines
    # (instruction / input / output format) for later inspection or fine-tuning.
    templates = [prompt_template, prompt_template2, prompt_template3, prompt_template4, prompt_template5]
    with open('ceval_text_sklm.txt', 'a', encoding='utf-8') as f:
        for template in templates:
            record = {
                'instruction': template.format(question=question, options_str=options_str),
                'input': '',
                'output': ansd[final_answer],
            }
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    return final_answer
class Llama_Evaluator:
    def __init__(self, choices, k):
        self.choices = choices
        self.k = k

    def eval_subject(self, subject_name,
                     test_df,
                     dev_df=None,
                     few_shot=False,
                     cot=False,
                     save_result_dir=None,
                     with_prompt=False,
                     constrained_decoding=False,
                     do_test=False):
        all_answers = {}
        correct_num = 0
        if save_result_dir:
            result = []
            score = []
        if few_shot:
            history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
        else:
            history = ''
        answers = ['NA'] * len(test_df) if do_test is True else list(test_df['answer'])
        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
            question = self.format_example(row, include_answer=False, cot=cot, with_prompt=with_prompt)
            options_str = self.format_options(row)
            instruction = history + question + "\n选项:" + options_str
            ans = pre(instruction, options_str)
            if ans == answers[row_index]:
                correct_num += 1
                correct = 1
            else:
                correct = 0
            print(f"\n=======begin {str(row_index)}=======")
            print("question: ", question)
            print("options: ", options_str)
            print("ans: ", ans)
            print("ground truth: ", answers[row_index], "\n")
            if save_result_dir:
                result.append(ans)
                score.append(correct)
            print(f"=======end {str(row_index)}=======")
            all_answers[str(row_index)] = ans
        correct_ratio = 100 * correct_num / len(answers)
        if save_result_dir:
            test_df['model_output'] = result
            test_df['correctness'] = score
            test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_test.csv'))
        return correct_ratio, all_answers

    def format_example(self, line, include_answer=True, cot=False, with_prompt=False):
        example = line['question']
        for choice in self.choices:
            example += f'\n{choice}. {line[f"{choice}"]}'
        if include_answer:
            if cot:
                example += "\n答案:让我们一步一步思考,\n" + \
                    line["explanation"] + f"\n所以答案是{line['answer']}。\n\n"
            else:
                example += '\n答案:' + line["answer"] + '\n\n'
        else:
            if with_prompt is False:
                if cot:
                    example += "\n答案:让我们一步一步思考,\n1."
                else:
                    example += '\n答案:'
            else:
                if cot:
                    example += "\n答案是什么?让我们一步一步思考,\n1."
                else:
                    example += '\n答案是什么? '
        return example

    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
        prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        for i in range(k):
            prompt += self.format_example(
                dev_df.iloc[i, :],
                include_answer=True,
                cot=cot
            )
        return prompt

    def format_options(self, line):
        options_str = ""
        for choice in self.choices:
            options_str += f"{choice}: {line[f'{choice}']} "
        return options_str
# C-Eval option labels used by the evaluator.
choices = ["A", "B", "C", "D"]


def main(model_path, output_dir, take, few_shot=False, cot=False, with_prompt=False,
         constrained_decoding=False, do_test=False, n_times=1, do_save_csv=False):
    assert os.path.exists("subject_mapping.json"), "subject_mapping.json not found!"
    with open("subject_mapping.json", encoding='utf-8') as f:
        subject_mapping = json.load(f)
    filenames = os.listdir("data/val")
    subject_list = [val_file.replace("_val.csv", "") for val_file in filenames]
    accuracy, summary = {}, {}

    run_date = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
    save_result_dir = os.path.join(output_dir, f"take{take}")
    if not os.path.exists(save_result_dir):
        os.makedirs(save_result_dir, exist_ok=True)

    evaluator = Llama_Evaluator(choices=choices, k=n_times)

    all_answers = {}
    for index, subject_name in tqdm(list(enumerate(subject_list)), desc='主进度'):
        print(f"{index / len(subject_list)} Inference starts at {run_date} on {model_path} with subject of {subject_name}!")
        val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
        dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
        test_file_path = os.path.join('data/test', f'{subject_name}_test.csv')

        val_df = pd.read_csv(val_file_path) if not do_test else pd.read_csv(test_file_path)
        dev_df = pd.read_csv(dev_file_path) if few_shot else None

        correct_ratio, answers = evaluator.eval_subject(subject_name, val_df, dev_df,
                                                        save_result_dir=save_result_dir if do_save_csv else None,
                                                        few_shot=few_shot,
                                                        cot=cot,
                                                        with_prompt=with_prompt,
                                                        constrained_decoding=constrained_decoding,
                                                        do_test=do_test)
        print(f"Subject: {subject_name}")
        print(f"Acc: {correct_ratio}")
        accuracy[subject_name] = correct_ratio
        summary[subject_name] = {"score": correct_ratio,
                                 "num": len(val_df),
                                 "correct": correct_ratio * len(val_df) / 100}
        all_answers[subject_name] = answers

    json.dump(all_answers, open(save_result_dir + '/submission.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=4)
    print("Accuracy:")
    for k, v in accuracy.items():
        print(k, ": ", v)

    # Aggregate per-subject results into the four C-Eval categories.
    total_num = 0
    total_correct = 0
    summary['grouped'] = {
        "STEM": {"correct": 0.0, "num": 0},
        "Social Science": {"correct": 0.0, "num": 0},
        "Humanities": {"correct": 0.0, "num": 0},
        "Other": {"correct": 0.0, "num": 0}
    }
    for subj, info in subject_mapping.items():
        group = info[2]
        summary['grouped'][group]["num"] += summary[subj]['num']
        summary['grouped'][group]["correct"] += summary[subj]['correct']
    for group, info in summary['grouped'].items():
        info['score'] = info["correct"] / info["num"]
        total_num += info["num"]
        total_correct += info["correct"]
    summary['All'] = {"score": total_correct / total_num, "num": total_num, "correct": total_correct}

    json.dump(summary, open(save_result_dir + '/summary.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=2)
# Example usage
if __name__ == "__main__":
    model_path = "path/to/model"   # only used in log messages; the model itself is loaded at the top of the script
    output_dir = "output"
    take = 0
    few_shot = False
    cot = False
    with_prompt = False
    constrained_decoding = False
    do_test = True                 # True: run on the blind test split (for a C-Eval submission); False: score locally on val
    n_times = 1
    do_save_csv = False
    main(model_path, output_dir, take, few_shot, cot, with_prompt, constrained_decoding, do_test, n_times, do_save_csv)
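After a run, per-question answers are written to output/take{take}/submission.json and aggregate scores to output/take{take}/summary.json (the paths follow from output_dir and take above); the prompt/answer pairs logged by pre() accumulate in ceval_text_sklm.txt in the working directory. Note that with do_test=True the ground-truth answers are placeholders, so the local scores are not meaningful and submission.json is the file to keep. A minimal sketch for inspecting the summary, assuming the default output_dir="output" and take=0:

import json

# Load the aggregate scores written by main()
with open('output/take0/summary.json', encoding='utf-8') as f:
    summary = json.load(f)

print(summary['All'])  # overall score, total question count, total correct
for group in ('STEM', 'Social Science', 'Humanities', 'Other'):
    print(group, summary['grouped'][group])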