diff --git a/README.md b/README.md
index fa3bfdbc..b5fe3549 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ python3 -m fastchat.serve.gradio_web_server
 
 ## Evaluation
 
+Check [evaluation](fastchat/eval) for an end-to-end evaluation pipeline based on GPT-4.
 
 ## Fine-tuning
 ### Data
diff --git a/fastchat/eval/README.md b/fastchat/eval/README.md
index 1a88849f..8a351ff5 100644
--- a/fastchat/eval/README.md
+++ b/fastchat/eval/README.md
@@ -9,16 +9,19 @@ This directory contains end-to-end pipelines for AI-enhanced evaluation. We will
 
 Make sure you have setup the OpenAI API Key in your environment. Then run:
 
 ```bash
-python qa_baseline_gpt35.py --question table/question.jsonl --output table/answer/awswer_gpt35.jsonl
+python qa_baseline_gpt35.py --question table/question.jsonl --output table/answer/answer_gpt35.jsonl
 ```
 
 ### Bard
 
 Unfortunately, Bard has not release its public APIs till now. You may have to enter the anwsers manually. Or you could find a third-party project that interfaces with Bard.
 
-### Vicuna
+### Vicuna and others
 
-TODO: add instructions
+To generate answers with Vicuna or other models, specify the path to the model checkpoint. Then run:
+```bash
+python model_qa.py --model-name /model/path --question-file tables/question.jsonl --answer-file table/answer/answer.jsonl
+```
 
 ## Evaluate Answers Automatically
diff --git a/fastchat/eval/eval_qa_chatgpt.py b/fastchat/eval/eval_qa_chatgpt.py
deleted file mode 100644
index 9e656013..00000000
--- a/fastchat/eval/eval_qa_chatgpt.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""Evaluate QA with ChatGPT."""
-# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
-import argparse
-import json
-import os
-import time
-
-import openai
-import tqdm
-
-
-def get_eval(rule: str, user: str, assistant: str, max_tokens: int):
-    response = openai.ChatCompletion.create(
-        model='gpt-3.5-turbo',
-        messages=[{
-            'role': 'system',
-            'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
-        }, {
-            'role': 'user',
-            'content': f'[User]\n{user}\n[Assistant]\n{assistant}\n[system]\n{rule}',
-        }],
-        # temperature=0.2,  # TODO: figure out which temperature is best for evaluation
-        max_tokens=max_tokens,
-    )
-    return response['choices'][0]['message']['content']
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
-    parser.add_argument('-q', '--question')
-    parser.add_argument('-a', '--answer')
-    parser.add_argument('-r', '--rule')
-    parser.add_argument('-o', '--output')
-    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
-    args = parser.parse_args()
-
-    with open(os.path.expanduser(args.question)) as f:
-        question = json.load(f)
-    questions_dict = {q['id']: q['question'] for q in question['questions']}
-
-    with open(os.path.expanduser(args.answer)) as f:
-        answer = json.load(f)
-    answers_dict = {ans['id']: ans['answer'] for ans in answer['answers']}
-
-    with open(os.path.expanduser(args.rule)) as f:
-        rule = f.read()
-
-    evaluations = []
-
-    for qid, question in tqdm.tqdm(questions_dict.items()):
-        answer = answers_dict.get(qid)
-        if answer is None:
-            evaluations.append({'id': qid, 'score': 0, 'explanation': 'Could not find the answer.'})
-            continue
-        # limit the length of input
-        for retries in range(3):
-            try:
-                eval_result = get_eval(rule, question, answer, args.max_tokens)
-                score, explanation = eval_result.split('\n', 1)
-                evaluations.append({'id': qid, 'score': int(score), 'explanation': explanation})
-                break
-            except Exception as e:
-                print('Error', e)
-                if retries == 2:
-                    evaluations.append({'id': qid, 'score': -1, 'explanation': f'#ERROR: {e}'})
-
-    with open(os.path.expanduser(args.output), 'w') as f:
-        json.dump(evaluations, f)
diff --git a/fastchat/eval/eval.py b/fastchat/eval/model_qa.py
similarity index 64%
rename from fastchat/eval/eval.py
rename to fastchat/eval/model_qa.py
index da32ded2..c0d18da5 100644
--- a/fastchat/eval/eval.py
+++ b/fastchat/eval/model_qa.py
@@ -4,12 +4,11 @@ import os
 import json
 from tqdm import tqdm
-import ray
+import shortuuid
 
 from fastchat.conversation import default_conversation
 from fastchat.utils import disable_torch_init
 
 
-@ray.remote(num_gpus=1)
 @torch.inference_mode()
 def eval_model(model_name, questions_file, answers_file):
     # Model
@@ -20,11 +19,11 @@ def eval_model(model_name, questions_file, answers_file):
         torch_dtype=torch.float16).cuda()
 
-    qa_file = open(os.path.expanduser(questions_file), "r")
+    ques_file = open(os.path.expanduser(questions_file), "r")
     ans_file = open(os.path.expanduser(answers_file), "w")
 
-    for i, line in enumerate(tqdm(qa_file)):
-        idx = json.loads(line)["id"]
-        qs = json.loads(line)["question"]
+    for i, line in enumerate(tqdm(ques_file)):
+        idx = json.loads(line)["question_id"]
+        qs = json.loads(line)["text"]
         cat = json.loads(line)["category"]
         conv = default_conversation.copy()
         conv.append_message(conv.roles[0], qs)
@@ -43,25 +42,20 @@ def eval_model(model_name, questions_file, answers_file):
             index = outputs.index(conv.sep, len(prompt))
 
         outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
-        ans_file.write(json.dumps({"id": idx, "answer": outputs, "category": cat}) + "\n")
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({"question_id": idx,
+                                   "text": outputs,
+                                   "answer_id": ans_id,
+                                   "model_id": model_name,
+                                   "metadata": {}}) + "\n")
         ans_file.flush()
     ans_file.close()
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="facebook/opt-350m") - parser.add_argument("--questions-file", type=str, default="mini_evals/qa.jsonl") - parser.add_argument("--answers-file", type=str, default="answers.jsonl") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answer-file", type=str, default="answer.jsonl") args = parser.parse_args() - ray.init() - handle = [] - for i in range(1, 5): - model_name = args.model_name - model_name.replace('~/', '') - print(model_name) - question_file = f'mini_evals/qa_v2-{i}.jsonl' - answers_file = f'answers/v4/answers-v2-{i}.jsonl' - handle.append(eval_model.remote(model_name, question_file, answers_file)) - - results = ray.get(handle) + eval_model(args.model_name, args.question_file, args.answers_file)