diff --git a/README.md b/README.md
index fa3bfdbc..b5fe3549 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ python3 -m fastchat.serve.gradio_web_server
 
 ## Evaluation
 
+Check [evaluation](fastchat/eval) for an end-to-end evaluation pipeline based on GPT-4.
 
 ## Fine-tuning
 ### Data
diff --git a/fastchat/eval/README.md b/fastchat/eval/README.md
index 1a88849f..8a351ff5 100644
--- a/fastchat/eval/README.md
+++ b/fastchat/eval/README.md
@@ -9,16 +9,19 @@ This directory contains end-to-end pipelines for AI-enhanced evaluation. We will
 
 Make sure you have setup the OpenAI API Key in your environment. Then run:
 
 ```bash
-python qa_baseline_gpt35.py --question table/question.jsonl --output table/answer/awswer_gpt35.jsonl
+python qa_baseline_gpt35.py --question table/question.jsonl --output table/answer/answer_gpt35.jsonl
 ```
 
 ### Bard
 
 Unfortunately, Bard has not release its public APIs till now. You may have to enter the anwsers manually. Or you could find a third-party project that interfaces with Bard.
 
-### Vicuna
+### Vicuna and others
 
-TODO: add instructions
+To generate answers with Vicuna or other models, specify the path to the model checkpoint. Then run:
+```bash
+python model_qa.py --model-name /model/path --question-file tables/question.jsonl --answer-file table/answer/answer.jsonl
+```
 
 ## Evaluate Answers Automatically
diff --git a/fastchat/eval/eval_qa_chatgpt.py b/fastchat/eval/eval_qa_chatgpt.py
deleted file mode 100644
index 9e656013..00000000
--- a/fastchat/eval/eval_qa_chatgpt.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""Evaluate QA with ChatGPT."""
-# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
-import argparse
-import json
-import os
-import time
-
-import openai
-import tqdm
-
-
-def get_eval(rule: str, user: str, assistant: str, max_tokens: int):
-    response = openai.ChatCompletion.create(
-        model='gpt-3.5-turbo',
-        messages=[{
-            'role': 'system',
-            'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
-        }, {
-            'role': 'user',
-            'content': f'[User]\n{user}\n[Assistant]\n{assistant}\n[system]\n{rule}',
-        }],
-        # temperature=0.2,  # TODO: figure out which temperature is best for evaluation
-        max_tokens=max_tokens,
-    )
-    return response['choices'][0]['message']['content']
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
-    parser.add_argument('-q', '--question')
-    parser.add_argument('-a', '--answer')
-    parser.add_argument('-r', '--rule')
-    parser.add_argument('-o', '--output')
-    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
-    args = parser.parse_args()
-
-    with open(os.path.expanduser(args.question)) as f:
-        question = json.load(f)
-    questions_dict = {q['id']: q['question'] for q in question['questions']}
-
-    with open(os.path.expanduser(args.answer)) as f:
-        answer = json.load(f)
-    answers_dict = {ans['id']: ans['answer'] for ans in answer['answers']}
-
-    with open(os.path.expanduser(args.rule)) as f:
-        rule = f.read()
-
-    evaluations = []
-
-    for qid, question in tqdm.tqdm(questions_dict.items()):
-        answer = answers_dict.get(qid)
-        if answer is None:
-            evaluations.append({'id': qid, 'score': 0, 'explanation': 'Could not find the answer.'})
-            continue
-        # limit the length of input
-        for retries in range(3):
-            try:
-                eval_result = get_eval(rule, question, answer, args.max_tokens)
-                score, explanation = eval_result.split('\n', 1)
-                evaluations.append({'id': qid, 'score': int(score), 'explanation': explanation})
-                break
-            except Exception as e:
-                print('Error', e)
-                if retries == 2:
-                    evaluations.append({'id': qid, 'score': -1, 'explanation': f'#ERROR: {e}'})
-
-    with open(os.path.expanduser(args.output), 'w') as f:
-        json.dump(evaluations, f)
diff --git a/fastchat/eval/eval.py b/fastchat/eval/model_qa.py
similarity index 64%
rename from fastchat/eval/eval.py
rename to fastchat/eval/model_qa.py
index da32ded2..c0d18da5 100644
--- a/fastchat/eval/eval.py
+++ b/fastchat/eval/model_qa.py
@@ -4,12 +4,11 @@ import os
 import json
 from tqdm import tqdm
-import ray
+import shortuuid
 
 from fastchat.conversation import default_conversation
 from fastchat.utils import disable_torch_init
 
 
-@ray.remote(num_gpus=1)
 @torch.inference_mode()
 def eval_model(model_name, questions_file, answers_file):
     # Model
@@ -20,11 +19,11 @@ def eval_model(model_name, questions_file, answers_file):
         torch_dtype=torch.float16).cuda()
 
-    qa_file = open(os.path.expanduser(questions_file), "r")
+    ques_file = open(os.path.expanduser(questions_file), "r")
     ans_file = open(os.path.expanduser(answers_file), "w")
 
-    for i, line in enumerate(tqdm(qa_file)):
-        idx = json.loads(line)["id"]
-        qs = json.loads(line)["question"]
+    for i, line in enumerate(tqdm(ques_file)):
+        idx = json.loads(line)["question_id"]
+        qs = json.loads(line)["text"]
         cat = json.loads(line)["category"]
         conv = default_conversation.copy()
         conv.append_message(conv.roles[0], qs)
@@ -43,25 +42,20 @@ def eval_model(model_name, questions_file, answers_file):
             index = outputs.index(conv.sep, len(prompt))
 
         outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
-        ans_file.write(json.dumps({"id": idx, "answer": outputs, "category": cat}) + "\n")
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({"question_id": idx,
+                                   "text": outputs,
+                                   "answer_id": ans_id,
+                                   "model_id": model_name,
+                                   "metadata": {}}) + "\n")
         ans_file.flush()
     ans_file.close()
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="facebook/opt-350m") - parser.add_argument("--questions-file", type=str, default="mini_evals/qa.jsonl") - parser.add_argument("--answers-file", type=str, default="answers.jsonl") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answer-file", type=str, default="answer.jsonl") args = parser.parse_args() - ray.init() - handle = [] - for i in range(1, 5): - model_name = args.model_name - model_name.replace('~/', '') - print(model_name) - question_file = f'mini_evals/qa_v2-{i}.jsonl' - answers_file = f'answers/v4/answers-v2-{i}.jsonl' - handle.append(eval_model.remote(model_name, question_file, answers_file)) - - results = ray.get(handle) + eval_model(args.model_name, args.question_file, args.answers_file)