-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate.py
112 lines (99 loc) · 4.22 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import argparse
import json
import os
import sys
import time
import openai
import pandas as pd
from tqdm import tqdm
# ------------------------------------------------------------------------------
# Command-line arguments
# ------------------------------------------------------------------------------
# Each entry is (flags, keyword options) for one add_argument() call:
#   -f/--file   exam file to evaluate (required)
#   -m/--model  chat model name
#   -t/--temp   sampling temperature
_CLI_OPTIONS = (
    (("-f", "--file"), {"type": str, "required": True}),
    (("-m", "--model"), {"type": str, "default": "gpt-4-0613"}),
    (("-t", "--temp"), {"type": float, "default": 0.0}),
)

argparser = argparse.ArgumentParser()
for _flags, _options in _CLI_OPTIONS:
    argparser.add_argument(*_flags, **_options)

args = argparser.parse_args()
# Module-level shortcuts read by ask_gpt().
model, temp = args.model, args.temp
# ------------------------------------------------------------------------------
# Get key
# ------------------------------------------------------------------------------
# The API key is read from a plain-text file in the user's cache directory.
# Only I/O and decoding failures are handled here; a bare ``except`` would also
# swallow KeyboardInterrupt/SystemExit and hide the real cause of the failure.
try:
    with open(os.path.expanduser("~/.cache/oai"), "r") as f:
        openai.api_key = f.read().strip()
except (OSError, UnicodeDecodeError) as err:
    # Report the underlying reason on stderr and stop with a non-zero status.
    print(f"Error reading openai api key from ~/.cache/oai: {err}", file=sys.stderr)
    sys.exit(1)
# ------------------------------------------------------------------------------
# System prompt
# ------------------------------------------------------------------------------
# Sent as the "system" message with every question. Written as one sentence per
# source line; each piece ends with the newline the prompt carries at that point.
sys_prompt = (
    "You are a CFA (chartered financial analyst) taking a test to evaluate your knowledge of finance.\n"
    "You will be given a question along with three possible answers (A, B, and C).\n"
    "Before answering, you should think through the question step-by-step.\n"
    "Explain your reasoning at each step towards answering the question.\n"
    "If calculation is required, do each step of the calculation as a step in your reasoning.\n"
    "Finally, indicate the correct answer\n"
)
def ask_gpt(question, retries=3, delay=3):
    """Ask the chat model one multiple-choice question and return its parsed reply.

    The request forces a call to the ``answer_question`` function, so the reply
    arrives as JSON-encoded function arguments rather than free text. The
    module-level ``model``, ``temp`` and ``sys_prompt`` values are used for the
    request.

    Args:
        question: Question text, sent as the user message.
        retries: Maximum number of attempts before giving up.
        delay: Seconds to wait between two attempts.

    Returns:
        The decoded function-call arguments: a dict with a ``"thinking"`` entry
        (reasoning steps) and an ``"answer"`` entry. If no attempt succeeds, a
        placeholder dict whose answer is ``"N"``, which is not one of the
        A/B/C choices offered to the model.
    """
    # Schema of the single function the model is forced to call. It does not
    # depend on the attempt, so it is built once rather than on every retry.
    functions = [
        {
            "name": "answer_question",
            "description": "Think through and answer a multiple choice question on finance",
            "parameters": {
                "type": "object",
                "properties": {
                    "thinking": {
                        "type": "array",
                        "items": {
                            "type": "string",
                            "description": "Thought and/or calculation for a step in the process of answering the question",
                        },
                        "description": "Step by step thought process and calculations towards answering the question",
                    },
                    "answer": {
                        "type": "string",
                        "description": "The answer to the question",
                        "enum": ["A", "B", "C"],
                    },
                },
                "required": ["thinking", "answer"],
            },
        }
    ]

    for attempt in range(1, retries + 1):
        try:
            res = openai.ChatCompletion.create(
                model=model,
                temperature=temp,
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": question},
                ],
                functions=functions,
                function_call={"name": "answer_question"},
            )
            ans = res.choices[0].message.to_dict()["function_call"]["arguments"]  # type: ignore
            out = json.loads(ans)
            # The arguments are model-generated text: check the keys the caller
            # indexes so a malformed reply is retried instead of crashing later.
            if not isinstance(out, dict) or "thinking" not in out or "answer" not in out:
                raise ValueError(f"unexpected function-call arguments: {ans!r}")
            return out
        except Exception as e:
            # Best-effort retry boundary: any API, network or parsing failure
            # is reported and the request is attempted again.
            print(e)
            # No point in waiting once the last attempt has failed.
            if attempt < retries:
                time.sleep(delay)

    return {"thinking": ["failed to get response"], "answer": "N"}
# ------------------------------------------------------------------------------
# Run the exam
# ------------------------------------------------------------------------------
# The exam file uses ";;" as its field separator; a multi-character separator
# needs pandas' python parsing engine. The loop below reads the "question" and
# "answer" columns of each row.
exam = pd.read_csv(args.file, sep=";;", engine="python")

# Result columns, filled in row by row.
exam["guess"] = ""
exam["thinking"] = ""
exam["correct"] = ""

correct = 0
for i, row in tqdm(exam.iterrows(), total=len(exam)):
    ans = ask_gpt(row.question)
    exam.loc[i, "guess"] = ans["answer"]
    # str() guards against non-string reasoning steps in the model's reply.
    exam.loc[i, "thinking"] = " - ".join(map(str, ans["thinking"]))
    if ans["answer"] == row.answer:
        correct += 1
        exam.loc[i, "correct"] = "yes"
    else:
        exam.loc[i, "correct"] = "no"

total = len(exam)
# Avoid dividing by zero on an empty exam file.
accuracy = correct / total if total else 0.0
# ".2%" scales the fraction by 100, so 3/4 is reported as 75.00% (not 0.75%).
print(f"Score: {correct}/{total} {accuracy:.2%}")
exam.to_csv("gpt4.csv", index=False)