-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
98 lines (76 loc) · 4.23 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
This file contains the functions for evaluating the quality of reviews generated by the GPT-4 model by comparing them to human-written reviews.
The evaluation process involves summarizing the reviews and comparing them to identify matching reviews between human-written and GPT-generated sets.
"""
import json
import openai
from typing import List, Tuple
from prompts import SUMMARY_PROMPT, REVIEW_COMPARISON_RPOMPT
from utils import clean_json_output
def summary_reviews(reviews: List[str], title: str, client: openai.Client) -> Tuple[str, int]:
    """
    Summarize a list of reviews with GPT-4 Turbo, producing a JSON-formatted string.

    Args:
        reviews (List[str]): Review texts to be summarized.
        title (str): Title of the subject the reviews pertain to.
        client (openai.Client): OpenAI client used to issue the chat-completion request.

    Returns:
        Tuple[str, int]: The JSON-formatted summary string and the number of
        top-level entries parsed from that JSON.
    """
    # Separate reviews with blank lines so the model sees them as distinct items.
    joined_reviews = "\n\n".join(reviews) + "\n\n"
    filled_prompt = SUMMARY_PROMPT.format(Title=title, Review_Text=joined_reviews)

    # Send the fully-formatted prompt as a single system message.
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[{"role": "system", "content": filled_prompt}]
    )

    # Clean the raw model output into parseable JSON before measuring its size.
    summary_json = clean_json_output(response.choices[0].message.content)
    entry_count = len(json.loads(summary_json))
    return summary_json, entry_count
def match_reviews(human_reviews: str, gpt_reviews: str, client: openai.Client) -> Tuple[str, int]:
    """
    Compare human-written and GPT-generated review summaries to find matches.

    Args:
        human_reviews (str): JSON-formatted summary of human-written reviews.
        gpt_reviews (str): JSON-formatted summary of GPT-generated reviews.
        client (openai.Client): OpenAI client used to issue the request.

    Returns:
        Tuple[str, int]: The JSON-formatted comparison result and the number of
        top-level entries parsed from that JSON.
    """
    # NOTE: the constant name "RPOMPT" is a typo carried over from prompts.py.
    filled_prompt = REVIEW_COMPARISON_RPOMPT.format(Review_A=human_reviews, Review_B=gpt_reviews)

    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[{"role": "system", "content": filled_prompt}]
    )

    # Clean the raw model output into parseable JSON before measuring its size.
    comparison_json = clean_json_output(response.choices[0].message.content)
    entry_count = len(json.loads(comparison_json))
    return comparison_json, entry_count
def count_hits(matched_reviews: str, threshold: int = 7) -> int:
    """
    Count high-similarity hits in a JSON-formatted review comparison.

    Each top-level value in the parsed JSON is expected to carry a
    "similarity" score; entries at or above `threshold` count as hits.

    Args:
        matched_reviews (str): JSON-formatted string containing comparison data.
        threshold (int): Minimum similarity score for a hit. Default is 7.

    Returns:
        int: Number of entries whose similarity meets or exceeds the threshold.
    """
    comparison = json.loads(matched_reviews)
    hits = 0
    # Keys are irrelevant here; only each entry's similarity score matters.
    for entry in comparison.values():
        if int(entry["similarity"]) >= threshold:
            hits += 1
    return hits
# Example usage:
# human_reviews = ["review 1", "review 2", "review 3"]
# gpt_reviews = ["review 1", "review 2", "review 3"]
# title = "Attention is All You Need"
# client = openai.Client()
# human_summary, total_human_reviews = summary_reviews(human_reviews, title, client)
# gpt_summary, total_gpt_reviews = summary_reviews(gpt_reviews, title, client)
# matched_reviews, matched_reviews_length = match_reviews(human_summary, gpt_summary, client)
# hit_count = count_hits(matched_reviews)
# Calculate metrics:
# from metric import calculate_hit_rate, calculate_jaccard_index, calculate_sorensen_dice_coefficient, calculate_szymkiewicz_simpson_coefficient
# print("Hit Rate:", calculate_hit_rate(hit_count, total_human_reviews))
# print("Jaccard Index:", calculate_jaccard_index(hit_count, total_human_reviews, total_gpt_reviews))
# print("Sørensen-Dice Coefficient:", calculate_sorensen_dice_coefficient(hit_count, total_human_reviews, total_gpt_reviews))
# print("Szymkiewicz-Simpson Coefficient:", calculate_szymkiewicz_simpson_coefficient(hit_count, total_human_reviews, total_gpt_reviews))