-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
98 lines (76 loc) · 4.23 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
This file contains the functions for evaluating the quality of reviews generated by the GPT-4 model by comparing them to human-written reviews.
The evaluation process involves summarizing the reviews and comparing them to identify matching reviews between human-written and GPT-generated sets.
"""
import json
import openai
from typing import List, Tuple
from prompts import SUMMARY_PROMPT, REVIEW_COMPARISON_RPOMPT
from utils import clean_json_output
def summary_reviews(reviews: List[str], title: str, client: openai.Client) -> Tuple[str, int]:
    """
    Summarize a list of reviews with GPT-4 Turbo, producing a JSON-formatted string.

    Args:
        reviews (List[str]): Review texts to be summarized.
        title (str): Title of the subject the reviews pertain to.
        client (openai.Client): OpenAI client used to issue the chat-completion request.

    Returns:
        Tuple[str, int]: The JSON-formatted summary string and the number of
        top-level entries parsed from that JSON.
    """
    # Separate reviews with blank lines so the model sees them as distinct items.
    joined_reviews = "\n\n".join(reviews) + "\n\n"
    filled_prompt = SUMMARY_PROMPT.format(Title=title, Review_Text=joined_reviews)

    # Send the fully-formatted prompt as a single system message.
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[{"role": "system", "content": filled_prompt}]
    )

    # Clean the raw model output into parseable JSON before measuring its size.
    summary_json = clean_json_output(response.choices[0].message.content)
    entry_count = len(json.loads(summary_json))
    return summary_json, entry_count
def match_reviews(human_reviews: str, gpt_reviews: str, client: openai.Client) -> Tuple[str, int]:
    """
    Compare human-written and GPT-generated review summaries to find matches.

    Args:
        human_reviews (str): JSON-formatted summary of human-written reviews.
        gpt_reviews (str): JSON-formatted summary of GPT-generated reviews.
        client (openai.Client): OpenAI client used to issue the request.

    Returns:
        Tuple[str, int]: The JSON-formatted comparison result and the number of
        top-level entries parsed from that JSON.
    """
    # NOTE: the constant name "RPOMPT" is a typo carried over from prompts.py.
    filled_prompt = REVIEW_COMPARISON_RPOMPT.format(Review_A=human_reviews, Review_B=gpt_reviews)

    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[{"role": "system", "content": filled_prompt}]
    )

    # Clean the raw model output into parseable JSON before measuring its size.
    comparison_json = clean_json_output(response.choices[0].message.content)
    entry_count = len(json.loads(comparison_json))
    return comparison_json, entry_count
def count_hits(matched_reviews: str, threshold: int = 7) -> int:
    """
    Count high-similarity hits in a JSON-formatted review comparison.

    Each top-level value in the parsed JSON is expected to carry a
    "similarity" score; entries at or above `threshold` count as hits.

    Args:
        matched_reviews (str): JSON-formatted string containing comparison data.
        threshold (int): Minimum similarity score for a hit. Default is 7.

    Returns:
        int: Number of entries whose similarity meets or exceeds the threshold.
    """
    comparison = json.loads(matched_reviews)
    hits = 0
    # Keys are irrelevant here; only each entry's similarity score matters.
    for entry in comparison.values():
        if int(entry["similarity"]) >= threshold:
            hits += 1
    return hits
# Example usage:
# human_reviews = ["review 1", "review 2", "review 3"]
# gpt_reviews = ["review 1", "review 2", "review 3"]
# title = "Attention is All You Need"
# client = openai.Client()
# human_summary, total_human_reviews = summary_reviews(human_reviews, title, client)
# gpt_summary, total_gpt_reviews = summary_reviews(gpt_reviews, title, client)
# matched_reviews, matched_reviews_length = match_reviews(human_summary, gpt_summary, client)
# hit_count = count_hits(matched_reviews)
# Calculate metrics:
# from metric import calculate_hit_rate, calculate_jaccard_index, calculate_sorensen_dice_coefficient, calculate_szymkiewicz_simpson_coefficient
# print("Hit Rate:", calculate_hit_rate(hit_count, total_human_reviews))
# print("Jaccard Index:", calculate_jaccard_index(hit_count, total_human_reviews, total_gpt_reviews))
# print("Sørensen-Dice Coefficient:", calculate_sorensen_dice_coefficient(hit_count, total_human_reviews, total_gpt_reviews))
# print("Szymkiewicz-Simpson Coefficient:", calculate_szymkiewicz_simpson_coefficient(hit_count, total_human_reviews, total_gpt_reviews))