-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetric.py
83 lines (65 loc) · 3.3 KB
/
metric.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
This module provides functions to calculate similarity metrics between sets of reviews.
The metrics are the hit rate, the Jaccard index, the Sørensen-Dice coefficient, and the Szymkiewicz-Simpson coefficient.
"""
def calculate_hit_rate(hit_count: int, total_human_reviews: int) -> float:
    """
    Calculate the hit rate, the ratio of hits to the total number of human reviews.

    Hit rate = hit_count / total_human_reviews.

    Args:
        hit_count (int): The number of hits.
        total_human_reviews (int): The total number of human reviews considered.

    Returns:
        float: The hit rate. When ``total_human_reviews`` is 0 the ratio is
        undefined and 0.0 is returned instead of raising ZeroDivisionError.
    """
    # No human reviews means the ratio is undefined; report 0.0 rather than crash.
    if total_human_reviews == 0:
        return 0.0
    return hit_count / total_human_reviews
def calculate_jaccard_index(
    hit_count: int, total_human_reviews: int, total_gpt_reviews: int
) -> float:
    """
    Calculate the Jaccard index, a statistic for gauging the similarity of two sample sets.

    Jaccard index = (Intersection of human and GPT reviews) / (Union of human and GPT reviews),
    where the union size is computed as
    total_human_reviews + total_gpt_reviews - hit_count.

    Args:
        hit_count (int): The number of intersecting reviews.
        total_human_reviews (int): The total number of human reviews.
        total_gpt_reviews (int): The total number of GPT-generated reviews.

    Returns:
        float: The Jaccard index. When the union size is 0 (e.g. both sets
        are empty) the ratio is undefined and 0.0 is returned instead of
        raising ZeroDivisionError.
    """
    # Inclusion-exclusion: the hits are counted in both totals, so subtract once.
    union_size = total_human_reviews + total_gpt_reviews - hit_count
    # An empty union makes the index undefined; report 0.0 rather than crash.
    if union_size == 0:
        return 0.0
    return hit_count / union_size
def calculate_sorensen_dice_coefficient(
    hit_count: int, total_human_reviews: int, total_gpt_reviews: int
) -> float:
    """
    Calculate the Sørensen-Dice coefficient, a measure of the similarity between two samples.

    Sørensen-Dice coefficient =
        (2 * Intersection of human and GPT reviews) / (Total human reviews + Total GPT reviews).

    Args:
        hit_count (int): The number of intersecting reviews.
        total_human_reviews (int): The total number of human reviews.
        total_gpt_reviews (int): The total number of GPT-generated reviews.

    Returns:
        float: The Sørensen-Dice coefficient. When the combined total is 0
        (both sets are empty) the ratio is undefined and 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    combined_total = total_human_reviews + total_gpt_reviews
    # Two empty sets make the coefficient undefined; report 0.0 rather than crash.
    if combined_total == 0:
        return 0.0
    return 2 * hit_count / combined_total
def calculate_szymkiewicz_simpson_coefficient(
    hit_count: int, total_human_reviews: int, total_gpt_reviews: int
) -> float:
    """
    Calculate the Szymkiewicz-Simpson coefficient (also known as the overlap
    coefficient), which measures the degree of overlap between two sets.

    Szymkiewicz-Simpson coefficient =
        Intersection of human and GPT reviews / Minimum of (Total human reviews, Total GPT reviews).

    Args:
        hit_count (int): The number of intersecting reviews.
        total_human_reviews (int): The total number of human reviews.
        total_gpt_reviews (int): The total number of GPT-generated reviews.

    Returns:
        float: The Szymkiewicz-Simpson coefficient. When the smaller of the
        two totals is 0 (either set is empty) the ratio is undefined and 0.0
        is returned instead of raising ZeroDivisionError.
    """
    smaller_total = min(total_human_reviews, total_gpt_reviews)
    # If either set is empty the coefficient is undefined; report 0.0 rather than crash.
    if smaller_total == 0:
        return 0.0
    return hit_count / smaller_total
# Example usage:
# hit_count = 10
# total_human_reviews = 50
# total_gpt_reviews = 30
# print("Hit Rate:", calculate_hit_rate(hit_count, total_human_reviews))
# print("Jaccard Index:", calculate_jaccard_index(hit_count, total_human_reviews, total_gpt_reviews))
# print("Sørensen-Dice Coefficient:", calculate_sorensen_dice_coefficient(hit_count, total_human_reviews, total_gpt_reviews))
# print("Szymkiewicz-Simpson Coefficient:", calculate_szymkiewicz_simpson_coefficient(hit_count, total_human_reviews, total_gpt_reviews))