add agreement calculation for human eval
lwaekfjlk committed Apr 1, 2024
1 parent 7bd10bd commit 733e117
Showing 1 changed file with 221 additions and 0 deletions.
221 changes: 221 additions & 0 deletions human_eval/agreement.py
@@ -0,0 +1,221 @@
#!/usr/bin/env python3

from collections import Counter
from itertools import combinations
from typing import Callable

import numpy as np
import pandas as pd


def distNumeric(l1: float, l2: float) -> float:
return float(np.abs(l1 - l2))
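
# Note: distNumeric treats answers as interval data; the agreement scores
# below are computed as 1 - distNumeric(a, b), so they fall in [0, 1] only
# when ratings span at most a unit range.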


def computePairwiseAgreement(
df: pd.DataFrame,
valCol: str,
groupCol: str = "HITId",
minN: int = 2,
distF: Callable[[float, float], float] = distNumeric,
) -> tuple[float, int, pd.Series]: # type: ignore[type-arg]
"""Computes pairwise agreement.
valCol: the column with the answers (e.g., Lickert scale values)
groupCol: the column identifying the rated item (e.g., HITId, post Id, etc)
"""
g = df.groupby(groupCol)[valCol]
ppas = {}
n = 0
for s, votes in g:
if len(votes) >= minN:
pa = np.mean([1 - distF(*v) for v in combinations(votes, r=2)])
ppas[s] = pa
n += 1
            if pd.isnull(pa):  # type: ignore
                print(f"Pairwise agreement is null for group {s}:")
                print(votes)
if len(ppas) == 0:
return np.nan, n, pd.Series(ppas)
else:
ppa = float(np.mean(list(ppas.values())))
        if pd.isnull(ppa):
            print(f"Pairwise agreement is null for column {valCol}")

return ppa, n, pd.Series(ppas)
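
# A minimal usage sketch on hypothetical data (not part of the pipeline):
# a group of identical ratings scores 1.0 and a fully split group scores 0.0.
def _demo_pairwise_agreement() -> None:
    demo = pd.DataFrame(
        {"HITId": ["h1", "h1", "h2", "h2"], "rating": [1.0, 1.0, 0.0, 1.0]}
    )
    ppa, n, per_group = computePairwiseAgreement(demo, "rating")
    # h1 agrees perfectly (1.0), h2 disagrees fully (0.0) -> ppa == 0.5, n == 2.
    assert ppa == 0.5 and n == 2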


def computeRandomAgreement(
df: pd.DataFrame,
valCol: str,
distF: Callable[[float, float], float] = distNumeric,
) -> float:
    """Computes expected (chance) agreement over the pooled answer distribution."""
    distrib = Counter(df[valCol])
    agree = 0.0
    tot = 0.0
    for p1 in distrib:
        for p2 in distrib:
            sim = 1 - distF(p1, p2)
            if p1 == p2:
                # Sampling without replacement: a rating cannot pair with itself.
                n_pairs = distrib[p1] * (distrib[p1] - 1)
            else:
                n_pairs = distrib[p1] * distrib[p2]
            agree += n_pairs * sim
            tot += n_pairs
    return agree / tot
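
# Sanity check on hypothetical data: for pooled ratings [0, 0, 1, 1],
# same-value pairs agree fully and cross-value pairs not at all, so chance
# agreement is (2*1 + 2*1) / (2*1 + 2*2 + 2*2 + 2*1) = 4 / 12 = 1/3.
def _demo_random_agreement() -> None:
    demo = pd.DataFrame({"rating": [0.0, 0.0, 1.0, 1.0]})
    assert abs(computeRandomAgreement(demo, "rating") - 1 / 3) < 1e-9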


def create_fleiss_table(
df: pd.DataFrame, col: str, groupCol: str
) -> pd.DataFrame:
    # Count ratings per category within each group (items x categories).
    fleiss_df = df.groupby([groupCol, col])[col].count().unstack(fill_value=0)
    fleiss_table = fleiss_df.to_numpy()
    return pd.DataFrame(fleiss_table, columns=range(fleiss_table.shape[1]))
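
# Shape note: the returned frame is items x categories with one count per
# cell; items rated by fewer than n_rater raters are filtered out later, in
# fleiss_kappa.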


def fleiss_kappa(
df: pd.DataFrame, n_rater: int, method: str = "fleiss"
) -> float:
df = df.copy()
n_categories = df.shape[1]

    table = df.to_numpy()
    # Drop items with fewer than n_rater ratings.
    sum_rater = table.sum(axis=1)
    table = table[sum_rater >= n_rater]
    n_sub = table.shape[0]
    # Observed agreement: mean proportion of agreeing rater pairs per item.
    p_mean = ((table**2).sum(axis=1) - n_rater).sum() / (
        n_rater * (n_rater - 1) * n_sub
    )
    if method == "fleiss":
        # Chance agreement from the marginal category proportions.
        p_mean_exp = ((table.sum(axis=0) ** 2).sum()) / (
            n_sub**2 * n_rater**2
        )
    elif method.startswith("rand") or method.startswith("unif"):
        # Randolph's free-marginal baseline: uniform over categories.
        p_mean_exp = 1 / n_categories
    else:
        raise ValueError(f"Unknown method: {method}")
if p_mean == 1 and p_mean_exp == 1:
kappa = 1
else:
kappa = (p_mean - p_mean_exp) / (1 - p_mean_exp)
return float(kappa)
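
# The statistic above is the standard kappa = (p_mean - p_mean_exp) / (1 - p_mean_exp):
# p_mean is the mean per-item proportion of agreeing rater pairs, and
# p_mean_exp is chance agreement, taken from the squared marginal category
# proportions ("fleiss") or the uniform 1 / n_categories (Randolph's
# free-marginal variant).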


def computeFleissKappa(
df: pd.DataFrame,
col: str,
groupCol: str,
n_rater: int,
method: str = "randolf",
) -> float:
    df = df[[groupCol, col]].copy()
    # Build the items x categories count table.
    fleiss_table = create_fleiss_table(df, col, groupCol)

    # Compute the kappa statistic on the count table.
    score = fleiss_kappa(fleiss_table, n_rater, method=method)
    return score
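
# Usage sketch on hypothetical data (two raters per item): one item with full
# agreement and one with full disagreement gives p_mean = 0.5, which equals
# the uniform baseline for two categories, so kappa = 0.
def _demo_fleiss_kappa() -> None:
    demo = pd.DataFrame({"id": ["a", "a", "b", "b"], "rating": [1, 1, 0, 1]})
    kappa = computeFleissKappa(demo, "rating", "id", n_rater=2, method="randolph")
    assert kappa == 0.0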


def computeAlpha(
df: pd.DataFrame,
valCol: str,
groupCol: str = "HITId",
minN: int = 2,
distF: Callable[[float, float], float] = distNumeric,
) -> dict[str, float | int]:
"""Computes Krippendorf's Alpha"""
d = df[~df[valCol].isnull()]
ppa, n, groups = computePairwiseAgreement(
d, valCol, groupCol=groupCol, minN=minN, distF=distF
)

d2 = d[d[groupCol].isin(groups.index)]

# Only computing random agreement on HITs that
# we computed pairwise agreement for.
if len(groups):
rnd = computeRandomAgreement(d2, valCol, distF=distF)

        # Skew: how concentrated the answers are; Krippendorff's alpha
        # behaves poorly under heavily skewed distributions.
if d2[valCol].dtype == float or d2[valCol].dtype == int:
skew = d2[valCol].mean()
else:
if isinstance(d2[valCol].iloc[0], list) or isinstance(
d2[valCol].iloc[0], set
):
skew = 0
else:
skew = d2[valCol].describe()["freq"] / len(d2)
else:
rnd = np.nan
skew = 0
if rnd == 1:
alpha = np.nan
else:
alpha = 1 - ((1 - ppa) / (1 - rnd))
return dict(alpha=alpha, ppa=ppa, rnd_ppa=rnd, skew=skew, n=n)
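
# The expression above is Krippendorff's alpha = 1 - D_o / D_e with
# disagreement written through agreement probabilities: observed D_o = 1 - ppa
# and expected D_e = 1 - rnd, hence alpha = 1 - (1 - ppa) / (1 - rnd).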


if __name__ == "__main__":
input_file_name = 'human eval - mistral-instruct.csv'
    eval_dim = [
        'believability', 'relationship', 'knowledge', 'social_rules',
        'secret', 'financial_and_material_benefits', 'goal',
    ]

results = []

for dim in eval_dim:
ppas = []
alphas = []
kappas = []
        # Each dimension has two rating columns, f'{dim}_1' and f'{dim}_2'.
        for i in range(1, 3):
df = pd.read_csv(input_file_name)
col = f'{dim}_{i}'
df = df[['pk', 'prolific_id', col]]
df.rename(columns={'pk': 'id', 'prolific_id': 'raterId', col: 'rating'}, inplace=True)
            longDf = df.copy()
            # Scale ratings to [-1, 1], round to the nearest integer,
            # and replace NaN with 0.
            longDf["ratingBinary"] = (
                longDf["rating"] / longDf["rating"].abs().max()
            ).round(0)
            longDf["ratingBinary"] = longDf["ratingBinary"].fillna(0)
scores = computeAlpha(longDf, "ratingBinary", groupCol="id")
ppa = scores['ppa']
alpha = scores['alpha']
            kappa = computeFleissKappa(longDf, "ratingBinary", "id", 2, "randolph")
ppas.append(ppa)
alphas.append(alpha)
kappas.append(kappa)
mean_ppa = np.mean(ppas)
mean_alpha = np.mean(alphas)
mean_kappa = np.mean(kappas)

        # Collect results, keeping four decimal places.
        results.append({
            'Dimension': dim,
            'Pairwise Agreement': mean_ppa.round(4),
            "Krippendorff's Alpha": mean_alpha.round(4),
            "Randolph's Kappa": mean_kappa.round(4),
        })

# Convert the list of dictionaries into a DataFrame
results_df = pd.DataFrame(results)

# Specify your output file name
output_file_name = 'BC_agreement_results.csv'

# Save the DataFrame to a CSV file
results_df.to_csv(output_file_name, index=False)

