diff --git a/human_eval/agreement.py b/human_eval/agreement.py
new file mode 100644
index 00000000..2b79dcbc
--- /dev/null
+++ b/human_eval/agreement.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+
+from collections import Counter
+from itertools import combinations
+from typing import Callable
+
+import numpy as np
+import pandas as pd
+
+
+def distNumeric(l1: float, l2: float) -> float:
+    return float(np.abs(l1 - l2))
+
+
+def computePairwiseAgreement(
+    df: pd.DataFrame,
+    valCol: str,
+    groupCol: str = "HITId",
+    minN: int = 2,
+    distF: Callable[[float, float], float] = distNumeric,
+) -> tuple[float, int, pd.Series]:  # type: ignore[type-arg]
+    """Computes pairwise agreement.
+    valCol: the column with the answers (e.g., Lickert scale values)
+    groupCol: the column identifying the rated item (e.g., HITId, post Id, etc)
+    """
+    g = df.groupby(groupCol)[valCol]
+    ppas = {}
+    n = 0
+    for s, votes in g:
+        if len(votes) >= minN:
+            pa = np.mean([1 - distF(*v) for v in combinations(votes, r=2)])
+            ppas[s] = pa
+            n += 1
+            if pd.isnull(pa):  # type: ignore
+                print("Pairwise agreement is null for group: ")
+                print(g)
+                # embed()
+        # else: print(len(votes))
+    if len(ppas) == 0:
+        return np.nan, n, pd.Series(ppas)
+    else:
+        ppa = float(np.mean(list(ppas.values())))
+        if pd.isnull(ppa):
+            print(f"Pairwise agreement probs for column {valCol}")
+            # embed()
+
+    return ppa, n, pd.Series(ppas)
+
+
+def computeRandomAgreement(
+    df: pd.DataFrame,
+    valCol: str,
+    distF: Callable[[float, float], float] = distNumeric,
+) -> float:
+    distrib = Counter(df[valCol])
+    agree = 0.0
+    tot = 0.0
+    i = 0
+    for p1 in distrib:
+        for p2 in distrib:
+            a1 = p1
+            a2 = p2
+            num, denom = 1 - distF(a1, a2), 1
+            if p1 == p2:
+                agree += distrib[p1] * (distrib[p2] - 1) * num / denom
+                tot += distrib[p1] * (distrib[p2] - 1)
+            else:
+                agree += distrib[p1] * (distrib[p2]) * num / denom
+                tot += distrib[p1] * distrib[p2]
+            i += 1
+    return agree / tot
+
+
+def create_fleiss_table(
+    df: pd.DataFrame, col: str, groupCol: str
+) -> pd.DataFrame:
+    # Group the data by the group column and count ratings per category
+    fleiss_df = df.groupby([groupCol, col])[col].count().unstack(fill_value=0)
+    # Convert to a numpy array and add a row representing total ratings per group
+    fleiss_table = fleiss_df.to_numpy()
+    return pd.DataFrame(fleiss_table, columns=range(fleiss_table.shape[1]))
+
+
+def fleiss_kappa(
+    df: pd.DataFrame, n_rater: int, method: str = "fleiss"
+) -> float:
+    df = df.copy()
+    n_categories = df.shape[1]
+
+    table = df.to_numpy()
+    # Calculate observed agreement
+    sum_rater = table.sum(axis=1)
+    # filter out rows with not enough ratings
+    table = table[sum_rater >= n_rater]
+    n_sub = table.shape[0]
+    p_mean = ((table**2).sum(axis=1) - n_rater).sum() / (
+        n_rater * (n_rater - 1) * n_sub
+    )
+    if method == "fleiss":
+        p_mean_exp = ((table.sum(axis=0) ** 2).sum()) / (
+            n_sub**2 * n_rater**2
+        )
+    elif method.startswith("rand") or method.startswith("unif"):
+        p_mean_exp = 1 / n_categories
+    if p_mean == 1 and p_mean_exp == 1:
+        kappa = 1
+    else:
+        kappa = (p_mean - p_mean_exp) / (1 - p_mean_exp)
+    return float(kappa)
+
+
+def computeFleissKappa(
+    df: pd.DataFrame,
+    col: str,
+    groupCol: str,
+    n_rater: int,
+    method: str = "randolf",
+) -> float:
+    df = df.copy()
+    df = df[[groupCol, col]]
+    # Calculate the sum of squared ratings per category
+    #print(df)
+    fleiss_table = create_fleiss_table(df, col, groupCol)
+
+    # Calculate Fleiss' Kappa using the modified function
+    score = fleiss_kappa(fleiss_table, n_rater, method=method)
+    return score
+
+
+def computeAlpha(
+    df: pd.DataFrame,
+    valCol: str,
+    groupCol: str = "HITId",
+    minN: int = 2,
+    distF: Callable[[float, float], float] = distNumeric,
+) -> dict[str, float | int]:
+    """Computes Krippendorf's Alpha"""
+    d = df[~df[valCol].isnull()]
+    ppa, n, groups = computePairwiseAgreement(
+        d, valCol, groupCol=groupCol, minN=minN, distF=distF
+    )
+
+    d2 = d[d[groupCol].isin(groups.index)]
+
+    # Only computing random agreement on HITs that
+    # we computed pairwise agreement for.
+    if len(groups):
+        rnd = computeRandomAgreement(d2, valCol, distF=distF)
+
+        # Skew: computes how skewed the answers are; Krippendorf's Alpha
+        # behaves terribly under skewed distributions.
+        if d2[valCol].dtype == float or d2[valCol].dtype == int:
+            skew = d2[valCol].mean()
+        else:
+            if isinstance(d2[valCol].iloc[0], list) or isinstance(
+                d2[valCol].iloc[0], set
+            ):
+                skew = 0
+            else:
+                skew = d2[valCol].describe()["freq"] / len(d2)
+    else:
+        rnd = np.nan
+        skew = 0
+    if rnd == 1:
+        alpha = np.nan
+    else:
+        alpha = 1 - ((1 - ppa) / (1 - rnd))
+    return dict(alpha=alpha, ppa=ppa, rnd_ppa=rnd, skew=skew, n=n)
+
+
+if __name__ == "__main__":
+    input_file_name = 'human eval - mistral-instruct.csv'
+    eval_dim = ['believability', 'relationship', 'knowledge', 'social_rules', 'secret', 'financial_and_material_benefits', 'goal']
+    
+    results = []
+
+    for dim in eval_dim:
+        ppas = []
+        alphas = []
+        kappas = []
+        for i in range(1, 3):
+            df = pd.read_csv(input_file_name)
+            col = f'{dim}_{i}'
+            df = df[['pk', 'prolific_id', col]]
+            df.rename(columns={'pk': 'id', 'prolific_id': 'raterId', col: 'rating'}, inplace=True)
+            longDf = df.copy()
+            # if NaN replace with 0
+            longDf["ratingBinary"] = (longDf["rating"] / longDf["rating"].abs().max()).round(
+                0
+            )
+            longDf["ratingBinary"] = longDf["ratingBinary"].fillna(0)
+            scores = computeAlpha(longDf, "ratingBinary", groupCol="id")
+            ppa = scores['ppa']
+            alpha = scores['alpha']
+            kappa = computeFleissKappa(longDf, "ratingBinary", "id", 2, "randolf")
+            ppas.append(ppa)
+            alphas.append(alpha)
+            kappas.append(kappa)
+        mean_ppa = np.mean(ppas)
+        mean_alpha = np.mean(alphas)
+        mean_kappa = np.mean(kappas)
+
+        # Collecting results and keep four decimal places
+        results.append({
+            'Dimension': dim,
+            'Pairwise Agreement': mean_ppa.round(4),
+            'Krippendorf\'s Alpha': mean_alpha.round(4),
+            'Randolf\'s Kappa': mean_kappa.round(4)
+        })
+
+    # Convert the list of dictionaries into a DataFrame
+    results_df = pd.DataFrame(results)
+
+    import pdb; pdb.set_trace()
+    # Specify your output file name
+    output_file_name = 'BC_agreement_results.csv'
+
+    # Save the DataFrame to a CSV file
+    results_df.to_csv(output_file_name, index=False)
+
+