Emnlp #7

Open · wants to merge 93 commits into base: main

Changes from 3 commits · 93 commits
a2ca02a
Add ATP-specific generate DVs workflow
vinhowe May 12, 2023
095912c
Apparently we could generate DVs automatically
vinhowe May 12, 2023
298d028
Add survey cost estimation code
vinhowe May 16, 2023
b6a536e
Add code to automatically generate ATP configs
vinhowe May 16, 2023
a65478c
Lowercase variables because it works for neurips
vinhowe May 16, 2023
39026e1
Add 'culling sampled below n' code (note!)
vinhowe May 16, 2023
f937f59
Merge branch 'main' into neurips
vinhowe May 19, 2023
80110d9
Add output for debugging prompt construction
vinhowe May 19, 2023
08e9bb5
Revert "Lowercase variables because it works for neurips"
vinhowe May 19, 2023
7e7adef
Undo ValidOption lowercasing
vinhowe May 19, 2023
7435097
Revert "Add 'culling sampled below n' code (note!)"
vinhowe May 20, 2023
e0df35c
Remove n_culle_sampled_below everywhere
vinhowe May 20, 2023
97e84c8
Unsort imports in survey.py
vinhowe May 20, 2023
b013cb9
Ignore .DS_Store
vinhowe May 20, 2023
298cf23
Merge branch 'main' into emnlp
vinhowe May 20, 2023
a990667
s/config_filename/variables_filename/g
vinhowe May 20, 2023
5fc9c00
Filter out out-of-schema variable values
vinhowe May 20, 2023
bd7a057
Working async openai sampler impl
vinhowe May 20, 2023
490e07c
Reformat
vinhowe May 20, 2023
81c22f7
Remove extra comment
vinhowe May 20, 2023
3bd43df
Update example_configure_survey.py
vinhowe May 20, 2023
7b68fa6
Remove unused argparse import
vinhowe May 20, 2023
fea3a1f
Reformat
vinhowe May 20, 2023
b4a8fe9
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
3728470
Remove unused prompt printing
vinhowe May 20, 2023
a218599
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
2877aab
Remove comments for copilot
vinhowe May 20, 2023
4e8155d
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
10b50a5
replace responses.csv -> data.csv
vinhowe May 20, 2023
a41a1bd
Update folder structure.
alexgshaw May 22, 2023
27615c8
Add style check GitHub workflow
vinhowe May 22, 2023
056761b
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
8035a00
Bug fix + add slot for response object.
alexgshaw May 22, 2023
98b4acf
Fix index typing error.
alexgshaw May 22, 2023
687dcd0
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
bd381ba
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
a7febdf
Update async openai sampler to return response
vinhowe May 22, 2023
82c356c
Add ordinal property to variables.json
vinhowe May 22, 2023
08f02c5
Handle AutoModel async
vinhowe May 22, 2023
05ddeef
Merge pull request #8 from BYU-PCCL/async-openai-sampling
vinhowe May 22, 2023
2ed52ae
Merge branch 'emnlp' into add-ordinal-to-variables
vinhowe May 22, 2023
2c31a63
Merge pull request #11 from BYU-PCCL/add-ordinal-to-variables
alexgshaw May 22, 2023
64afba1
Fix survey sampling.
alexgshaw May 22, 2023
23bb834
Merge branch 'emnlp' of https://github.com/BYU-PCCL/lm-survey into emnlp
alexgshaw May 22, 2023
2b82b88
Update atp configuration
vinhowe May 22, 2023
f126bc5
Add ordinal functionality to Question class.
alexgshaw May 22, 2023
639d687
Merge branch 'emnlp' of https://github.com/BYU-PCCL/lm-survey into emnlp
alexgshaw May 22, 2023
3740cfc
Rework folder structure.
alexgshaw May 22, 2023
d7198fb
Temp. revert "Add style check GitHub workflow"
vinhowe May 23, 2023
b0b14e5
bug fix.
alexgshaw May 23, 2023
aee5f3c
Minor bug fix on question.
alexgshaw May 23, 2023
f260e71
rename schema to variable
alexgshaw May 23, 2023
83bdc97
gitignore update
alexgshaw May 23, 2023
ea66dd5
Added crude representativeness scoring
chrisrytting May 24, 2023
8008a9f
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 24, 2023
ac835b2
Update estimate survey to experiment config file
vinhowe May 24, 2023
d47640a
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 24, 2023
e3c4c8e
breadth experiment
chrisrytting May 24, 2023
f9347ac
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 24, 2023
67bbf30
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 24, 2023
5940ff0
Add create atp experiment script
vinhowe May 24, 2023
39c2fe4
Check in variables
vinhowe May 24, 2023
95da00f
Clean up create atp experiment imports
vinhowe May 24, 2023
562fb9d
Do formatting for create atp experiment
vinhowe May 24, 2023
28e3ac9
Update check survey prompts for experiments
vinhowe May 24, 2023
4e79da7
Update estimate survey to experiment config file
vinhowe May 24, 2023
cde6e24
Fix ATP configuration script
vinhowe May 24, 2023
9076e10
Use ordinals to find invalid options
vinhowe May 24, 2023
9eba1c9
Fix
vinhowe May 24, 2023
f0cf51d
Fix rate limit error import
vinhowe May 25, 2023
f510c3e
Bump up rate limit to what OpenAI says we have
vinhowe May 25, 2023
9aaead3
Push updates variables
vinhowe May 26, 2023
ac8e3fb
Add force flag.
alexgshaw May 26, 2023
6e9d77f
Sampler fix.
alexgshaw May 26, 2023
0a24acd
Added logging, added functionality to fill in missing response_objects
chrisrytting May 26, 2023
ad4525d
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 26, 2023
b058a5f
Bug fix.
alexgshaw May 26, 2023
34115df
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 27, 2023
5b3c884
added some helpers
chrisrytting May 29, 2023
07fdbbe
Added tests for infilling
chrisrytting May 29, 2023
21b8e75
Added infilling ability, some more logging, and some extra DVS functi…
chrisrytting May 29, 2023
446672c
Removed data push mistake
chrisrytting May 29, 2023
835da77
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 29, 2023
1aff991
Remove unused import in async sampler
vinhowe May 29, 2023
8f7e6ad
Add updated estimate_survey.py
vinhowe May 30, 2023
849e91e
Minor fixes.
alexgshaw May 31, 2023
91b8041
Get rid of horrible rate limit print
vinhowe May 31, 2023
e1b1c52
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 31, 2023
f65c797
Made the rep calculation neater
chrisrytting Jun 2, 2023
c96a889
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting Jun 2, 2023
0909387
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe Jun 2, 2023
df555c6
Added weighting and an ability to extract D_H from opinionqa dataset
chrisrytting Jun 3, 2023
a0ba33e
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting Jun 3, 2023
23 changes: 9 additions & 14 deletions calculate_representativeness.py
@@ -1,5 +1,6 @@
 from lm_survey.survey.results import SurveyResults
 import numpy as np
+from lm_survey.helpers import *
 import os
 import json
 from lm_survey.survey.dependent_variable_sample import DependentVariableSample
@@ -10,28 +11,22 @@
 
 # Grab all the files called "results.json" in the "experiments" directory
 input_filepaths = glob.glob(
-    os.path.join("experiments/breadth", "**", "results.json"), recursive=True
+    os.path.join("experiments/breadth", "**", "*3.5*/results.json"),
+    recursive=True,
 )
 
 print(input_filepaths)
 
 # read input filepaths into pandas dfs
-dfs = []
 mean_reps = {}
 
+question_samples = []
 for input_filepath in input_filepaths:
-    with open(input_filepath, "r") as file:
-        results = json.load(file)
-    question_samples = [
-        DependentVariableSample(
-            **sample_dict,
-        )
-        for sample_dict in results
-    ]
-
-    survey_results = SurveyResults(question_samples=question_samples)
+    question_samples += filepath_to_DVS_list(input_filepath)
     wave = input_filepath.split("/")[3][-3:]
     # mean_reps[wave] = survey_results.get_representativeness()
-    print(f"{wave}: {survey_results.calculate_avg_samples()}")
+
+survey_results = SurveyResults(question_samples=question_samples)
+
 
 # print("Average representativeness: ", np.mean(list(mean_reps.values())))
 # print(
# print(
9 changes: 7 additions & 2 deletions lm_survey/survey/dependent_variable_sample.py
@@ -96,14 +96,19 @@ def __init__(
         independent_variables: typing.Dict[str, str],
         variable_name: str,
         question: typing.Union[Question, typing.Dict[str, typing.Any]],
-        prompt: str,
         completion: typing.Union[Completion, typing.Dict[str, typing.Any]],
+        prompt: typing.Optional[str] = None,
+        chat_prompt: typing.Optional[str] = None,
         **kwargs,
     ) -> None:
         self.index = index
         self.independent_variables = independent_variables
         self.variable_name = variable_name
-        self.prompt = prompt
+        if chat_prompt:
+            self.prompt = chat_prompt
+        elif prompt:
+            self.prompt = prompt
+        # self.prompt = prompt
 
         if isinstance(completion, Completion):
             self.completion = completion
144 changes: 101 additions & 43 deletions lm_survey/survey/results.py
@@ -12,10 +12,15 @@
 class SurveyResults:
     def __init__(self, question_samples: typing.List[DependentVariableSample]):
         df = pd.DataFrame(
-            data=[self._flatten_layer(sample.to_dict()) for sample in question_samples]
+            data=[
+                self._flatten_layer(sample.to_dict())
+                for sample in question_samples
+            ]
         )
         # Make the index the index column and sort by it
         df.set_index("index", inplace=True)
+        df["wave"] = df["variable_name"].str[-3:]
+
 
         self.df = df.sort_index()
         self._engineer_columns()
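The `df["wave"]` column added here derives the ATP wave from the last three characters of each variable name. A minimal pandas sketch of that derivation, using hypothetical variable names in the same `*_W##` style:

```python
import pandas as pd

# Hypothetical ATP-style variable names (assumed format: NAME_W<wave>)
df = pd.DataFrame({"variable_name": ["WORRY_W26", "INDUSTRY_W27"]})

# Same derivation as the diff: the wave is the last three characters
df["wave"] = df["variable_name"].str[-3:]

print(df["wave"].tolist())  # → ['W26', 'W27']
```

Note this slicing assumes every wave suffix is exactly three characters; a four-character wave like `W100` would be truncated.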
@@ -79,7 +84,9 @@ def _estimate_standard_error(
         groups: pandas.core.groupby.generic.DataFrameGroupBy,
         n_bootstraps: int = 1000,
     ) -> pd.Series:
-        bootstraps = pd.concat([self._bootstrap(groups) for _ in range(n_bootstraps)])
+        bootstraps = pd.concat(
+            [self._bootstrap(groups) for _ in range(n_bootstraps)]
+        )
 
         return bootstraps.groupby(level=0).std()
 
@@ -106,52 +113,64 @@ def get_mean_score(self, slice_by: typing.List[str] = []) -> pd.DataFrame:
 
         return scores_df
 
+    def get_accuracy(self):
+        """Calculate the accuracy"""
+
     def get_representativeness(
         self,
+        groups: pandas.core.groupby.generic.DataFrameGroupBy,
         # slice_by: typing.List[str] = []
     ):
         """
         Calculate the representativeness for each dv and print it
         """
-        df = self.df.copy()
-        reps = []
-        for dv in sorted(df.variable_name.unique()):
-            if dv == "INDUSTRY_W27":
-                print("here")
-            tdf = df[df.variable_name == dv]
-            ordinal = [k["ordinal"] for k in tdf.iloc[0]["valid_options"]]
-            # Make a dictionary where the keys are the possible_completions and the values are the number of times they appear
-            D = {k.strip(): 0.0 for k in tdf.iloc[0]["possible_completions"]}
-            D_H = D.copy()
-            D_M = D.copy()
-            for k, v in (
-                tdf.correct_completion.str.strip().value_counts(normalize=True).items()
-            ):
-                D_H[k] = v
-            for k, v in (
-                tdf.top_completion.str.strip().value_counts(normalize=True).items()
-            ):
-                D_M[k] = v
-
-            # Calculate the WD between the columns D_H and D_M
-            wd = wasserstein_distance(
-                ordinal, ordinal, list(D_H.values()), list(D_M.values())
-            ) / self._get_max_wd(ordinal)
-            rep = 1 - wd
-            # For a dv, print the keys of D_H and D_M and then their respective values
-            tab = 2
-            print("For dv:", dv)
-            print(f"{tab * ' '}Rep:", rep)
-            # print(f"{tab * ' '}WD:", wd)
-            print(f"{tab * ' '}{''.ljust(tab)}D_M D_H")
-            for k in D_H.keys():
-                print(f"{tab * ' '}{k.ljust(tab)}{D_M[k]} {D_H[k]}")
-            print("\n")
-            reps.append(rep)
-        mean_score = np.mean(reps)
-        if np.isnan(mean_score):
-            raise ValueError("Mean score is NaN")
-        return mean_score
+        rep = lambda df: 1 - self._get_wd(df)
+        return groups.apply(rep)
+
+        # How we were doing before
+        #
+        # df = self.df.copy()
+        # reps = []
+        # for dv in sorted(df.variable_name.unique()):
+        #     tdf = df[df.variable_name == dv]
+        #     ordinal = [k["ordinal"] for k in tdf.iloc[0]["valid_options"]]
+        #     # If the variable is categorical, skip it
+        #     if len(set(ordinal)) == 1:
+        #         continue
+        #     # Make a dictionary where the keys are the possible_completions and the values are the number of times they appear
+        #     D = {k.strip(): 0.0 for k in tdf.iloc[0]["possible_completions"]}
+        #     D_H = D.copy()
+        #     D_M = D.copy()
+        #     for k, v in (
+        #         tdf.correct_completion.str.strip()
+        #         .value_counts(normalize=True)
+        #         .items()
+        #     ):
+        #         D_H[k] = v
+        #     for k, v in (
+        #         tdf.top_completion.str.strip()
+        #         .value_counts(normalize=True)
+        #         .items()
+        #     ):
+        #         D_M[k] = v
+
+        #     # Calculate the WD between the columns D_H and D_M
+        #     wd = wasserstein_distance(
+        #         ordinal, ordinal, list(D_H.values()), list(D_M.values())
+        #     ) / self._get_max_wd(ordinal)
+        #     rep = 1 - wd
+        #     # For a dv, print the keys of D_H and D_M and then their respective values
+        #     tab = 2
+        #     print("For dv:", dv)
+        #     print(f"{tab * ' '}Rep:", rep)
+        #     # print(f"{tab * ' '}WD:", wd)
+        #     print(f"{tab * ' '}{''.ljust(tab)}D_M D_H")
+        #     for k in D_H.keys():
+        #         print(f"{tab * ' '}{k.ljust(tab)}{D_M[k]} {D_H[k]}")
+        #     print("\n")
+        #     reps.append(rep)
+        # mean_score = np.mean(reps)
+        # return mean_score
 
     def calculate_avg_samples(self):
         """
@@ -166,11 +185,50 @@ def calculate_avg_samples(self):
         )
         return (df.shape[0] / len(df.variable_name.unique()),)
 
+    def _calculate_D_M(self, df, by_top_completion=True, by_distribution=False):
+        """Calculate the D_M metric for a dataframe, either by looking at the sampled completion or by considering the average distribution generated by a language model."""
+        possible_completions = df.possible_completions.iloc[0]
+        prop_dict = {k.strip(): 0.0 for k in possible_completions}
+        if by_top_completion:
+            df["top_completion"] = df["top_completion"].str.strip()
+            proportions = df.groupby("top_completion").size() / df.shape[0]
+            for prop in proportions.index:
+                prop_dict[prop] = proportions[prop]
+        return prop_dict
+
+    def _calculate_D_H(self, df, use_opinion_qa=False):
+        """Calculate the D_H metric for a dataframe."""
+        if use_opinion_qa:
+            pass  # TODO: extract D_H from the OpinionQA dataset
+        else:
+            possible_completions = df.possible_completions.iloc[0]
+            df["correct_completion"] = df["correct_completion"].str.strip()
+            proportions = df.groupby("correct_completion").size() / df.shape[0]
+            prop_dict = {k.strip(): 0.0 for k in possible_completions}
+            for prop in proportions.index:
+                prop_dict[prop] = proportions[prop]
+
+        return prop_dict
+
+    def _get_wd(self, df):
+        D_M = self._calculate_D_M(df)
+        D_H = self._calculate_D_H(df)
+        ordinal = [k["ordinal"] for k in df.iloc[0]["valid_options"]]
+
+        wd = wasserstein_distance(
+            ordinal, ordinal, list(D_H.values()), list(D_M.values())
+        ) / self._get_max_wd(ordinal)
+        return wd
+
     def _get_max_wd(self, ordered_ref_weights):
-        d0, d1 = np.zeros(len(ordered_ref_weights)), np.zeros(len(ordered_ref_weights))
+        d0, d1 = np.zeros(len(ordered_ref_weights)), np.zeros(
+            len(ordered_ref_weights)
+        )
         d0[np.argmax(ordered_ref_weights)] = 1
         d1[np.argmin(ordered_ref_weights)] = 1
-        max_wd = wasserstein_distance(ordered_ref_weights, ordered_ref_weights, d0, d1)
+        max_wd = wasserstein_distance(
+            ordered_ref_weights, ordered_ref_weights, d0, d1
+        )
        return max_wd
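The helpers above define representativeness as 1 minus a Wasserstein distance between the human answer distribution (D_H) and the model answer distribution (D_M), computed over the options' ordinal positions and normalized by the worst case from `_get_max_wd` (all mass on the highest ordinal versus all mass on the lowest). A standalone sketch of that calculation with made-up distributions (the real code builds D_H and D_M from survey responses and model completions):

```python
import numpy as np
from scipy.stats import wasserstein_distance

# Hypothetical 4-point ordinal scale and made-up answer distributions:
# d_h = human proportions per option, d_m = model proportions per option
ordinal = [1, 2, 3, 4]
d_h = [0.1, 0.2, 0.3, 0.4]
d_m = [0.2, 0.2, 0.3, 0.3]

def max_wd(ordered_ref_weights):
    # Worst case: point mass on the highest ordinal vs. point mass on the lowest
    d0 = np.zeros(len(ordered_ref_weights))
    d1 = np.zeros(len(ordered_ref_weights))
    d0[np.argmax(ordered_ref_weights)] = 1
    d1[np.argmin(ordered_ref_weights)] = 1
    return wasserstein_distance(ordered_ref_weights, ordered_ref_weights, d0, d1)

# Normalized distance in [0, 1]; representativeness = 1 - distance
wd = wasserstein_distance(ordinal, ordinal, d_h, d_m) / max_wd(ordinal)
representativeness = 1 - wd
print(representativeness)
```

Here the raw distance is 0.3 (the area between the two CDFs) and the maximum is 3 (the span of the scale), so representativeness comes out to 0.9. The normalization makes scores comparable across questions with different numbers of options.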