Emnlp #7

Open — wants to merge 93 commits into base: main

Commits (93) — changes shown from 7 commits
a2ca02a
Add ATP-specific generate DVs workflow
vinhowe May 12, 2023
095912c
Apparently we could generate DVs automatically
vinhowe May 12, 2023
298d028
Add survey cost estimation code
vinhowe May 16, 2023
b6a536e
Add code to automatically generate ATP configs
vinhowe May 16, 2023
a65478c
Lowercase variables because it works for neurips
vinhowe May 16, 2023
39026e1
Add 'culling sampled below n' code (note!)
vinhowe May 16, 2023
f937f59
Merge branch 'main' into neurips
vinhowe May 19, 2023
80110d9
Add output for debugging prompt construction
vinhowe May 19, 2023
08e9bb5
Revert "Lowercase variables because it works for neurips"
vinhowe May 19, 2023
7e7adef
Undo ValidOption lowercasing
vinhowe May 19, 2023
7435097
Revert "Add 'culling sampled below n' code (note!)"
vinhowe May 20, 2023
e0df35c
Remove n_culle_sampled_below everywhere
vinhowe May 20, 2023
97e84c8
Unsort imports in survey.py
vinhowe May 20, 2023
b013cb9
Ignore .DS_Store
vinhowe May 20, 2023
298cf23
Merge branch 'main' into emnlp
vinhowe May 20, 2023
a990667
s/config_filename/variables_filename/g
vinhowe May 20, 2023
5fc9c00
Filter out out-of-schema variable values
vinhowe May 20, 2023
bd7a057
Working async openai sampler impl
vinhowe May 20, 2023
490e07c
Reformat
vinhowe May 20, 2023
81c22f7
Remove extra comment
vinhowe May 20, 2023
3bd43df
Update example_configure_survey.py
vinhowe May 20, 2023
7b68fa6
Remove unused argparse import
vinhowe May 20, 2023
fea3a1f
Reformat
vinhowe May 20, 2023
b4a8fe9
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
3728470
Remove unused prompt printing
vinhowe May 20, 2023
a218599
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
2877aab
Remove comments for copilot
vinhowe May 20, 2023
4e8155d
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 20, 2023
10b50a5
replace responses.csv -> data.csv
vinhowe May 20, 2023
a41a1bd
Update folder structure.
alexgshaw May 22, 2023
27615c8
Add style check GitHub workflow
vinhowe May 22, 2023
056761b
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
8035a00
Bug fix + add slot for response object.
alexgshaw May 22, 2023
98b4acf
Fix index typing error.
alexgshaw May 22, 2023
687dcd0
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
bd381ba
Merge branch 'emnlp' into async-openai-sampling
vinhowe May 22, 2023
a7febdf
Update async openai sampler to return response
vinhowe May 22, 2023
82c356c
Add ordinal property to variables.json
vinhowe May 22, 2023
08f02c5
Handle AutoModel async
vinhowe May 22, 2023
05ddeef
Merge pull request #8 from BYU-PCCL/async-openai-sampling
vinhowe May 22, 2023
2ed52ae
Merge branch 'emnlp' into add-ordinal-to-variables
vinhowe May 22, 2023
2c31a63
Merge pull request #11 from BYU-PCCL/add-ordinal-to-variables
alexgshaw May 22, 2023
64afba1
Fix survey sampling.
alexgshaw May 22, 2023
23bb834
Merge branch 'emnlp' of https://github.com/BYU-PCCL/lm-survey into emnlp
alexgshaw May 22, 2023
2b82b88
Update atp configuration
vinhowe May 22, 2023
f126bc5
Add ordinal functionality to Question class.
alexgshaw May 22, 2023
639d687
Merge branch 'emnlp' of https://github.com/BYU-PCCL/lm-survey into emnlp
alexgshaw May 22, 2023
3740cfc
Rework folder structure.
alexgshaw May 22, 2023
d7198fb
Temp. revert "Add style check GitHub workflow"
vinhowe May 23, 2023
b0b14e5
bug fix.
alexgshaw May 23, 2023
aee5f3c
Minor bug fix on question.
alexgshaw May 23, 2023
f260e71
rename schema to variable
alexgshaw May 23, 2023
83bdc97
gitignore update
alexgshaw May 23, 2023
ea66dd5
Added crude representativeness scoring
chrisrytting May 24, 2023
8008a9f
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 24, 2023
ac835b2
Update estimate survey to experiment config file
vinhowe May 24, 2023
d47640a
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 24, 2023
e3c4c8e
breadth experiment
chrisrytting May 24, 2023
f9347ac
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 24, 2023
67bbf30
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 24, 2023
5940ff0
Add create atp experiment script
vinhowe May 24, 2023
39c2fe4
Check in variables
vinhowe May 24, 2023
95da00f
Clean up create atp experiment imports
vinhowe May 24, 2023
562fb9d
Do formatting for create atp experiment
vinhowe May 24, 2023
28e3ac9
Update check survey prompts for experiments
vinhowe May 24, 2023
4e79da7
Update estimate survey to experiment config file
vinhowe May 24, 2023
cde6e24
Fix ATP configuration script
vinhowe May 24, 2023
9076e10
Use ordinals to find invalid options
vinhowe May 24, 2023
9eba1c9
Fix
vinhowe May 24, 2023
f0cf51d
Fix rate limit error import
vinhowe May 25, 2023
f510c3e
Bump up rate limit to what OpenAI says we have
vinhowe May 25, 2023
9aaead3
Push updates variables
vinhowe May 26, 2023
ac8e3fb
Add force flag.
alexgshaw May 26, 2023
6e9d77f
Sampler fix.
alexgshaw May 26, 2023
0a24acd
Added logging, added functionality to fill in missing response_objects
chrisrytting May 26, 2023
ad4525d
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 26, 2023
b058a5f
Bug fix.
alexgshaw May 26, 2023
34115df
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting May 27, 2023
5b3c884
added some helpers
chrisrytting May 29, 2023
07fdbbe
Added tests for infilling
chrisrytting May 29, 2023
21b8e75
Added infilling ability, some more logging, and some extra DVS functi…
chrisrytting May 29, 2023
446672c
Removed data push mistake
chrisrytting May 29, 2023
835da77
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 29, 2023
1aff991
Remove unused import in async sampler
vinhowe May 29, 2023
8f7e6ad
Add updated estimate_survey.py
vinhowe May 30, 2023
849e91e
Minor fixes.
alexgshaw May 31, 2023
91b8041
Get rid of horrible rate limit print
vinhowe May 31, 2023
e1b1c52
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe May 31, 2023
f65c797
Made the rep calculation neater
chrisrytting Jun 2, 2023
c96a889
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting Jun 2, 2023
0909387
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
vinhowe Jun 2, 2023
df555c6
Added weighting and an ability to extract D_H from opinionqa dataset
chrisrytting Jun 3, 2023
a0ba33e
Merge branch 'emnlp' of github.com:BYU-PCCL/lm-survey into emnlp
chrisrytting Jun 3, 2023
31 changes: 31 additions & 0 deletions configure_atp.py
@@ -0,0 +1,31 @@
import argparse
import json
import os
from pathlib import Path

from lm_survey.survey.survey import Survey

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "wave", type=Path, nargs="+", help="Path(s) to wave of ATP to configure"
    )
    parser.add_argument("--base-config", type=Path, help="Path to optional base config")
    args = parser.parse_args()

    for wave in args.wave:
        config_path = wave / "config.json"

        survey = Survey(name="ATP_W92", data_filename=wave / "responses.csv")

        survey.generate_atp_config(config_path)

        # This is a simple way to put some extra stuff in the config
        if args.base_config:
            with args.base_config.open("r") as f:
                base_config = json.load(f)
            with config_path.open("r") as f:
                config = json.load(f)
            config.extend(base_config)
            with config_path.open("w") as f:
                json.dump(config, f, indent=2)
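
A hypothetical invocation of this script, for context (the wave directory and base-config paths are made up; the flags are the ones defined above):

python configure_atp.py data/ATP/W92 --base-config data/ATP/base-config.json

Each wave directory is expected to contain a responses.csv, and the generated configuration is written to config.json inside that same directory.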
185 changes: 185 additions & 0 deletions estimate_survey.py
@@ -0,0 +1,185 @@
import argparse
import json
import os
import typing

import numpy as np
import pandas as pd
from tqdm import tqdm

from lm_survey.samplers import AutoSampler, BaseSampler
from lm_survey.survey import Survey


def estimate_survey_costs(
    sampler: BaseSampler,
    survey_name: str,
    *,
    n_samples_per_dependent_variable: typing.Optional[int] = None,
    n_top_mutual_info_dvs: typing.Optional[int] = None,
    n_cull_sampled_below: typing.Optional[int] = None,
):
    # TODO(vinhowe): fix this
    survey_directory = survey_name

    with open(
        os.path.join(survey_directory, "independent-variables.json"), "r"
    ) as file:
        independent_variable_names = json.load(file)

    with open(os.path.join(survey_directory, "dependent-variables.json"), "r") as file:
        dependent_variable_names = json.load(file)

    data_filename = os.path.join(survey_directory, "responses.csv")
    config_filename = os.path.join(survey_directory, "config.json")

    survey = Survey(
        name=survey_name,
        data_filename=data_filename,
        config_filename=config_filename,
        independent_variable_names=independent_variable_names,
        dependent_variable_names=dependent_variable_names,
    )

    if n_top_mutual_info_dvs is not None:
        cached_mutual_info_stats_filename = os.path.join(
            survey_directory, "cached_mutual_info_stats.csv"
        )
        if os.path.exists(cached_mutual_info_stats_filename):
            mutual_info_stats = pd.read_csv(
                cached_mutual_info_stats_filename, index_col=0
            )
        else:
            mutual_info_stats = survey.mutual_info_stats()
            mutual_info_stats.to_csv(cached_mutual_info_stats_filename)
        # already sorted; get the first n_top_mutual_info_dvs from the index
        dependent_variable_names = mutual_info_stats.index[:n_top_mutual_info_dvs]
        # replace survey with a new one with only the top n_top_mutual_info_dvs
        survey = Survey(
            name=survey_name,
            data_filename=data_filename,
            config_filename=config_filename,
            independent_variable_names=independent_variable_names,
            dependent_variable_names=dependent_variable_names,
        )

    dependent_variable_samples = list(
        survey.iterate(
            n_samples_per_dependent_variable=n_samples_per_dependent_variable,
            n_cull_sampled_below=n_cull_sampled_below,
        )
    )

    # print random sample of prompts
    # print(
    #     "\n===\n===\n===\n".join(
    #         np.random.choice(
    #             [
    #                 dependent_variable_sample.prompt
    #                 for dependent_variable_sample in dependent_variable_samples
    #             ],
    #             10,
    #         )
    #     )
    # )

    prompt_count = len(dependent_variable_samples)

    if hasattr(sampler, "batch_estimate_prompt_cost"):
        completion_costs = sampler.batch_estimate_prompt_cost(
            [
                dependent_variable_sample.prompt
                for dependent_variable_sample in dependent_variable_samples
            ]
        )
    else:
        completion_costs = []
        for dependent_variable_sample in tqdm(dependent_variable_samples):
            completion_cost = sampler.estimate_prompt_cost(
                dependent_variable_sample.prompt
            )
            completion_costs.append(completion_cost)

    total_completion_cost = np.sum(completion_costs)

    return {
        "prompt_count": prompt_count,
        "cost": total_completion_cost,
    }


def main(
    model_name: str,
    survey_names: typing.List[str],
    n_samples_per_dependent_variable: typing.Optional[int] = None,
    n_top_mutual_info_dvs: typing.Optional[int] = None,
    n_cull_sampled_below: typing.Optional[int] = None,
) -> None:
    sampler = AutoSampler(model_name=model_name)

    survey_costs = {}
    for survey_name in tqdm(survey_names):
        estimate = estimate_survey_costs(
            sampler=sampler,
            survey_name=survey_name,
            n_samples_per_dependent_variable=n_samples_per_dependent_variable,
            n_top_mutual_info_dvs=n_top_mutual_info_dvs,
            n_cull_sampled_below=n_cull_sampled_below,
        )
        survey_costs[survey_name] = estimate

    total_cost = sum([estimate["cost"] for estimate in survey_costs.values()])

    total_prompt_count = sum(
        [estimate["prompt_count"] for estimate in survey_costs.values()]
    )

    if len(survey_names) > 1:
        print(f"Cost per survey:")
        for survey_name, survey_cost in survey_costs.items():
            print(
                f"{survey_name}: ${(survey_cost['cost'] / 100):.2f} ({survey_cost['prompt_count']}"
                " prompts)"
            )

    print(f"Total cost: ${(total_cost / 100):.2f} ({total_prompt_count} prompts)")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-m",
        "--model_name",
        type=str,
        required=True,
    )
    parser.add_argument(
        "-n",
        "--n_samples_per_dependent_variable",
        type=int,
    )
    parser.add_argument(
        "--n_cull_sampled_below",
        type=int,
    )
    parser.add_argument(
        "--n_top_mutual_info_dvs",
        type=int,
    )
    # Positional argument for survey dir(s)
    parser.add_argument(
        "survey_name",
        nargs="+",
        type=str,
    )

    args = parser.parse_args()

    main(
        model_name=args.model_name,
        survey_names=args.survey_name,
        n_samples_per_dependent_variable=args.n_samples_per_dependent_variable,
        n_top_mutual_info_dvs=args.n_top_mutual_info_dvs,
        n_cull_sampled_below=args.n_cull_sampled_below,
    )
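
A sketch of how this script might be invoked (the survey directory name is hypothetical; the model name is one of the engines priced in the OpenAI sampler below):

python estimate_survey.py -m text-davinci-003 -n 100 --n_top_mutual_info_dvs 10 data/ATP/W92

Each positional survey_name is treated as a directory containing independent-variables.json, dependent-variables.json, responses.csv, and config.json; costs are computed in cents and converted to dollars in the printout.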
6 changes: 6 additions & 0 deletions lm_survey/samplers/auto_sampler.py
@@ -21,6 +21,12 @@ def send_prompt(self, prompt, n_probs):
    def sample_several(self, prompt, temperature=0, n_tokens=10):
        return self.sampler.sample_several(prompt, temperature, n_tokens)

    def estimate_prompt_cost(self, prompt: str) -> float:
        return self.sampler.estimate_prompt_cost(prompt)

    def __getattr__(self, attr):
        return getattr(self.sampler, attr)


if __name__ == "__main__":
    sampler = AutoSampler("gpt3-ada")
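
The new __getattr__ is what lets the hasattr(sampler, "batch_estimate_prompt_cost") check in estimate_survey.py reach through AutoSampler to the wrapped sampler, since lookup of an attribute missing on the wrapper is delegated. A minimal sketch of the pattern, with hypothetical names rather than the project's classes:

class DelegatingWrapper:
    def __init__(self, inner):
        self.inner = inner

    def __getattr__(self, attr):
        # Called only when normal attribute lookup fails; forwards to the
        # wrapped object, raising AttributeError if it is missing there too.
        return getattr(self.inner, attr)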
11 changes: 11 additions & 0 deletions lm_survey/samplers/base_sampler.py
@@ -42,3 +42,14 @@ def get_best_next_token(self, prompt: str, **kwargs) -> str:
        """
        logprobs = self.send_prompt(prompt=prompt, n_probs=1, **kwargs)
        return list(logprobs.keys())[0]

    @abstractmethod
    def estimate_prompt_cost(self, prompt: str, **kwargs) -> float:
        """
        Estimates the cost of sending the given prompt to an LM.
        Arguments:
            prompt (str): a prompt to be sent to the LM
        Return:
            float: the estimated cost of sending the prompt, in USD cents
        """
        pass
3 changes: 3 additions & 0 deletions lm_survey/samplers/hf_sampler.py
@@ -104,6 +104,9 @@ def sample_several(self, prompt, temperature=0, n_tokens=10):
        )
        return preds[0][len(prompt) + 1 :]

    def estimate_prompt_cost(self, _prompt: str, **_kwargs) -> float:
        raise NotImplementedError


if __name__ == "__main__":
    sampler = HfSampler(model_name="/mnt/pccfs2/backed_up/models/llama/hf/llama-7b-hf")
43 changes: 43 additions & 0 deletions lm_survey/samplers/openai_sampler.py
@@ -1,7 +1,25 @@
import typing

import tiktoken
import torch

from lm_survey.samplers.base_sampler import BaseSampler
import openai

OPENAI_TOKEN_COSTS = {
    # cents per 1000 tokens
    "text-davinci-003": 2,
    "text-davinci-002": 2,
    "text-davinci-001": 2,
    "text-curie-001": 0.2,
    "text-babbage-001": 0.05,
    "text-ada-001": 0.04,
    "davinci": 2,
    "curie": 0.2,
    "babbage": 0.05,
    "ada": 0.04,
}


class OpenAiSampler(BaseSampler):
    def __init__(self, *args, **kwargs):
@@ -18,6 +36,8 @@ def __init__(self, *args, **kwargs):
        if openai.api_key is None:
            raise ValueError("OpenAI API key must be set")

        self.tokenizer = None

    def rank_completions(self, prompt, completions):
        # 100 is the maximum number of log probs we can get.
        top_log_probs = self.send_prompt(prompt, n_probs=100)
@@ -64,6 +84,29 @@ def sample_several(self, prompt, temperature=0, n_tokens=10):
        )
        return response["choices"][0]["text"]  # type: ignore

    def _setup_tokenizer(self):
        if not self.tokenizer:
            self.tokenizer = tiktoken.encoding_for_model(self.engine)

    def estimate_prompt_cost(self, prompt: str):
        self._setup_tokenizer()
        # +1 for single token completion
        token_count = len(self.tokenizer.encode(prompt)) + 1
        return OPENAI_TOKEN_COSTS[self.engine] * token_count / 1000

    def batch_estimate_prompt_cost(
        self, prompts: typing.List[str]
    ) -> typing.List[float]:
        self._setup_tokenizer()
        # +1 for single token completion
        token_counts = [
            len(encoded) + 1 for encoded in self.tokenizer.encode_batch(prompts)
        ]
        return [
            OPENAI_TOKEN_COSTS[self.engine] * (token_count / 1000)
            for token_count in token_counts
        ]


if __name__ == "__main__":
    sampler = OpenAiSampler("gpt3-ada")
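
As a rough check on the arithmetic in estimate_prompt_cost, a standalone sketch (the prompt text is made up; pricing follows the OPENAI_TOKEN_COSTS table above, in cents per 1,000 tokens):

import tiktoken

prompt = "Question: Do you favor or oppose this policy?\nAnswer:"  # hypothetical prompt
encoding = tiktoken.encoding_for_model("text-davinci-003")
token_count = len(encoding.encode(prompt)) + 1  # +1 for the single-token completion
cost_cents = 2 * token_count / 1000  # text-davinci-003 is priced at 2 cents per 1,000 tokens
print(f"{cost_cents:.4f} cents")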
8 changes: 4 additions & 4 deletions lm_survey/survey/question.py
@@ -14,8 +14,8 @@ def __init__(
        text: typing.Optional[str] = None,
        natural_language: typing.Optional[str] = None,
    ) -> None:
-       self.raw = raw
-       self.text = text
+       self.raw = raw.lower()
+       self.text = text.lower()
        self.natural_language = natural_language

    def to_dict(self) -> typing.Dict[str, typing.Optional[str]]:
@@ -44,10 +44,10 @@ def __init__(
    ) -> None:
        self.key = key
        self.text = text
-       self.invalid_options = set(invalid_options)
+       self.invalid_options = set(map(str.lower, (invalid_options)))

        self.valid_options = {
-           option["raw"]: ValidOption(**option) for option in valid_options
+           option["raw"].lower(): ValidOption(**option) for option in valid_options
        }

        self.valid_options_index_map = {