From 82f0a4309d2fe3f8445fcf48dca21eadc226e826 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Fri, 7 Apr 2023 20:04:01 -0700 Subject: [PATCH] autogen subpackage (#968) * math utils in autogen * cleanup * code utils * remove check function from code response * comment out test * GPT-4 * increase request timeout * name * logging and error handling * better doc * doc * codegen optimized * GPT series * text * no demo example * math * import openai * import openai * azure model name * azure model name * openai version * generate assertion if necessary * condition to generate assertions * init region key * rename * comments about budget * prompt --------- Co-authored-by: Susan Xueqing Liu --- README.md | 20 +- flaml/__init__.py | 2 +- flaml/{integrations => autogen}/__init__.py | 0 flaml/autogen/code_utils.py | 181 ++ flaml/autogen/math_utils.py | 312 +++ flaml/autogen/oai/__init__.py | 3 + .../oai/completion.py | 232 ++- flaml/integrations/oai/__init__.py | 3 - notebook/autogen_chatgpt.ipynb | 1525 ++++++++++++++ notebook/autogen_openai.ipynb | 1161 +++++++++++ notebook/integrate_chatgpt.ipynb | 1795 ----------------- notebook/integrate_openai.ipynb | 1232 ----------- notebook/research/autogen_code.ipynb | 787 ++++++++ notebook/research/math_level5counting.ipynb | 784 +++++++ setup.py | 2 +- test/openai/test_completion.py | 587 ++---- test/openai/test_notebook.py | 12 +- ...ntegrate - OpenAI.md => AutoGen-OpenAI.md} | 106 +- website/docs/Getting-Started.md | 24 +- website/docs/Use-Cases/Auto-Generation.md | 117 ++ 20 files changed, 5249 insertions(+), 3636 deletions(-) rename flaml/{integrations => autogen}/__init__.py (100%) create mode 100644 flaml/autogen/code_utils.py create mode 100644 flaml/autogen/math_utils.py create mode 100644 flaml/autogen/oai/__init__.py rename flaml/{integrations => autogen}/oai/completion.py (84%) delete mode 100644 flaml/integrations/oai/__init__.py create mode 100644 notebook/autogen_chatgpt.ipynb create mode 100644 notebook/autogen_openai.ipynb delete mode 100644 notebook/integrate_chatgpt.ipynb delete mode 100644 notebook/integrate_openai.ipynb create mode 100644 notebook/research/autogen_code.ipynb create mode 100644 notebook/research/math_level5counting.ipynb rename website/docs/Examples/{Integrate - OpenAI.md => AutoGen-OpenAI.md} (56%) create mode 100644 website/docs/Use-Cases/Auto-Generation.md diff --git a/README.md b/README.md index b465b2967f..7e199eb2eb 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,9 @@ ## What is FLAML FLAML is a lightweight Python library that finds accurate machine learning models automatically, efficiently and economically. It frees users from selecting -models and hyperparameters for each model. It can also be used to tune generic hyperparameters for large language models (LLM), MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on. +models and hyperparameters for each model. It can also be used to tune generic hyperparameters for foundation models, MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on. -1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large language models such as the OpenAI GPT-3 models. +1. 
For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as the GPT series. 1. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). 1. It supports fast automatic tuning, capable of handling complex constraints/guidance/early stopping. FLAML is powered by a new, [cost-effective hyperparameter optimization](https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function/#hyperparameter-optimization-algorithm) @@ -95,6 +95,22 @@ estimator = LGBMRegressor() estimator.fit(X_train, y_train) ``` +* (New) You can optimize [generations](https://microsoft.github.io/FLAML/docs/Use-Cases/Auto-Generation) by ChatGPT or GPT-4 etc. with your own tuning data, success metrics and budgets. + +```python +from flaml import oai + +config, analysis = oai.Completion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=eval_func, + inference_budget=0.05, + optimization_budget=3, + num_samples=-1, +) +``` + ## Documentation You can find a detailed documentation about FLAML [here](https://microsoft.github.io/FLAML/) where you can find the API documentation, use cases and examples. diff --git a/flaml/__init__.py b/flaml/__init__.py index a0a6138880..9fca486499 100644 --- a/flaml/__init__.py +++ b/flaml/__init__.py @@ -2,7 +2,7 @@ from flaml.automl import AutoML, logger_formatter from flaml.tune.searcher import CFO, BlendSearch, FLOW2, BlendSearchTuner, RandomSearch from flaml.onlineml.autovw import AutoVW -from flaml.integrations import oai +from flaml.autogen import oai from flaml.version import __version__ diff --git a/flaml/integrations/__init__.py b/flaml/autogen/__init__.py similarity index 100% rename from flaml/integrations/__init__.py rename to flaml/autogen/__init__.py diff --git a/flaml/autogen/code_utils.py b/flaml/autogen/code_utils.py new file mode 100644 index 0000000000..abf9a10307 --- /dev/null +++ b/flaml/autogen/code_utils.py @@ -0,0 +1,181 @@ +import signal +import subprocess +import sys +from typing import List, Dict, Tuple, Optional, Union, Callable +from flaml import oai + + +def timeout_handler(signum, frame): + raise TimeoutError("Timed out!") + + +def execute_code(code: str, max_exec_time: Optional[int] = 3): + signal.signal(signal.SIGALRM, timeout_handler) + code = code.strip() + with open("codetest.py", "w") as fout: + fout.write(code) + try: + signal.alarm(max_exec_time) + result = subprocess.run( + [sys.executable, "codetest.py"], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + signal.alarm(0) + except TimeoutError: + return 0 + return int(result.returncode == 0) + + +def generate_assertions( + definition: str, model: Optional[str] = "gpt-3.5-turbo" +) -> Tuple[str, float]: + """Generate assertions for a function. + + Args: + definition (str): The function definition, including the signature and docstr. + model (str): The model used for generation. + + Returns: + str: The generated assertions. + float: The cost of the generation. 
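+
+    Example (an illustrative sketch, not part of the API contract: `add` is a
+    hypothetical function, a configured OpenAI key is assumed, and the call
+    incurs a small generation cost):
+
+    ```python
+    definition = "def add(a, b):\n    '''Add two numbers.\n    >>> add(1, 2)\n    3\n    '''\n"
+    assertions, cost = generate_assertions(definition)
+    # assertions is now a string such as "assert add(1, 2) == 3" (model-dependent)
+    ```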
+    """
+    prompt = """Given the signature and docstring, write exactly the same number of assertion(s) as the provided example(s) in the docstring, without assertion messages.
+
+func signature:
+{definition}
+assertions:"""
+    response = oai.Completion.create(
+        {"definition": definition},
+        model=model,
+        prompt=prompt,
+        max_tokens=256,
+        stop="\n\n",
+    )
+    cost = oai.Completion.cost(model, response)
+    assertions = oai.Completion.extract_text(response)[0]
+    return assertions, cost
+
+
+def _remove_check(response):
+    """Remove the check function from the response."""
+    # find the position of the check function
+    pos = response.find("def check(")
+    if pos == -1:
+        return response
+    return response[:pos]
+
+
+def eval_function_completions(
+    responses: List[str],
+    definition: str,
+    test: Optional[str] = None,
+    entry_point: Optional[str] = None,
+    assertions: Optional[Union[str, Callable[[str], Tuple[str, float]]]] = None,
+) -> Dict:
+    """Select a response from a list of responses for the function completion task (using generated assertions), and/or evaluate if the task is successful using a gold test.
+
+    Args:
+        responses (list): The list of responses.
+        definition (str): The input definition.
+        test (Optional, str): The test code.
+        entry_point (Optional, str): The name of the function.
+        assertions (Optional, str or Callable): The assertion code which serves as a filter of the responses, or an assertion generator.
+            When provided, only the responses that pass the assertions will be considered for the actual test (if provided).
+
+    Returns:
+        dict: The success metrics.
+    """
+    n = len(responses)
+    if assertions is None:
+        # no assertion filter
+        success_list = []
+        for i in range(n):
+            response = _remove_check(responses[i])
+            code = (
+                f"{response}\n{test}\ncheck({entry_point})"
+                if response.startswith("def")
+                else f"{definition}{response}\n{test}\ncheck({entry_point})"
+            )
+            success = execute_code(code)
+            success_list.append(success)
+        return {
+            "expected_success": 1 - pow(1 - sum(success_list) / n, n),
+            "success": any(s for s in success_list),
+        }
+    if callable(assertions) and n > 1:
+        # assertion generator
+        assertions, gen_cost = assertions(definition)
+    else:
+        gen_cost = 0
+    if n > 1 or test is None:
+        for i in range(n):
+            response = responses[i] = _remove_check(responses[i])
+            code = (
+                f"{response}\n{assertions}"
+                if response.startswith("def")
+                else f"{definition}{response}\n{assertions}"
+            )
+            succeed_assertions = execute_code(code)
+            if succeed_assertions:
+                break
+    else:
+        # just test, no need to check assertions
+        succeed_assertions = False
+        i, response = 0, responses[0]
+    if test is None:
+        # no test code
+        return {
+            "index_selected": i,
+            "succeed_assertions": succeed_assertions,
+            "gen_cost": gen_cost,
+            "assertions": assertions,
+        }
+    code_test = (
+        f"{response}\n{test}\ncheck({entry_point})"
+        if response.startswith("def")
+        else f"{definition}{response}\n{test}\ncheck({entry_point})"
+    )
+    success = execute_code(code_test)
+    return {
+        "index_selected": i,
+        "succeed_assertions": succeed_assertions,
+        "success": success,
+        "gen_cost": gen_cost,
+        "assertions": assertions,
+    }
+
+
+def implement(
+    definition: str,
+    configs: List[Dict],
+    assertions: Optional[
+        Union[str, Callable[[str], Tuple[str, float]]]
+    ] = generate_assertions,
+) -> Tuple[str, float, int]:
+    """Implement a function from a definition.
+
+    Args:
+        definition (str): The function definition, including the signature and docstr.
+        configs (list): The list of configurations for completion.
+ assertions (Optional, str or Callable): The assertion code which serves as a filter of the responses, or an assertion generator. + + Returns: + str: The implementation. + float: The cost of the implementation. + int: The index of the configuration which generates the implementation. + """ + cost = 0 + if len(configs) > 1 and callable(assertions): + assertions, cost = assertions(definition) + for i, config in enumerate(configs): + response = oai.Completion.create({"definition": definition}, **config) + cost += oai.Completion.cost(config["model"], response) + responses = oai.Completion.extract_text(response) + metrics = eval_function_completions( + responses, definition, assertions=assertions + ) + assertions = metrics["assertions"] + cost += metrics["gen_cost"] + if metrics["succeed_assertions"] or i == len(configs) - 1: + return responses[metrics["index_selected"]], cost, i diff --git a/flaml/autogen/math_utils.py b/flaml/autogen/math_utils.py new file mode 100644 index 0000000000..a16b05c0c2 --- /dev/null +++ b/flaml/autogen/math_utils.py @@ -0,0 +1,312 @@ +from typing import Optional + + +def remove_boxed(string: str) -> Optional[str]: + """Source: https://github.com/hendrycks/math + Extract the text within a \\boxed{...} environment. + Example: + >>> remove_boxed(\\boxed{\\frac{2}{3}}) + \\frac{2}{3} + """ + left = "\\boxed{" + try: + assert string[: len(left)] == left + assert string[-1] == "}" + return string[len(left) : -1] + except Exception: + return None + + +def last_boxed_only_string(string: str) -> Optional[str]: + """Source: https://github.com/hendrycks/math + Extract the last \\boxed{...} or \\fbox{...} element from a string. + """ + idx = string.rfind("\\boxed") + if idx < 0: + idx = string.rfind("\\fbox") + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == "{": + num_left_braces_open += 1 + if string[i] == "}": + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx : right_brace_idx + 1] + + return retval + + +def _fix_fracs(string: str) -> str: + """Source: https://github.com/hendrycks/math + Reformat fractions. + Examples: + >>> _fix_fracs("\\frac1b") + \frac{1}{b} + >>> _fix_fracs("\\frac12") + \frac{1}{2} + >>> _fix_fracs("\\frac1{72}") + \frac{1}{72} + """ + substrs = string.split("\\frac") + new_str = substrs[0] + if len(substrs) > 1: + substrs = substrs[1:] + for substr in substrs: + new_str += "\\frac" + if substr[0] == "{": + new_str += substr + else: + try: + assert len(substr) >= 2 + except Exception: + return string + a = substr[0] + b = substr[1] + if b != "{": + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}{" + b + "}" + post_substr + else: + new_str += "{" + a + "}{" + b + "}" + else: + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}" + b + post_substr + else: + new_str += "{" + a + "}" + b + string = new_str + return string + + +def _fix_a_slash_b(string: str) -> str: + """Source: https://github.com/hendrycks/math + Reformat fractions formatted as a/b to \\frac{a}{b}. 
+ Example: + >>> _fix_a_slash_b("2/3") + \frac{2}{3} + """ + if len(string.split("/")) != 2: + return string + a_str = string.split("/")[0] + b_str = string.split("/")[1] + try: + a = int(a_str) + b = int(b_str) + assert string == "{}/{}".format(a, b) + new_string = "\\frac{" + str(a) + "}{" + str(b) + "}" + return new_string + except Exception: + return string + + +def _remove_right_units(string: str) -> str: + """Source: https://github.com/hendrycks/math + Remove units (on the right). + "\\text{ " only ever occurs (at least in the val set) when describing units. + """ + if "\\text{ " in string: + splits = string.split("\\text{ ") + assert len(splits) == 2 + return splits[0] + else: + return string + + +def _fix_sqrt(string: str) -> str: + """Source: https://github.com/hendrycks/math + Reformat square roots. + Example: + >>> _fix_sqrt("\\sqrt3") + \\sqrt{3} + """ + if "\\sqrt" not in string: + return string + splits = string.split("\\sqrt") + new_string = splits[0] + for split in splits[1:]: + if split[0] != "{": + a = split[0] + new_substr = "\\sqrt{" + a + "}" + split[1:] + else: + new_substr = "\\sqrt" + split + new_string += new_substr + return new_string + + +def _strip_string(string: str) -> str: + """Source: https://github.com/hendrycks/math + Apply the reformatting helper functions above. + """ + # linebreaks + string = string.replace("\n", "") + # print(string) + + # remove inverse spaces + string = string.replace("\\!", "") + # print(string) + + # replace \\ with \ + string = string.replace("\\\\", "\\") + # print(string) + + # replace tfrac and dfrac with frac + string = string.replace("tfrac", "frac") + string = string.replace("dfrac", "frac") + # print(string) + + # remove \left and \right + string = string.replace("\\left", "") + string = string.replace("\\right", "") + # print(string) + + # Remove circ (degrees) + string = string.replace("^{\\circ}", "") + string = string.replace("^\\circ", "") + + # remove dollar signs + string = string.replace("\\$", "") + + # remove units (on the right) + string = _remove_right_units(string) + + # remove percentage + string = string.replace("\\%", "") + string = string.replace("%", "") + + # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string + string = string.replace(" .", " 0.") + string = string.replace("{.", "{0.") + # if empty, return empty string + if len(string) == 0: + return string + if string[0] == ".": + string = "0" + string + + # to consider: get rid of e.g. "k = " or "q = " at beginning + if len(string.split("=")) == 2: + if len(string.split("=")[0]) <= 2: + string = string.split("=")[1] + + # fix sqrt3 --> sqrt{3} + string = _fix_sqrt(string) + + # remove spaces + string = string.replace(" ", "") + + # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. + # Even works with \frac1{72} (but not \frac{72}1). 
+ # Also does a/b --> \\frac{a}{b} + string = _fix_fracs(string) + + # manually change 0.5 --> \frac{1}{2} + if string == "0.5": + string = "\\frac{1}{2}" + + # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y + string = _fix_a_slash_b(string) + + return string + + +def get_answer(solution: Optional[str]) -> Optional[str]: + if solution is None: + return None + last_boxed = last_boxed_only_string(solution) + if last_boxed is None: + return None + answer = remove_boxed(last_boxed) + if answer is None: + return None + return answer + + +def is_equiv(str1: Optional[str], str2: Optional[str]) -> float: + """Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in + - units + - fractions + - square roots + - superfluous LaTeX. + Source: https://github.com/hendrycks/math + """ + if str1 is None and str2 is None: + print("WARNING: Both None") + return 1.0 + if str1 is None or str2 is None: + return 0.0 + + try: + ss1 = _strip_string(str1) + ss2 = _strip_string(str2) + return float(ss1 == ss2) + except Exception: + return float(str1 == str2) + + +def is_equiv_chain_of_thought(str1: str, str2: str) -> float: + """Strips the solution first before calling `is_equiv`.""" + ans1 = get_answer(str1) + ans2 = get_answer(str2) + + return is_equiv(ans1, ans2) + + +def voting_counts(responses): + answers = {} + for i in range(len(responses)): + equiv = i + if get_answer(responses[i]) is None: + # ignore None answers + continue + for j in answers: + if is_equiv_chain_of_thought(responses[i], responses[j]): + equiv = j + break + if equiv in answers: + answers[equiv] += 1 + else: + answers[equiv] = 1 + return answers + + +def eval_math_responses(responses, solution=None, **args): + """Select a response for a math problem using voting, and check if the response is correct if the solution is provided. + + Args: + responses (list): The list of responses. + solution (str): The canonical solution. + + Returns: + dict: The success metrics. 
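+
+    Example (an illustrative sketch; the responses and solution are made up,
+    and the trailing "\\boxed{...}" is what the answer extraction looks for):
+
+    ```python
+    responses = [
+        "... so the answer is \\boxed{2}.",
+        "... therefore the answer is \\boxed{3}.",
+        "... hence the answer is \\boxed{2}.",
+    ]
+    metrics = eval_math_responses(responses, solution="\\boxed{2}")
+    # "2" wins the vote 2 to 1, so metrics["votes"] == 2 and metrics["success_vote"] == 1.0
+    ```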
+ """ + success_list = [] + n = len(responses) + if solution is not None: + for i in range(n): + response = responses[i] + succeed = is_equiv_chain_of_thought(response, solution) + success_list.append(succeed) + # voting + answers = voting_counts(responses) + # find the answer with highest votes in answers + answer, votes = max(answers.items(), key=lambda x: x[1], default=(0, 0)) + # check if the answer is correct + success_vote = is_equiv_chain_of_thought(responses[answer], solution) + return { + "expected_success": 1 - pow(1 - sum(success_list) / n, n), + "success": any(s for s in success_list), + "success_vote": success_vote, + "voted_answer": responses[answer], + "votes": votes, + } diff --git a/flaml/autogen/oai/__init__.py b/flaml/autogen/oai/__init__.py new file mode 100644 index 0000000000..c845911b42 --- /dev/null +++ b/flaml/autogen/oai/__init__.py @@ -0,0 +1,3 @@ +from flaml.autogen.oai.completion import Completion, ChatCompletion + +__all__ = ["Completion", "ChatCompletion"] diff --git a/flaml/integrations/oai/completion.py b/flaml/autogen/oai/completion.py similarity index 84% rename from flaml/integrations/oai/completion.py rename to flaml/autogen/oai/completion.py index fefcc8a376..513bcf8f21 100644 --- a/flaml/integrations/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -2,7 +2,10 @@ import logging import numpy as np import time +from typing import List +import sys from flaml import tune, BlendSearch +from flaml.automl.logger import logger_formatter try: import openai @@ -22,6 +25,11 @@ "please install flaml[openai] option to use the flaml.oai subpackage." ) logger = logging.getLogger(__name__) +if not logger.handlers: + # Add the console handler. + _ch = logging.StreamHandler(stream=sys.stdout) + _ch.setFormatter(logger_formatter) + logger.addHandler(_ch) def get_key(config): @@ -50,6 +58,7 @@ class Completion: chat_models = { "gpt-3.5-turbo", "gpt-3.5-turbo-0301", + "gpt-35-turbo", "gpt-4", "gpt-4-32k", "gpt-4-32k-0314", @@ -67,6 +76,7 @@ class Completion: "text-davinci-003": 0.02, "gpt-3.5-turbo": 0.002, "gpt-3.5-turbo-0301": 0.002, + "gpt-35-turbo": 0.002, "gpt-4": (0.03, 0.06), "gpt-4-0314": (0.03, 0.06), "gpt-4-32k": (0.06, 0.12), @@ -95,12 +105,13 @@ class Completion: } seed = 41 + cache_path = f".cache/{seed}" # retry after this many seconds retry_time = 10 # fail a request after hitting RateLimitError for this many seconds - retry_timeout = 60 + retry_timeout = 120 # time out for request to openai server - request_timeout = 30 + request_timeout = 60 openai_completion_class = not ERROR and openai.Completion _total_cost = 0 @@ -156,14 +167,18 @@ def _get_response(cls, config: dict, eval_only=False, use_cache=True): # retry after retry_time seconds if time.time() - start_time + cls.retry_time < cls.retry_timeout: logger.info(f"retrying in {cls.retry_time} seconds...", exc_info=1) - elif not eval_only: + elif eval_only: + raise + else: break sleep(cls.retry_time) except InvalidRequestError: if "azure" == openai.api_type and "model" in config: # azure api uses "engine" instead of "model" config = config.copy() - config["engine"] = config.pop("model") + config["engine"] = config.pop("model").replace( + "gpt-3.5-turbo", "gpt-35-turbo" + ) else: raise logger.warning( @@ -219,6 +234,13 @@ def _update_invalid_n(cls, prune, region_key, max_tokens, num_completions): num_completions, invalid_n.get(max_tokens, np.inf) ) + @classmethod + def _pop_subspace(cls, config): + if "subspace" in config: + config = config.copy() + config.update(config.pop("subspace")) + return 
config + @classmethod def _get_prompt_messages_from_config(cls, model, config): prompt, messages = None, None @@ -254,6 +276,7 @@ def _eval(cls, config: dict, prune=True, eval_only=False): """ cost = 0 data = cls.data + config = cls._pop_subspace(config) model = config["model"] data_length = len(data) price = cls.price1K.get(model) @@ -300,8 +323,10 @@ def _eval(cls, config: dict, prune=True, eval_only=False): start_n = max_valid_n + 1 else: start_n = config_n + region_key = None params = config.copy() - params["stop"] = stop + if "stop" in config: + params["stop"] = stop temperature_or_top_p = params.pop("temperature_or_top_p", None) if temperature_or_top_p: params.update(temperature_or_top_p) @@ -329,11 +354,7 @@ def _eval(cls, config: dict, prune=True, eval_only=False): result["cost"] = cost return result # evaluate the quality of the responses - responses = ( - [r["message"]["content"].rstrip() for r in response["choices"]] - if model in cls.chat_models - else [r["text"].rstrip() for r in response["choices"]] - ) + responses = cls.extract_text(response) usage = response["usage"] n_input_tokens = usage["prompt_tokens"] n_output_tokens = usage.get("completion_tokens", 0) @@ -491,11 +512,12 @@ def eval_func(responses, **data): ``` log_file_name (str, optional): The log file. - inference_budget (float, optional): The inference budget. - optimization_budget (float, optional): The optimization budget. + inference_budget (float, optional): The inference budget, dollar per instance. + optimization_budget (float, optional): The optimization budget, dollar in total. num_samples (int, optional): The number of samples to evaluate. -1 means no hard restriction in the number of trials and the actual number is decided by optimization_budget. Defaults to 1. + logging_level (optional): logging level. Defaults to logging.WARNING. **config (dict): The search space to update over the default search. For prompt, please provide a string/Callable or a list of strings/Callables. - If prompt is provided for chat models, it will be converted to messages under role "user". 
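+                - As an illustrative override (the values below follow the accompanying
+                  notebooks rather than defaults; `tune_data` and `eval_func` are
+                  placeholders as in the example above), one can search over several
+                  prompt templates and stop sequences:
+
+        ```python
+        config, analysis = oai.Completion.tune(
+            data=tune_data,
+            metric="success",
+            mode="max",
+            eval_func=eval_func,
+            inference_budget=0.05,
+            optimization_budget=3,
+            num_samples=-1,
+            prompt=["{definition}", "# Python 3{definition}"],
+            stop=[["\nclass", "\ndef"], None],
+        )
+        ```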
@@ -570,22 +592,38 @@ def eval_func(responses, **data): cls.data = data cls.avg_input_tokens = None - search_alg = BlendSearch( - cost_attr="cost", - cost_budget=optimization_budget, - metric=metric, - mode=mode, - space=space, - ) space_model = space["model"] if not isinstance(space_model, str) and len(space_model) > 1: + # make a hierarchical search space + subspace = {} + if "max_tokens" in space: + subspace["max_tokens"] = space.pop("max_tokens") + if "temperature_or_top_p" in space: + subspace["temperature_or_top_p"] = space.pop("temperature_or_top_p") + if "best_of" in space: + subspace["best_of"] = space.pop("best_of") + if "n" in space: + subspace["n"] = space.pop("n") + choices = [] + for model in space["model"]: + choices.append({"model": model, **subspace}) + space["subspace"] = tune.choice(choices) + space.pop("model") # start all the models with the same hp config + search_alg = BlendSearch( + cost_attr="cost", + cost_budget=optimization_budget, + metric=metric, + mode=mode, + space=space, + ) config0 = search_alg.suggest("t0") points_to_evaluate = [config0] for model in space_model: - if model != config0["model"]: + if model != config0["subspace"]["model"]: point = config0.copy() - point["model"] = model + point["subspace"] = point["subspace"].copy() + point["subspace"]["model"] = model points_to_evaluate.append(point) search_alg = BlendSearch( cost_attr="cost", @@ -595,6 +633,15 @@ def eval_func(responses, **data): space=space, points_to_evaluate=points_to_evaluate, ) + else: + search_alg = BlendSearch( + cost_attr="cost", + cost_budget=optimization_budget, + metric=metric, + mode=mode, + space=space, + ) + old_level = logger.getEffectiveLevel() logger.setLevel(logging_level) with diskcache.Cache(cls.cache_path) as cls._cache: analysis = tune.run( @@ -605,7 +652,7 @@ def eval_func(responses, **data): verbose=3, ) config = analysis.best_config - params = config.copy() + params = cls._pop_subspace(config) if cls._prompts: params["prompt"] = cls._prompts[config["prompt"]] else: @@ -615,6 +662,7 @@ def eval_func(responses, **data): temperature_or_top_p = params.pop("temperature_or_top_p", None) if temperature_or_top_p: params.update(temperature_or_top_p) + logger.setLevel(old_level) return params, analysis @classmethod @@ -636,12 +684,14 @@ def create(cls, context, use_cache=True, **config): if ERROR: raise ERROR params = cls._construct_params(context, config) - if use_cache: - with diskcache.Cache(cls.cache_path) as cls._cache: - return cls._get_response(params) - return cls.openai_completion_class.create( - request_timeout=cls.request_timeout, **params - ) + if not use_cache: + return cls._get_response(params, eval_only=True, use_cache=False) + seed = cls.seed + if "seed" in params: + cls.set_cache(params.pop("seed")) + with diskcache.Cache(cls.cache_path) as cls._cache: + cls.set_cache(seed) + return cls._get_response(params, eval_only=True) @classmethod def _construct_params(cls, data_instance, config, prompt=None, messages=None): @@ -698,8 +748,7 @@ def test( use_cache=True, agg_method="avg", return_responses_and_per_instance_result=False, - seed=41, - cache_path=".cache", + logging_level=logging.WARNING, ): """Evaluate the responses created with the config for the OpenAI API call. @@ -750,54 +799,45 @@ def eval_func(responses, **data): return_responses_and_per_instance_result (bool): Whether to also return responses and per instance results in addition to the aggregated results. - seed (int): Random seed for the evaluation. Defaults to 41. 
- cache_path (str): Path to the cache directory. Defaults to '.cache'. - If a cache directory does not exist, it will be created, otherwise use the existing one. + logging_level (optional): logging level. Defaults to logging.WARNING. + Returns: - None in case of rate limit error or when a valid eval_func is not provided in either test or tune; + None when no valid eval_func is provided in either test or tune; Otherwise, a dict of aggregated results, responses and per instance results if `return_responses_and_per_instance_result` is True; Otherwise, a dict of aggregated results (responses and per instance results are not returned). """ - model = config["model"] result_agg, responses_list, result_list = {}, [], [] metric_keys = None - cls.set_cache(seed, cache_path) - with diskcache.Cache(cls.cache_path) as cls._cache: - for i, data_i in enumerate(data): - logger.info(f"evaluating data instance {i}") - params = cls._construct_params(data_i, config) - response = cls._get_response( - params, eval_only=True, use_cache=use_cache - ) - if response == -1: # rate limit error, treat as invalid - return None - # evaluate the quality of the responses - responses = ( - [r["message"]["content"].rstrip() for r in response["choices"]] - if model in cls.chat_models - else [r["text"].rstrip() for r in response["choices"]] + cost = 0 + model = config["model"] + old_level = logger.getEffectiveLevel() + logger.setLevel(logging_level) + for i, data_i in enumerate(data): + logger.info(f"evaluating data instance {i}") + response = cls.create(data_i, use_cache, **config) + cost += cls.cost(model, response) + # evaluate the quality of the responses + responses = cls.extract_text(response) + if eval_func is not None: + metrics = eval_func(responses, **data_i) + elif hasattr(cls, "_eval_func"): + metrics = cls._eval_func(responses, **data_i) + else: + logger.warning( + "Please either provide a valid eval_func or do the test after the tune function is called." ) - - if eval_func is not None: - metrics = eval_func(responses, **data_i) - elif hasattr(cls, "_eval_func"): - metrics = cls._eval_func(responses, **data_i) - else: - logger.warning( - "Please either provide a valid eval_func or do the test after the tune function is called" - ) - return - if not metric_keys: - metric_keys = [] - for k in metrics.keys(): - try: - _ = float(metrics[k]) - metric_keys.append(k) - except ValueError: - pass - result_list.append(metrics) - if return_responses_and_per_instance_result: - responses_list.append(responses) + return + if not metric_keys: + metric_keys = [] + for k in metrics.keys(): + try: + _ = float(metrics[k]) + metric_keys.append(k) + except ValueError: + pass + result_list.append(metrics) + if return_responses_and_per_instance_result: + responses_list.append(responses) if isinstance(agg_method, str): if agg_method in ["avg", "average"]: for key in metric_keys: @@ -824,25 +864,57 @@ def eval_func(responses, **data): "agg_method needs to be a string ('avg' or 'median'),\ or a callable, or a dictionary of callable." ) + logger.setLevel(old_level) # should we also return the result_list and responses_list or not? + if "cost" not in result_agg: + result_agg["cost"] = cost + if "inference_cost" not in result_agg: + result_agg["inference_cost"] = cost / len(data) if return_responses_and_per_instance_result: return result_agg, result_list, responses_list else: return result_agg + @classmethod + def cost(cls, model: str, response: dict): + """Compute the cost of a completion. + + Args: + model (str): The model name. 
+ response (dict): The response from OpenAI API. + + Returns: + The cost in USD. + """ + if model not in cls.price1K: + raise ValueError(f"Unknown model: {model}") + usage = response["usage"] + n_input_tokens = usage["prompt_tokens"] + n_output_tokens = usage.get("completion_tokens", 0) + price1K = cls.price1K[model] + if isinstance(price1K, tuple): + return (price1K[0] * n_input_tokens + price1K[1] * n_output_tokens) / 1000 + return price1K * (n_input_tokens + n_output_tokens) / 1000 + + @classmethod + def extract_text(cls, response: dict) -> List[str]: + """Extract the text from a completion response. + + Args: + response (dict): The response from OpenAI API. + + Returns: + A list of text in the responses. + """ + choices = response["choices"] + if "text" in choices[0]: + return [choice["text"] for choice in choices] + return [choice["message"]["content"] for choice in choices] + class ChatCompletion(Completion): """A class for OpenAI API ChatCompletion.""" - price1K = { - "gpt-3.5-turbo": 0.002, - "gpt-3.5-turbo-0301": 0.002, - "gpt-4": (0.03, 0.06), - "gpt-4-0314": (0.03, 0.06), - "gpt-4-32k": (0.06, 0.12), - "gpt-4-32k-0314": (0.06, 0.12), - } - default_search_space = Completion.default_search_space.copy() default_search_space["model"] = tune.choice(["gpt-3.5-turbo", "gpt-4"]) openai_completion_class = not ERROR and openai.ChatCompletion diff --git a/flaml/integrations/oai/__init__.py b/flaml/integrations/oai/__init__.py deleted file mode 100644 index 12320692d1..0000000000 --- a/flaml/integrations/oai/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from flaml.integrations.oai.completion import Completion, ChatCompletion - -__all__ = ["Completion", "ChatCompletion"] diff --git a/notebook/autogen_chatgpt.ipynb b/notebook/autogen_chatgpt.ipynb new file mode 100644 index 0000000000..6ac607077b --- /dev/null +++ b/notebook/autogen_chatgpt.ipynb @@ -0,0 +1,1525 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License.\n", + "\n", + "# Use FLAML to Tune ChatGPT\n", + "\n", + "FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of LLMs.\n", + "\n", + "In this notebook, we tune OpenAI ChatGPT (both GPT-3.5 and GPT-4) models for math problem solving. We use [the MATH benchmark](https://crfm.stanford.edu/helm/latest/?group=math_chain_of_thought) for measuring mathematical problem solving on competition math problems with chain-of-thoughts style reasoning. \n", + "\n", + "## Requirements\n", + "\n", + "FLAML requires `Python>=3.7`. 
To run this notebook example, please install flaml with the [openai] option:\n", + "```bash\n", + "pip install flaml[openai]==1.2.0\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.317406Z", + "iopub.status.busy": "2023-02-13T23:40:52.316561Z", + "iopub.status.idle": "2023-02-13T23:40:52.321193Z", + "shell.execute_reply": "2023-02-13T23:40:52.320628Z" + } + }, + "outputs": [], + "source": [ + "# %pip install flaml[openai]==1.2.0 datasets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set your OpenAI key:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.324240Z", + "iopub.status.busy": "2023-02-13T23:40:52.323783Z", + "iopub.status.idle": "2023-02-13T23:40:52.330570Z", + "shell.execute_reply": "2023-02-13T23:40:52.329750Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment the following to use Azure OpenAI:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.333547Z", + "iopub.status.busy": "2023-02-13T23:40:52.333249Z", + "iopub.status.idle": "2023-02-13T23:40:52.336508Z", + "shell.execute_reply": "2023-02-13T23:40:52.335858Z" + } + }, + "outputs": [], + "source": [ + "# import openai\n", + "# openai.api_type = \"azure\"\n", + "# openai.api_base = \"https://.openai.azure.com/\"\n", + "# openai.api_version = \"2023-03-15-preview\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset\n", + "\n", + "First, we load the competition_math dataset. The dataset contains 201 \"Level 2\" Algebra examples. We use a random sample of 20 examples for tuning the generation hyperparameters and the remaining for evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.339977Z", + "iopub.status.busy": "2023-02-13T23:40:52.339556Z", + "iopub.status.idle": "2023-02-13T23:40:54.603349Z", + "shell.execute_reply": "2023-02-13T23:40:54.602630Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using custom data configuration default\n", + "Found cached dataset competition_math (/home/vscode/.cache/huggingface/datasets/competition_math/default/1.0.0/2a2a2995c2847186883ecd64f69be7d602b8a6f6b51950624d4dc2263f93333b)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c23bfd043e284ea29f8a6b4de2974637", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00=3.7`. 
To run this notebook example, please install flaml with the [openai] option:\n",
+    "```bash\n",
+    "pip install flaml[openai]==1.2.0\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-24T23:25:36.910966Z",
+     "iopub.status.busy": "2023-02-24T23:25:36.910473Z",
+     "iopub.status.idle": "2023-02-24T23:25:36.914554Z",
+     "shell.execute_reply": "2023-02-24T23:25:36.914030Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# %pip install flaml[openai]==1.2.0 datasets"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set your OpenAI key:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-24T23:25:36.917301Z",
+     "iopub.status.busy": "2023-02-24T23:25:36.917011Z",
+     "iopub.status.idle": "2023-02-24T23:25:36.923156Z",
+     "shell.execute_reply": "2023-02-24T23:25:36.922619Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "if \"OPENAI_API_KEY\" not in os.environ:\n",
+    "    os.environ[\"OPENAI_API_KEY\"] = \"\""
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you use Azure OpenAI, uncomment the following:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-24T23:25:36.925804Z",
+     "iopub.status.busy": "2023-02-24T23:25:36.925423Z",
+     "iopub.status.idle": "2023-02-24T23:25:36.928191Z",
+     "shell.execute_reply": "2023-02-24T23:25:36.927673Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# import openai\n",
+    "# openai.api_type = \"azure\"\n",
+    "# openai.api_base = \"https://.openai.azure.com/\"\n",
+    "# openai.api_version = \"2023-03-15-preview\"  # change if necessary"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load dataset\n",
+    "\n",
+    "First, we load the humaneval dataset. The dataset contains 164 examples. We use the first 20 for tuning the generation hyperparameters and the remaining for evaluation. In each example, the \"prompt\" is the prompt string for eliciting the code generation (renamed to \"definition\"), \"test\" is the Python code for the unit test of the example, and \"entry_point\" is the function name to be tested."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.931255Z", + "iopub.status.busy": "2023-02-24T23:25:36.930838Z", + "iopub.status.idle": "2023-02-24T23:25:39.148799Z", + "shell.execute_reply": "2023-02-24T23:25:39.148113Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset openai_humaneval (/home/vscode/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0be40d7ad7f049f1946bd69b0c570f33", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00 [0,0,0,0,3,3]\n", + " compare([0,5,0,0,0,4],[4,1,1,0,0,-2]) -> [4,4,1,0,0,6]\n", + " \"\"\"\n", + "\n" + ] + } + ], + "source": [ + "print(tune_data[1][\"definition\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is one example of the unit test code for verifying the correctness of the generated code:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:39.158398Z", + "iopub.status.busy": "2023-02-24T23:25:39.157766Z", + "iopub.status.idle": "2023-02-24T23:25:39.161396Z", + "shell.execute_reply": "2023-02-24T23:25:39.160797Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def check(candidate):\n", + "\n", + " # Check some simple cases\n", + " assert candidate([1,2,3,4,5,1],[1,2,3,4,2,-2])==[0,0,0,0,3,3], \"This prints if this assert fails 1 (good for debugging!)\"\n", + " assert candidate([0,0,0,0,0,0],[0,0,0,0,0,0])==[0,0,0,0,0,0], \"This prints if this assert fails 1 (good for debugging!)\"\n", + " assert candidate([1,2,3],[-1,-2,-3])==[2,4,6], \"This prints if this assert fails 1 (good for debugging!)\"\n", + " assert candidate([1,2,3,5],[-1,2,3,4])==[2,0,0,1], \"This prints if this assert fails 1 (good for debugging!)\"\n", + "\n", + " # Check some edge cases that are easy to work out by hand.\n", + " assert True, \"This prints if this assert fails 2 (also good for debugging!)\"\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(tune_data[1][\"test\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Success Metric\n", + "\n", + "Before we start tuning, we need to define the success metric we want to optimize. For each code generation task, we can use the model to generate multiple candidates, and then select one from them. If the final selected response can pass a unit test, we consider the task as successfully solved. Then we can define the mean success rate of a collection of tasks." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-24T23:25:39.164187Z",
+     "iopub.status.busy": "2023-02-24T23:25:39.163867Z",
+     "iopub.status.idle": "2023-02-24T23:25:39.169009Z",
+     "shell.execute_reply": "2023-02-24T23:25:39.168427Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from functools import partial\n",
+    "from flaml.autogen.code_utils import eval_function_completions, generate_assertions\n",
+    "\n",
+    "eval_with_generated_assertions = partial(eval_function_completions, assertions=generate_assertions)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "This function will first generate assertion statements for each problem. Then it uses the assertions to select among the generated responses.\n",
+    "\n",
+    "## Use the tuning data to find a good configuration\n",
+    "\n",
+    "### Import the oai and tune subpackages from flaml.\n",
+    "\n",
+    "FLAML provides an API for hyperparameter optimization of OpenAI models, `oai.Completion.tune`, and an API for making a request with the tuned config, `oai.Completion.create`. First, we import oai from flaml:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-24T23:25:39.179030Z",
+     "iopub.status.busy": "2023-02-24T23:25:39.178624Z",
+     "iopub.status.idle": "2023-02-24T23:25:40.584410Z",
+     "shell.execute_reply": "2023-02-24T23:25:40.583802Z"
+    },
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from flaml import oai"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For (local) reproducibility and cost efficiency, we cache responses from OpenAI."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-24T23:25:40.587815Z",
+     "iopub.status.busy": "2023-02-24T23:25:40.587283Z",
+     "iopub.status.idle": "2023-02-24T23:25:40.590826Z",
+     "shell.execute_reply": "2023-02-24T23:25:40.590158Z"
+    },
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "oai.Completion.set_cache(seed)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. Caches for different seeds are stored separately.\n",
+    "\n",
+    "### Perform tuning\n",
+    "\n",
+    "The tuning will take a while to finish, depending on the optimization budget, and will be performed under the budgets specified below:\n",
+    "\n",
+    "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.02 means the target inference budget is 0.02 dollars, which translates to 1000 tokens (input + output combined) if the text Davinci model is used.\n",
+    "* `optimization_budget` is the total budget allowed to perform the tuning. For example, 5 means 5 dollars are allowed in total, which translates to 250K tokens for the text Davinci model.\n",
+    "* `num_samples` is the number of different hyperparameter configurations allowed to try. The tuning will stop after either num_samples trials or after optimization_budget dollars spent, whichever happens first. 
-1 means no hard restriction in the number of trials and the actual number is decided by `optimization_budget`.\n", + "\n", + "Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces etc.. The default search space is:\n", + "\n", + "```python\n", + "default_search_space = {\n", + " \"model\": tune.choice([\n", + " \"text-ada-001\",\n", + " \"text-babbage-001\",\n", + " \"text-davinci-003\",\n", + " \"gpt-3.5-turbo\",\n", + " \"gpt-4\",\n", + " ]),\n", + " \"temperature_or_top_p\": tune.choice(\n", + " [\n", + " {\"temperature\": tune.uniform(0, 1)},\n", + " {\"top_p\": tune.uniform(0, 1)},\n", + " ]\n", + " ),\n", + " \"max_tokens\": tune.lograndint(50, 1000),\n", + " \"n\": tune.randint(1, 100),\n", + " \"prompt\": \"{prompt}\",\n", + "}\n", + "```\n", + "\n", + "The default search space can be overridden by users' input.\n", + "For example, the following code specifies three choices for the prompt and two choices of stop sequences. For hyperparameters which don't appear in users' input, the default search space will be used. If you don't have access to gpt-4 or would like to modify the choice of models, you can provide a different search space for model." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:40.593603Z", + "iopub.status.busy": "2023-02-24T23:25:40.593269Z", + "iopub.status.idle": "2023-02-24T23:26:38.349191Z", + "shell.execute_reply": "2023-02-24T23:26:38.348392Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-04-07 17:47:31,801]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", + "\u001b[32m[I 2023-04-07 17:47:31,804]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[flaml.tune.tune: 04-07 17:47:31] {832} INFO - trial 1 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:47:48] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.010323600000000004, 'cost': 0.010323600000000004, 'inference_cost': 0.00022578, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 16.660529136657715}\n", + "[flaml.tune.tune: 04-07 17:47:48] {832} INFO - trial 2 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:48:05] {215} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.03038410000000001, 'cost': 0.020060500000000002, 'inference_cost': 0.001003025, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 
'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-babbage-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 16.726527452468872}\n", + "[flaml.tune.tune: 04-07 17:48:05] {832} INFO - trial 3 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:48:08] {215} INFO - result: {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 3.7132015228271484}\n", + "[flaml.tune.tune: 04-07 17:48:08] {832} INFO - trial 4 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:48:18] {215} INFO - result: {'index_selected': 13.85, 'succeed_assertions': 0.55, 'success': 0.5, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.9526220999999998, 'cost': 0.065458, 'inference_cost': 0.0033335, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-3.5-turbo', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 9.689077615737915}\n", + "[flaml.tune.tune: 04-07 17:48:18] {832} INFO - trial 5 config: {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}\n", + "[flaml.tune.tune: 04-07 17:48:18] {215} INFO - result: {'success': 0, 'total_cost': 1.0297820999999998, 'cost': 0.07715999999999999, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'gpt-4', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 0.002007722854614258}\n", + "[flaml.tune.tune: 04-07 17:48:18] {855} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" + ] + } + ], + "source": [ + "config, analysis = oai.Completion.tune(\n", + " data=tune_data, # the data for tuning\n", + " metric=\"success\", # the metric to optimize\n", + " mode=\"max\", # the optimization mode\n", + " eval_func=eval_with_generated_assertions, # the evaluation function to return the success metrics\n", + " # log_file_name=\"logs/humaneval.log\", # the log file name\n", + " 
inference_budget=0.05, # the inference budget (dollar per instance)\n", + " optimization_budget=1, # the optimization budget (dollar in total)\n", + " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", + " # -1 means decided by the optimization budget only\n", + " num_samples=-1,\n", + " prompt=[\n", + " \"{definition}\",\n", + " \"# Python 3{definition}\",\n", + " \"Complete the following Python function:{definition}\",\n", + " ], # the prompt templates to choose from\n", + " stop=[[\"\\nclass\", \"\\ndef\", \"\\nif\", \"\\nprint\"], None], # the stop sequences\n", + ")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Output tuning results\n", + "\n", + "After the tuning, we can print out the config and the result found by FLAML:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:26:38.352710Z", + "iopub.status.busy": "2023-02-24T23:26:38.352378Z", + "iopub.status.idle": "2023-02-24T23:26:38.356939Z", + "shell.execute_reply": "2023-02-24T23:26:38.356217Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "optimized config {'prompt': '# Python 3{definition}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'model': 'text-davinci-003', 'max_tokens': 148, 'n': 27, 'top_p': 0.755486898036596}\n", + "best result on tuning data {'index_selected': 3.95, 'succeed_assertions': 0.9, 'success': 0.75, 'gen_cost': 0.00046369999999999994, 'assertions': 'assert vowels_count(\"abcde\") == 2\\nassert vowels_count(\"ACEDY\") == 3', 'total_cost': 0.8871640999999999, 'cost': 0.8567799999999999, 'inference_cost': 0.042096, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}, 'experiment_tag': 'exp', 'time_total_s': 3.7132015228271484}\n" + ] + } + ], + "source": [ + "print(\"optimized config\", config)\n", + "print(\"best result on tuning data\", analysis.best_result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Make a request with the tuned config\n", + "\n", + "We can apply the tuned config on the request for an example task:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:26:38.359902Z", + "iopub.status.busy": "2023-02-24T23:26:38.359506Z", + "iopub.status.idle": "2023-02-24T23:26:39.343921Z", + "shell.execute_reply": "2023-02-24T23:26:39.343051Z" + }, + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"choices\": [\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i]-guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 1,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " 
\"finish_reason\": \"stop\",\n", + " \"index\": 2,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n diff = abs(game[i] - guess[i])\\n result.append(diff)\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 3,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 4,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n diff = abs(game[i] - guess[i])\\n result.append(diff)\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 5,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 6,\n", + " \"logprobs\": null,\n", + " \"text\": \" results = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n results.append(0)\\n else:\\n results.append(abs(game[i] - guess[i]))\\n return results\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 7,\n", + " \"logprobs\": null,\n", + " \"text\": \" res = []\\n for i in range(len(game)):\\n res.append(abs(game[i] - guess[i]))\\n return res\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 8,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 9,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i]-guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 10,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 11,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n diff = abs(game[i] - guess[i])\\n result.append(diff)\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 12,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 13,\n", + " \"logprobs\": null,\n", + " \"text\": \" #your code here\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 14,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 15,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n diff = abs(game[i] - guess[i])\\n result.append(diff)\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 16,\n", + " 
\"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i]-guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 17,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 18,\n", + " \"logprobs\": null,\n", + " \"text\": \" # Your code here\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 19,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 20,\n", + " \"logprobs\": null,\n", + " \"text\": \" #create an empty list\\n result = []\\n #iterate over the two lists and compare the values\\n for i in range(len(game)):\\n diff = abs(game[i] - guess[i])\\n result.append(diff)\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 21,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 22,\n", + " \"logprobs\": null,\n", + " \"text\": \" # initialize the result array\\n result = []\\n \\n # loop over the arrays and calculate the difference\\n for i in range(len(game)):\\n diff = abs(game[i] - guess[i])\\n result.append(diff)\\n \\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 23,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i]-guess[i]))\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 24,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n diff = abs(game[i] - guess[i])\\n result.append(diff)\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 25,\n", + " \"logprobs\": null,\n", + " \"text\": \" # Your code here\\n result = []\\n for i in range(len(game)):\\n diff = abs(game[i] - guess[i])\\n result.append(diff)\\n return result\"\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 26,\n", + " \"logprobs\": null,\n", + " \"text\": \" result = []\\n for i in range(len(game)):\\n result.append(abs(game[i]-guess[i]))\\n return result\"\n", + " }\n", + " ],\n", + " \"created\": 1680456621,\n", + " \"id\": \"cmpl-70vozowIIN2Dcy5lOGYaIiYWvFFmh\",\n", + " \"model\": \"text-davinci-003\",\n", + " \"object\": \"text_completion\",\n", + " \"usage\": {\n", + " \"completion_tokens\": 1198,\n", + " \"prompt_tokens\": 243,\n", + " \"total_tokens\": 1441\n", + " }\n", + "}\n", + "{'index_selected': 0, 'succeed_assertions': 1, 'success': 1, 'gen_cost': 0.000702, 'assertions': 'assert compare([1,2,3,4,5,1],[1,2,3,4,2,-2]) == [0,0,0,0,3,3]\\nassert compare([0,5,0,0,0,4],[4,1,1,0,0,-2]) == [4,4,1,0,0,6]'}\n" + ] + } + ], + "source": [ + "response = oai.Completion.create(context=tune_data[1], **config)\n", + "print(response)\n", + "print(eval_with_generated_assertions(oai.Completion.extract_text(response), **tune_data[1]))\n" + ] + }, + { + 
"attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate the success rate on the test data\n", + "\n", + "You can use flaml's `oai.Completion.test` to evaluate the performance of an entire dataset with the tuned config. The following code will take a while to evaluate all the 144 test data instances. The cost is about $6 if you uncomment it and run it." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:26:39.347295Z", + "iopub.status.busy": "2023-02-24T23:26:39.346994Z", + "iopub.status.idle": "2023-02-24T23:29:27.160335Z", + "shell.execute_reply": "2023-02-24T23:29:27.159519Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "performance on test data with the tuned config: {'index_selected': 5.208333333333333, 'succeed_assertions': 0.8402777777777778, 'success': 0.7777777777777778, 'gen_cost': 0.00045375000000000005, 'cost': 5.785519999999999, 'inference_cost': 0.04017722222222222}\n" + ] + } + ], + "source": [ + "# result = oai.Completion.test(test_data, config)\n", + "# print(\"performance on test data with the tuned config:\", result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result will vary with the inference budget and optimization budget.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "24dd93300e0442788ee6cc1310e5bf14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "35cd066a31b242bb87b2c106ee72e5f2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8e7ee7687a99410d88a98a74ecfcea99", + "IPY_MODEL_421e02a11a974b40b3ddb75382b3b640", + "IPY_MODEL_77db9797e78b49438d21c5c8da34b4cb" + ], + "layout": "IPY_MODEL_47d3046236a54b0e8f9ae455a82c7e0b", + "tabbable": null, + "tooltip": null + } + }, + "3d5d106a38954af2bb3bde5777702f4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "3e1ebb31412443b0bca86a301cbdac11": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "421e02a11a974b40b3ddb75382b3b640": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_e6398d4027c9459a97965b9d91ae484f", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3e1ebb31412443b0bca86a301cbdac11", + "tabbable": null, + "tooltip": null, + "value": 1 + } + }, + "47d3046236a54b0e8f9ae455a82c7e0b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "754800f7feb04acea977696e4787d1ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, 
+ "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "77db9797e78b49438d21c5c8da34b4cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_7b6c4e1c11e249409a1edcd63be450d8", + "placeholder": "​", + "style": "IPY_MODEL_3d5d106a38954af2bb3bde5777702f4e", + "tabbable": null, + "tooltip": null, + "value": " 1/1 [00:00<00:00, 44.40it/s]" + } + }, + "7b6c4e1c11e249409a1edcd63be450d8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8e7ee7687a99410d88a98a74ecfcea99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_754800f7feb04acea977696e4787d1ff", + "placeholder": "​", + "style": "IPY_MODEL_24dd93300e0442788ee6cc1310e5bf14", + "tabbable": null, + "tooltip": null, + "value": "100%" + } + }, + "e6398d4027c9459a97965b9d91ae484f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/integrate_chatgpt.ipynb b/notebook/integrate_chatgpt.ipynb deleted file mode 100644 index 57d549391b..0000000000 --- a/notebook/integrate_chatgpt.ipynb +++ /dev/null @@ -1,1795 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved. \n", - "\n", - "Licensed under the MIT License.\n", - "\n", - "# Use FLAML to Tune ChatGPT\n", - "\n", - "FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of LLMs.\n", - "\n", - "In this notebook, we tune OpenAI ChatGPT (both GPT-3.5 and GPT-4) models for math problem solving. We use [the MATH benchmark](https://crfm.stanford.edu/helm/latest/?group=math_chain_of_thought) for measuring mathematical problem solving on competition math problems with chain-of-thoughts style reasoning. \n", - "\n", - "## Requirements\n", - "\n", - "FLAML requires `Python>=3.7`. 
To run this notebook example, please install flaml with the [openai] option:\n", - "```bash\n", - "pip install flaml[openai]==1.2.0\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.317406Z", - "iopub.status.busy": "2023-02-13T23:40:52.316561Z", - "iopub.status.idle": "2023-02-13T23:40:52.321193Z", - "shell.execute_reply": "2023-02-13T23:40:52.320628Z" - } - }, - "outputs": [], - "source": [ - "# %pip install flaml[openai]==1.2.0 datasets" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set your OpenAI key:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.324240Z", - "iopub.status.busy": "2023-02-13T23:40:52.323783Z", - "iopub.status.idle": "2023-02-13T23:40:52.330570Z", - "shell.execute_reply": "2023-02-13T23:40:52.329750Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Uncomment the following to use Azure OpenAI:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.333547Z", - "iopub.status.busy": "2023-02-13T23:40:52.333249Z", - "iopub.status.idle": "2023-02-13T23:40:52.336508Z", - "shell.execute_reply": "2023-02-13T23:40:52.335858Z" - } - }, - "outputs": [], - "source": [ - "# import openai\n", - "# openai.api_type = \"azure\"\n", - "# openai.api_base = \"https://.openai.azure.com/\"\n", - "# openai.api_version = \"2022-12-01\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dataset\n", - "\n", - "First, we load the competition_math dataset. The dataset contains 201 \"Level 2\" Algebra examples. We use a random sample of 20 examples for tuning the generation hyperparameters and the remaining examples for evaluation. We use one demonstration example in the prompt."
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.339977Z", - "iopub.status.busy": "2023-02-13T23:40:52.339556Z", - "iopub.status.idle": "2023-02-13T23:40:54.603349Z", - "shell.execute_reply": "2023-02-13T23:40:54.602630Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using custom data configuration default\n", - "Found cached dataset competition_math (/home/vscode/.cache/huggingface/datasets/competition_math/default/1.0.0/2a2a2995c2847186883ecd64f69be7d602b8a6f6b51950624d4dc2263f93333b)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8358c4bf9cc44b99916c9b6cb1e3a279", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2 [00:00 Optional[str]:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Extract the text within a \\\\boxed{...} environment.\n", - " Example:\n", - " >>> remove_boxed(\\\\boxed{\\\\frac{2}{3}})\n", - " \\\\frac{2}{3}\n", - " \"\"\"\n", - " left = \"\\\\boxed{\"\n", - " try:\n", - " assert string[: len(left)] == left\n", - " assert string[-1] == \"}\"\n", - " return string[len(left) : -1]\n", - " except Exception:\n", - " return None\n", - "\n", - "\n", - "def last_boxed_only_string(string: str) -> Optional[str]:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Extract the last \\\\boxed{...} or \\\\fbox{...} element from a string.\n", - " \"\"\"\n", - " idx = string.rfind(\"\\\\boxed\")\n", - " if idx < 0:\n", - " idx = string.rfind(\"\\\\fbox\")\n", - " if idx < 0:\n", - " return None\n", - "\n", - " i = idx\n", - " right_brace_idx = None\n", - " num_left_braces_open = 0\n", - " while i < len(string):\n", - " if string[i] == \"{\":\n", - " num_left_braces_open += 1\n", - " if string[i] == \"}\":\n", - " num_left_braces_open -= 1\n", - " if num_left_braces_open == 0:\n", - " right_brace_idx = i\n", - " break\n", - " i += 1\n", - "\n", - " if right_brace_idx is None:\n", - " retval = None\n", - " else:\n", - " retval = string[idx : right_brace_idx + 1]\n", - "\n", - " return retval\n", - "\n", - "\n", - "def _fix_fracs(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat fractions.\n", - " Examples:\n", - " >>> _fix_fracs(\"\\\\frac1b\")\n", - " \\frac{1}{b}\n", - " >>> _fix_fracs(\"\\\\frac12\")\n", - " \\frac{1}{2}\n", - " >>> _fix_fracs(\"\\\\frac1{72}\")\n", - " \\frac{1}{72}\n", - " \"\"\"\n", - " substrs = string.split(\"\\\\frac\")\n", - " new_str = substrs[0]\n", - " if len(substrs) > 1:\n", - " substrs = substrs[1:]\n", - " for substr in substrs:\n", - " new_str += \"\\\\frac\"\n", - " if substr[0] == \"{\":\n", - " new_str += substr\n", - " else:\n", - " try:\n", - " assert len(substr) >= 2\n", - " except Exception:\n", - " return string\n", - " a = substr[0]\n", - " b = substr[1]\n", - " if b != \"{\":\n", - " if len(substr) > 2:\n", - " post_substr = substr[2:]\n", - " new_str += \"{\" + a + \"}{\" + b + \"}\" + post_substr\n", - " else:\n", - " new_str += \"{\" + a + \"}{\" + b + \"}\"\n", - " else:\n", - " if len(substr) > 2:\n", - " post_substr = substr[2:]\n", - " new_str += \"{\" + a + \"}\" + b + post_substr\n", - " else:\n", - " new_str += \"{\" + a + \"}\" + b\n", - " string = new_str\n", - " return string\n", - "\n", - "\n", - "def _fix_a_slash_b(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat fractions formatted as a/b to 
\\\\frac{a}{b}.\n", - " Example:\n", - " >>> _fix_a_slash_b(\"2/3\")\n", - " \\frac{2}{3}\n", - " \"\"\"\n", - " if len(string.split(\"/\")) != 2:\n", - " return string\n", - " a_str = string.split(\"/\")[0]\n", - " b_str = string.split(\"/\")[1]\n", - " try:\n", - " a = int(a_str)\n", - " b = int(b_str)\n", - " assert string == \"{}/{}\".format(a, b)\n", - " new_string = \"\\\\frac{\" + str(a) + \"}{\" + str(b) + \"}\"\n", - " return new_string\n", - " except Exception:\n", - " return string\n", - "\n", - "\n", - "def _remove_right_units(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Remove units (on the right).\n", - " \"\\\\text{ \" only ever occurs (at least in the val set) when describing units.\n", - " \"\"\"\n", - " if \"\\\\text{ \" in string:\n", - " splits = string.split(\"\\\\text{ \")\n", - " assert len(splits) == 2\n", - " return splits[0]\n", - " else:\n", - " return string\n", - "\n", - "\n", - "def _fix_sqrt(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat square roots.\n", - " Example:\n", - " >>> _fix_sqrt(\"\\\\sqrt3\")\n", - " \\sqrt{3}\n", - " \"\"\"\n", - " if \"\\\\sqrt\" not in string:\n", - " return string\n", - " splits = string.split(\"\\\\sqrt\")\n", - " new_string = splits[0]\n", - " for split in splits[1:]:\n", - " if split[0] != \"{\":\n", - " a = split[0]\n", - " new_substr = \"\\\\sqrt{\" + a + \"}\" + split[1:]\n", - " else:\n", - " new_substr = \"\\\\sqrt\" + split\n", - " new_string += new_substr\n", - " return new_string\n", - "\n", - "\n", - "def _strip_string(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Apply the reformatting helper functions above.\n", - " \"\"\"\n", - " # linebreaks\n", - " string = string.replace(\"\\n\", \"\")\n", - " # print(string)\n", - "\n", - " # remove inverse spaces\n", - " string = string.replace(\"\\\\!\", \"\")\n", - " # print(string)\n", - "\n", - " # replace \\\\ with \\\n", - " string = string.replace(\"\\\\\\\\\", \"\\\\\")\n", - " # print(string)\n", - "\n", - " # replace tfrac and dfrac with frac\n", - " string = string.replace(\"tfrac\", \"frac\")\n", - " string = string.replace(\"dfrac\", \"frac\")\n", - " # print(string)\n", - "\n", - " # remove \\left and \\right\n", - " string = string.replace(\"\\\\left\", \"\")\n", - " string = string.replace(\"\\\\right\", \"\")\n", - " # print(string)\n", - "\n", - " # Remove circ (degrees)\n", - " string = string.replace(\"^{\\\\circ}\", \"\")\n", - " string = string.replace(\"^\\\\circ\", \"\")\n", - "\n", - " # remove dollar signs\n", - " string = string.replace(\"\\\\$\", \"\")\n", - "\n", - " # remove units (on the right)\n", - " string = _remove_right_units(string)\n", - "\n", - " # remove percentage\n", - " string = string.replace(\"\\\\%\", \"\")\n", - " string = string.replace(\"\\%\", \"\")\n", - "\n", - " # \" 0.\" equivalent to \" .\" and \"{0.\" equivalent to \"{.\" Alternatively, add \"0\" if \".\" is the start of the string\n", - " string = string.replace(\" .\", \" 0.\")\n", - " string = string.replace(\"{.\", \"{0.\")\n", - " # if empty, return empty string\n", - " if len(string) == 0:\n", - " return string\n", - " if string[0] == \".\":\n", - " string = \"0\" + string\n", - "\n", - " # to consider: get rid of e.g. 
\"k = \" or \"q = \" at beginning\n", - " if len(string.split(\"=\")) == 2:\n", - " if len(string.split(\"=\")[0]) <= 2:\n", - " string = string.split(\"=\")[1]\n", - "\n", - " # fix sqrt3 --> sqrt{3}\n", - " string = _fix_sqrt(string)\n", - "\n", - " # remove spaces\n", - " string = string.replace(\" \", \"\")\n", - "\n", - " # \\frac1b or \\frac12 --> \\frac{1}{b} and \\frac{1}{2}, etc.\n", - " # Even works with \\frac1{72} (but not \\frac{72}1).\n", - " # Also does a/b --> \\\\frac{a}{b}\n", - " string = _fix_fracs(string)\n", - "\n", - " # manually change 0.5 --> \\frac{1}{2}\n", - " if string == \"0.5\":\n", - " string = \"\\\\frac{1}{2}\"\n", - "\n", - " # NOTE: X/Y changed to \\frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y\n", - " string = _fix_a_slash_b(string)\n", - "\n", - " return string\n", - "\n", - "\n", - "def get_answer(solution: Optional[str]) -> Optional[str]:\n", - " if solution is None:\n", - " return None\n", - " last_boxed = last_boxed_only_string(solution)\n", - " if last_boxed is None:\n", - " return None\n", - " answer = remove_boxed(last_boxed)\n", - " if answer is None:\n", - " return None\n", - " return answer\n", - "\n", - "\n", - "def is_equiv(str1: Optional[str], str2: Optional[str]) -> float:\n", - " \"\"\"Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in\n", - " - units\n", - " - fractions\n", - " - square roots\n", - " - superfluous LaTeX.\n", - " Source: https://github.com/hendrycks/math\n", - " \"\"\"\n", - " if str1 is None and str2 is None:\n", - " print(\"WARNING: Both None\")\n", - " return 1.0\n", - " if str1 is None or str2 is None:\n", - " return 0.0\n", - "\n", - " try:\n", - " ss1 = _strip_string(str1)\n", - " ss2 = _strip_string(str2)\n", - " return float(ss1 == ss2)\n", - " except Exception:\n", - " return float(str1 == str2)\n", - "\n", - "\n", - "def is_equiv_chain_of_thought(str1: str, str2: str) -> float:\n", - " \"\"\"Strips the solution first before calling `is_equiv`.\"\"\"\n", - " ans1 = get_answer(str1)\n", - " ans2 = get_answer(str2)\n", - "\n", - " return is_equiv(ans1, ans2)\n", - "\n", - "\n", - "def success_metrics(responses, solution, **args):\n", - " \"\"\"Check if each response is correct.\n", - " \n", - " Args:\n", - " responses (list): The list of responses.\n", - " solution (str): The canonical solution.\n", - " \n", - " Returns:\n", - " dict: The success metrics.\n", - " \"\"\"\n", - " success_list = []\n", - " n = len(responses)\n", - " for i in range(n):\n", - " response = responses[i]\n", - " succeed = is_equiv_chain_of_thought(response, solution)\n", - " success_list.append(succeed)\n", - " # voting\n", - " answers = {}\n", - " for i in range(n):\n", - " equiv = i\n", - " if get_answer(responses[i]) is None:\n", - " # ignore None answers\n", - " continue\n", - " for j in answers:\n", - " if is_equiv_chain_of_thought(responses[i], responses[j]):\n", - " equiv = j\n", - " break\n", - " if equiv in answers:\n", - " answers[equiv] += 1\n", - " else:\n", - " answers[equiv] = 1\n", - " # find the answer with highest votes in answers\n", - " answer = max(answers.items(), key=lambda x: x[1], default=(0, 0))[0]\n", - " # check if the answer is correct\n", - " success_vote = is_equiv_chain_of_thought(responses[answer], solution)\n", - " return {\n", - " \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n", - " \"success\": any(s for s in success_list),\n", - " \"success_vote\": success_vote,\n", - " \"voted_answer\": 
responses[answer],\n", - " }\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Use the tuning data to find a good configuration\n", - "\n", - "### Import the oai subpackage from flaml.\n", - "\n", - "FLAML provides an API for hyperparameter optimization of OpenAI ChatGPT models, `oai.ChatCompletion.tune`, and an API for making a request with the tuned config, `oai.ChatCompletion.create`. First, we import oai from flaml:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:54.634335Z", - "iopub.status.busy": "2023-02-13T23:40:54.633929Z", - "iopub.status.idle": "2023-02-13T23:40:56.105700Z", - "shell.execute_reply": "2023-02-13T23:40:56.105085Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "from flaml import oai" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:56.109177Z", - "iopub.status.busy": "2023-02-13T23:40:56.108624Z", - "iopub.status.idle": "2023-02-13T23:40:56.112651Z", - "shell.execute_reply": "2023-02-13T23:40:56.112076Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "oai.ChatCompletion.set_cache(seed)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. The caches for different seeds are stored separately.\n", - "\n", - "### Perform tuning\n", - "\n", - "The tuning will take a while to finish, depending on the optimization budget. It is performed under the specified budget constraints:\n", - "\n", - "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.004 means the target inference budget is 0.004 dollars, which translates to 2000 tokens (input + output combined) if the gpt-3.5-turbo model is used.\n", - "* `optimization_budget` is the total budget allowed for performing the tuning. For example, 1 means 1 dollar is allowed in total, which translates to 500K tokens for the gpt-3.5-turbo model.\n", - "* `num_samples` is the number of different hyperparameter configurations allowed to be tried. The tuning will stop after either num_samples trials have run or optimization_budget dollars have been spent, whichever happens first. -1 means no hard restriction on the number of trials; the actual number is decided by `optimization_budget`.\n", - "\n", - "Users can specify the tuning data, optimization metric, optimization mode, evaluation function, search spaces, etc.
The default search space is:\n", - "\n", - "```python\n", - "default_search_space = {\n", - " \"model\": tune.choice([\n", - " \"gpt-3.5-turbo\",\n", - " \"gpt-4\",\n", - " ]),\n", - " \"temperature_or_top_p\": tune.choice(\n", - " [\n", - " {\"temperature\": tune.uniform(0, 1)},\n", - " {\"top_p\": tune.uniform(0, 1)},\n", - " ]\n", - " ),\n", - " \"max_tokens\": tune.lograndint(50, 1000),\n", - " \"n\": tune.randint(1, 100),\n", - " \"prompt\": \"{prompt}\",\n", - "}\n", - "```\n", - "\n", - "The default search space can be overridden by users' input.\n", - "For example, the following code specifies a fixed prompt template. For hyperparameters which don't appear in users' input, the default search space will be used." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:56.115383Z", - "iopub.status.busy": "2023-02-13T23:40:56.114975Z", - "iopub.status.idle": "2023-02-13T23:41:55.045654Z", - "shell.execute_reply": "2023-02-13T23:41:55.044973Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m[I 2023-03-26 04:03:37,074]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", - "\u001b[32m[I 2023-03-26 04:03:37,077]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 1 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.9, 'success': 0.9, 'success_vote': 0.9, 'voted_answer': 'We use the distance formula to find the distance between the two points: $\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{3^2+(-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=\\\\boxed{5}$.', 'total_cost': 0.13772999999999996, 'cost': 0.13772999999999996, 'inference_cost': 0.0068864999999999985, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.004978179931640625}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 2 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.8, 'success': 0.8, 'success_vote': 0.8, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$$ Letting $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0)$, we have: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=5$$ Therefore, the distance between the points $(0,4)$ and $(3,0)$ is $\\\\boxed{5}$.', 'total_cost': 0.145722, 'cost': 0.007992, 'inference_cost': 0.00039759999999999996, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0047664642333984375}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 3 config: 
{'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.5140933870421127, 'success': 0.55, 'success_vote': 0.5, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=5.$$ Therefore, the distance between the points (0,4) and (3,0) is $\\\\boxed{5}$.', 'total_cost': 0.21644799999999997, 'cost': 0.07072600000000001, 'inference_cost': 0.0035343, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.4985070123025904}, 'config/max_tokens': 97, 'config/n': 20, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.010622501373291016}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9533933461949365}, 'max_tokens': 50, 'n': 51, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.3386014997741698, 'success': 0.4, 'success_vote': 0.35, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=\\\\boxed{5}.$$', 'total_cost': 0.3192479999999999, 'cost': 0.10279999999999999, 'inference_cost': 0.005138, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9533933461949365}, 'max_tokens': 50, 'n': 51, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.9533933461949365}, 'config/max_tokens': 50, 'config/n': 51, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.015543699264526367}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 5 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.9999998720099207, 'success': 1.0, 'success_vote': 0.95, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2},$$ where $(x_1,y_1)$ and $(x_2,y_2)$ are the given points. 
Plugging in the values $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0),$ we have: $$\\\\sqrt{(3-0)^2+(0-4)^2} = \\\\sqrt{9+16} = \\\\sqrt{25}.$$ Therefore, the distance between the two points is $\\\\boxed{5}$.', 'total_cost': 0.6322379999999999, 'cost': 0.31299, 'inference_cost': 0.015323400000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/n': 54, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.05237627029418945}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 6 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.4340139933332937}, 'max_tokens': 317, 'n': 51, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'success_vote': 0, 'total_cost': 0.7246679999999999, 'cost': 0.09243, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.4340139933332937}, 'max_tokens': 317, 'n': 51, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.4340139933332937}, 'config/max_tokens': 317, 'config/n': 51, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.001924753189086914}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 7 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.7572581384563789, 'success': 0.8, 'success_vote': 0.8, 'voted_answer': 'We use the distance formula: \\\\begin{align*}\\n\\\\text{distance}&=\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}\\\\\\\\\\n&=\\\\sqrt{(3-0)^2+(0-4)^2}\\\\\\\\\\n&=\\\\sqrt{9+16}\\\\\\\\\\n&=\\\\sqrt{25}\\\\\\\\\\n&=\\\\boxed{5}.\\n\\\\end{align*}', 'total_cost': 0.7647499999999999, 'cost': 0.04008199999999999, 'inference_cost': 0.0020021, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9086488808086682}, 'config/max_tokens': 129, 'config/n': 9, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.007839441299438477}\n", - "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 8 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6262871483113925}, 'max_tokens': 257, 'n': 82, 'prompt': 0}\n", - "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'success_vote': 0, 'total_cost': 1.0214359999999998, 'cost': 0.25668599999999997, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6262871483113925}, 'max_tokens': 257, 'n': 82, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6262871483113925}, 'config/max_tokens': 257, 'config/n': 82, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.009511232376098633}\n", - "[flaml.tune.tune: 03-26 04:03:37] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" - ] - } - ], - "source": [ - "import logging\n", - "\n", - "prompts = [\"{problem} Solve the problem carefully. Simplify your answer as much as possible. 
Put the final answer in \\\\boxed{{}}.\"]\n", - "config, analysis = oai.ChatCompletion.tune(\n", - " data=tune_data, # the data for tuning\n", - " metric=\"success_vote\", # the metric to optimize\n", - " mode=\"max\", # the optimization mode\n", - " eval_func=success_metrics, # the evaluation function to return the success metrics\n", - " # log_file_name=\"logs/math.log\", # the log file name\n", - " inference_budget=0.03, # the inference budget (dollar)\n", - " optimization_budget=1, # the optimization budget (dollar)\n", - " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", - " # -1 means decided by the optimization budget only\n", - " num_samples=-1,\n", - " # model=\"chatgpt-35-turbo-0301\", # uncomment if using Azure OpenAI\n", - " # model=\"gpt-3.5-turbo\", # uncomment if you don't have access to gpt-4\n", - " prompt=prompts, # the prompt templates to choose from\n", - " # stop=\"###\", # the stop sequence\n", - " logging_level=logging.INFO, # the logging level\n", - ")\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output tuning results\n", - "\n", - "After the tuning, we can print out the config and the result found by FLAML:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:55.049204Z", - "iopub.status.busy": "2023-02-13T23:41:55.048871Z", - "iopub.status.idle": "2023-02-13T23:41:55.053284Z", - "shell.execute_reply": "2023-02-13T23:41:55.052574Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "optimized config {'model': 'gpt-3.5-turbo', 'max_tokens': 424, 'n': 54, 'prompt': '{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\\\boxed{{}}.', 'stop': None, 'temperature': 0.9177741225129434}\n", - "best result on tuning data {'expected_success': 0.9999998720099207, 'success': 1.0, 'success_vote': 0.95, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2},$$ where $(x_1,y_1)$ and $(x_2,y_2)$ are the given points. 
Plugging in the values $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0),$ we have: $$\\\\sqrt{(3-0)^2+(0-4)^2} = \\\\sqrt{9+16} = \\\\sqrt{25}.$$ Therefore, the distance between the two points is $\\\\boxed{5}$.', 'total_cost': 0.6322379999999999, 'cost': 0.31299, 'inference_cost': 0.015323400000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/n': 54, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.05237627029418945}\n" - ] - } - ], - "source": [ - "print(\"optimized config\", config)\n", - "print(\"best result on tuning data\", analysis.best_result)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Make a request with the tuned config\n", - "\n", - "We can apply the tuned config to a request for an example task:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:55.056205Z", - "iopub.status.busy": "2023-02-13T23:41:55.055631Z", - "iopub.status.idle": "2023-02-13T23:41:56.039259Z", - "shell.execute_reply": "2023-02-13T23:41:56.038427Z" - }, - "slideshow": { - "slide_type": "subslide" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"content\": \"We want to get rid of the square root in the denominator. We can do this by multiplying both the numerator and denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\sqrt{21}.$$ Thus, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}=\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 1,\n", - " \"message\": {\n", - " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}.$$Therefore, the answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 2,\n", - " \"message\": {\n", - " \"content\": \"Using the definition of square roots, we see that $\\\\sqrt{21}\\\\cdot\\\\sqrt{21}=21$.
Therefore, we can write $\\\\frac{21}{\\\\sqrt{21}}$ as $\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 3,\n", - " \"message\": {\n", - " \"content\": \"We start by multiplying both the numerator and the denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}.$$ Simplifying the fraction, we get: $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 4,\n", - " \"message\": {\n", - " \"content\": \"We can simplify $\\\\sqrt{21}$ by finding its prime factorization: $21=3\\\\cdot7$, so $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, \\\\[\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.\\\\] To rationalize this denominator, we need to multiply both the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$, which gives \\\\[\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}\\\\] \\\\[=\\\\boxed{3\\\\sqrt{3}}.\\\\]\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 5,\n", - " \"message\": {\n", - " \"content\": \"We can start by simplifying the denominator. Since $\\\\sqrt{21}$ equals $\\\\sqrt{3} \\\\cdot \\\\sqrt{7}$, we can write:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}}$$\\n\\nTo rationalize the denominator, we need to multiply both the numerator and denominator by $\\\\sqrt{3} \\\\cdot \\\\sqrt{7}$:\\n\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}} \\\\cdot \\\\frac{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}} &= \\\\frac{21 \\\\cdot \\\\sqrt{3} \\\\cdot \\\\sqrt{7}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7} \\\\cdot \\\\sqrt{3} \\\\cdot \\\\sqrt{7}} \\\\\\\\\\n&= \\\\frac{21 \\\\cdot \\\\sqrt{3} \\\\cdot \\\\sqrt{7}}{3 \\\\cdot 7} \\\\\\\\\\n&= \\\\boxed{\\\\frac{3 \\\\sqrt{21}}{7}}\\n\\\\end{align*}\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 6,\n", - " \"message\": {\n", - " \"content\": \"We have $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}$. Therefore, $\\\\boxed{\\\\sqrt{21}}$ is our final answer.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 7,\n", - " \"message\": {\n", - " \"content\": \"We have $\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\sqrt{21}$. 
Thus, our final answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 8,\n", - " \"message\": {\n", - " \"content\": \"We can begin by multiplying both the numerator and denominator of the fraction by $\\\\sqrt{21}$, since $\\\\sqrt{21}/\\\\sqrt{21} = 1$:\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}}&=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}\\\\\\\\\\n&=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}\\\\\\\\\\n&=\\\\sqrt{21}.\\n\\\\end{align*}\\nTherefore, the simplified expression is $\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 9,\n", - " \"message\": {\n", - " \"content\": \"We start by multiplying the numerator and denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{(\\\\sqrt{21})^2} = \\\\frac{21\\\\sqrt{21}}{21}$$ Simplifying, we get: $$\\\\frac{21\\\\sqrt{21}}{21}= \\\\sqrt{21}$$ Therefore, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}=\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 10,\n", - " \"message\": {\n", - " \"content\": \"We can simplify the denominator by rationalizing it, which means getting rid of the square root in the denominator. To do this, we can multiply both the numerator and denominator by $\\\\sqrt{21}$:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}$$\\n\\nTherefore, $\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 11,\n", - " \"message\": {\n", - " \"content\": \"We start by rationalizing the denominator. We want to get rid of the square root in the denominator, so we need to multiply both the numerator and denominator by something that will give us a perfect square in the denominator. We notice that $\\\\sqrt{21} = \\\\sqrt{3 \\\\cdot 7}$, so we can multiply both the numerator and denominator by $\\\\sqrt{3}$ to get:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{3}}{\\\\sqrt{3}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{21}\\\\sqrt{3}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3 \\\\cdot 21}}$$\\n\\nNow we can simplify the denominator:\\n\\n$$\\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3 \\\\cdot 21}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{21}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{3 \\\\cdot 7}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3^2 \\\\cdot 7}} = \\\\boxed{\\\\frac{3\\\\sqrt{7}}{2}}$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 12,\n", - " \"message\": {\n", - " \"content\": \"We can simplify $\\\\sqrt{21}$ by recognizing that $21=3\\\\cdot7$ and $\\\\sqrt{3^2\\\\cdot7}=3\\\\sqrt{7}$. 
Therefore, we have $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3^2\\\\cdot7}} = \\\\frac{21}{3\\\\sqrt{7}} = \\\\frac{7\\\\cdot3}{3\\\\cdot\\\\sqrt{7}} = \\\\frac{7}{\\\\sqrt{7}}.$$We can rationalize this by multiplying top and bottom by $\\\\sqrt{7}$, giving us $$\\\\frac{7}{\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{7}}{\\\\sqrt{7}} = \\\\frac{7\\\\sqrt{7}}{7} = \\\\boxed{\\\\sqrt{7}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 13,\n", - " \"message\": {\n", - " \"content\": \"We want to get rid of the square root sign in the denominator. One way to do this is to multiply both the numerator and denominator by $\\\\sqrt{21}$. We get \\\\[\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21\\\\cdot\\\\sqrt{21}}{(\\\\sqrt{21})\\\\cdot(\\\\sqrt{21})} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\boxed{\\\\sqrt{21}}.\\\\]Note that we simplified by dividing $21$ by $21$ to get $1$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 14,\n", - " \"message\": {\n", - " \"content\": \"To rationalize the denominator, we need to eliminate the square root. We can do this by multiplying both the numerator and denominator by $\\\\sqrt{21}$:$$\\\\dfrac{21}{\\\\sqrt{21}}\\\\cdot\\\\dfrac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\dfrac{21\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 15,\n", - " \"message\": {\n", - " \"content\": \"To rationalize the denominator, we need to eliminate the radical from the denominator. We can do this by multiplying both the numerator and denominator by $\\\\sqrt{21}$. \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}&=\\\\frac{21\\\\sqrt{21}}{21}\\\\\\\\\\n&=\\\\frac{\\\\cancelto{1}{21}\\\\cdot \\\\cancel{\\\\sqrt{21}}\\\\cdot \\\\sqrt{21}}{\\\\cancel{21}}\\\\\\\\\\n&=\\\\sqrt{21}\\n\\\\end{align*}Therefore, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}=\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 16,\n", - " \"message\": {\n", - " \"content\": \"We begin by multiplying the top and bottom of the fraction by $\\\\sqrt{21}$: \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} &= \\\\frac{21\\\\sqrt{21}}{21}\\\\\\\\\\n&= \\\\frac{\\\\cancel{21}\\\\cdot\\\\sqrt{21}}{\\\\cancel{21}}\\\\\\\\\\n&= \\\\sqrt{\\\\boxed{21}}.\\n\\\\end{align*}\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 17,\n", - " \"message\": {\n", - " \"content\": \"We rationalize the denominator as follows: \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21\\\\cdot \\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}}\\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21}\\\\\\\\\\n&= \\\\boxed{\\\\sqrt{21}}.\\n\\\\end{align*}\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 18,\n", - " \"message\": {\n", - " \"content\": \"We can begin by simplifying the square root in the denominator. 
The largest perfect square that divides into 21 is 3, so we can break apart the square root as follows: $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3\\\\cdot7}} = \\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}$$ Now we can use the property that $\\\\frac{1}{\\\\sqrt{x}} = \\\\frac{\\\\sqrt{x}}{x}$. Applying this to our expression, we have: $$\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}} = \\\\frac{21\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}} = \\\\frac{21\\\\sqrt{21}}{3\\\\cdot7} = \\\\frac{\\\\cancel{21}\\\\sqrt{21}}{\\\\cancel{3}\\\\cdot\\\\cancel{7}} = \\\\boxed{\\\\sqrt{21}}$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 19,\n", - " \"message\": {\n", - " \"content\": \"We start by writing $\\\\sqrt{21}$ as a product of simpler terms. Since $21=3\\\\cdot7$ and neither factor has a perfect square factor other than 1, we can write $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, we have \\\\[\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.\\\\] To rationalize the denominator, we multiply both the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$: \\\\[\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}} =\\\\frac{21\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7} = \\\\boxed{\\\\frac{3\\\\sqrt{21}}{7}}.\\\\]\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 20,\n", - " \"message\": {\n", - " \"content\": \"We want to remove the radical from the denominator. To do this, we multiply the top and bottom of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}.$$ Simplifying the numerator, we have $21\\\\cdot\\\\sqrt{21}=21^{\\\\frac{3}{2}}$. Therefore, the simplified version of $\\\\frac{21}{\\\\sqrt{21}}$ is $\\\\boxed{21^{\\\\frac{3}{2}}}$ or $\\\\boxed{21\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 21,\n", - " \"message\": {\n", - " \"content\": \"We can simplify $\\\\frac{21}{\\\\sqrt{21}}$ by multiplying both the numerator and denominator by $\\\\sqrt{21}$:\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21\\\\cdot\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}} \\\\\\\\\\n&= \\\\frac{21\\\\cdot\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\boxed{\\\\sqrt{21}}.\\n\\\\end{align*}\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 22,\n", - " \"message\": {\n", - " \"content\": \"We can begin by simplifying the square root in the denominator. We notice that $\\\\sqrt{21}$ can be divided into $\\\\sqrt{3}$ and $\\\\sqrt{7}$. Specifically, $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. 
Therefore, we have \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}} \\\\\\\\\\n&= \\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{1}{\\\\sqrt{7}} \\\\\\\\\\n&= \\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{\\\\sqrt{7}}{7} \\\\\\\\\\n&= \\\\frac{21\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot 7} \\\\\\\\\\n&= \\\\boxed{\\\\frac{3\\\\sqrt{7}}{\\\\sqrt{3}}}.\\n\\\\end{align*}\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 23,\n", - " \"message\": {\n", - " \"content\": \"We start by noticing that $\\\\sqrt{21}$ can be simplified. The prime factorization of 21 is $3\\\\cdot7$, so $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.$$ To rationalize the denominator, we need to get rid of the $\\\\sqrt{3}$ and $\\\\sqrt{7}$ in the denominator. We can do this by multiplying the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. We get $$\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\sqrt{21}}{3\\\\cdot7}=\\\\frac{3\\\\cdot7\\\\cdot\\\\sqrt{21}}{3\\\\cdot7}.$$ Cancelling the 3's and the 7's, we get $$\\\\frac{3\\\\cdot7\\\\cdot\\\\sqrt{21}}{3\\\\cdot7} = \\\\boxed{\\\\sqrt{21}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 24,\n", - " \"message\": {\n", - " \"content\": \"We want to get rid of the radical in the denominator. One way to do this is to multiply both the numerator and the denominator by the radical (or a simplified version of it). In this case, $\\\\sqrt{21}$ is already simplified, so we can just multiply top and bottom by $\\\\sqrt{21}$: \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}\\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\boxed{\\\\sqrt{21}}.\\n\\\\end{align*}We simplified $\\\\frac{21\\\\sqrt{21}}{21}$ by canceling the common factor of 21 and leaving only $\\\\sqrt{21}$ in the numerator.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 25,\n", - " \"message\": {\n", - " \"content\": \"We know that $\\\\sqrt{21}$ can be simplified as $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, \\n\\n$$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.$$ \\n\\nWe can now rationalize the denominator by multiplying both the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$:\\n\\n$$\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}=\\\\boxed{3\\\\sqrt{3}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 26,\n", - " \"message\": {\n", - " \"content\": \"Since the denominator has a radical, we want to get rid of it. 
We can do this by multiplying both the numerator and denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}$$\\nSimplifying, we see that the $21$s cancel: $$\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 27,\n", - " \"message\": {\n", - " \"content\": \"We can begin by simplifying $\\\\sqrt{21}$ as a product of its prime factors: $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.$$ To rationalize the denominator, we can multiply both the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$: $$\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}.$$ Simplifying the fraction in the numerator, we have $$\\\\frac{21\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}=\\\\frac{3\\\\cdot7\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}=\\\\boxed{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 28,\n", - " \"message\": {\n", - " \"content\": \"We want to get rid of the radical in the denominator. To do this, we can multiply the numerator and denominator by $\\\\sqrt{21}$, which is equal to $\\\\sqrt{21}\\\\cdot\\\\sqrt{21} = 21$: \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} \\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\sqrt{21}.\\n\\\\end{align*}Thus, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 29,\n", - " \"message\": {\n", - " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 30,\n", - " \"message\": {\n", - " \"content\": \"We begin by multiplying both the numerator and denominator of the fraction $\\\\frac{21}{\\\\sqrt{21}}$ by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}.$$ Simplifying the numerator by canceling $21$ from both the numerator and denominator, we get: $$\\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}.$$ Therefore, $\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{21}}$ (after rationalizing the denominator).\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 31,\n", - " \"message\": {\n", - " \"content\": \"We start by writing $\\\\dfrac{21}{\\\\sqrt{21}}$ as $\\\\dfrac{21}{\\\\sqrt{21}} \\\\cdot \\\\dfrac{\\\\sqrt{21}}{\\\\sqrt{21}}$. 
This gives us \\\\begin{align*}\\n\\\\dfrac{21}{\\\\sqrt{21}} \\\\cdot \\\\dfrac{\\\\sqrt{21}}{\\\\sqrt{21}} &= \\\\dfrac{21\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot \\\\sqrt{21}} \\\\\\\\\\n&= \\\\dfrac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&=\\\\boxed{\\\\sqrt{21}}.\\n\\\\end{align*}\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 32,\n", - " \"message\": {\n", - " \"content\": \"We can rationalize the denominator $\\\\sqrt{21}$ by multiplying both the numerator and denominator by $\\\\sqrt{21}$. \\n\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21 \\\\cdot \\\\sqrt{21}}{\\\\sqrt{21} \\\\cdot \\\\sqrt{21}} \\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\sqrt{21}\\n\\\\end{align*}\\n\\nTherefore, the simplified answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 33,\n", - " \"message\": {\n", - " \"content\": \"We can begin by multiplying the numerator and denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}.$$ Simplifying, we find that $\\\\frac{21\\\\sqrt{21}}{21} = \\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 34,\n", - " \"message\": {\n", - " \"content\": \"We can rationalize the denominator by multiplying the numerator and denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot \\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot \\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 35,\n", - " \"message\": {\n", - " \"content\": \"We can start by simplifying $\\\\sqrt{21}$. Since $21$ has no perfect square factors other than $1$, we have $\\\\sqrt{21} = \\\\sqrt{3\\\\cdot7}= \\\\sqrt{3}\\\\sqrt{7}$. Thus, we have:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3}\\\\sqrt{7}}$$\\n\\nTo rationalize the denominator, we want to get rid of the radical in the denominator by multiplying both the numerator and denominator by $\\\\sqrt{3}\\\\sqrt{7}$. 
This gives:\\n\\n$$\\\\frac{21}{\\\\sqrt{3}\\\\sqrt{7}} \\\\cdot \\\\frac{\\\\sqrt{3}\\\\sqrt{7}}{\\\\sqrt{3}\\\\sqrt{7}} = \\\\frac{21\\\\sqrt{3}\\\\sqrt{7}}{3\\\\cdot7} = \\\\frac{3\\\\sqrt{3}\\\\sqrt{7}}{1}$$\\n\\nTherefore, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}$ simplified with a rationalized denominator is $\\\\boxed{3\\\\sqrt{3}\\\\sqrt{7}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 36,\n", - " \"message\": {\n", - " \"content\": \"We start by multiplying the numerator and denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21\\\\cdot\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}.$$ Now we can simplify by canceling the common factor of 21: $$\\\\frac{21\\\\sqrt{21}}{21} = \\\\boxed{\\\\sqrt{21}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 37,\n", - " \"message\": {\n", - " \"content\": \"We can simplify this fraction by rationalizing the denominator, which means getting rid of the square root in the denominator.\\n\\nTo do this, we can multiply both the numerator and denominator by $\\\\sqrt{21}$:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}$$\\n\\nSimplifying the fraction, we get:\\n\\n$$\\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}$$\\n\\nTherefore, $\\\\boxed{\\\\sqrt{21}}$ is the final, simplified answer.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 38,\n", - " \"message\": {\n", - " \"content\": \"We notice that $\\\\sqrt{21}=3\\\\sqrt{7}$. So, rationalizing the denominator of $\\\\frac{21}{\\\\sqrt{21}}$ is the same as multiplying it by $\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}$:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}= \\\\frac{3\\\\cdot7\\\\cdot\\\\sqrt{3\\\\cdot7}}{3\\\\cdot7} = \\\\frac{\\\\sqrt{3\\\\cdot7}}{1} = \\\\boxed{\\\\sqrt{21}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 39,\n", - " \"message\": {\n", - " \"content\": \"We start by noting that $\\\\sqrt{21} = \\\\sqrt{3\\\\cdot7} = \\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}} = \\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{1}{\\\\sqrt{7}} = \\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{\\\\sqrt{7}}{7} = \\\\boxed{\\\\frac{3\\\\sqrt{7}}{\\\\sqrt{3}}}. $$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 40,\n", - " \"message\": {\n", - " \"content\": \"We can begin by simplifying the denominator. Recall that $\\\\sqrt{21}$ can be written as $21^{\\\\frac{1}{2}}$. So, we have:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{21^{\\\\frac{1}{2}}}$$\\n\\nTo rationalize the denominator, we need to get rid of the radical in the denominator. We can do this by multiplying both the numerator and the denominator by $\\\\sqrt{21}$. 
This gives:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}$$\\n\\nNow we can simplify:\\n\\n$$\\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}$$\\n\\nTherefore, $$\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}} = \\\\sqrt{21}$$\\n\\nAnd our final answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 41,\n", - " \"message\": {\n", - " \"content\": \"We want to eliminate the radical in the denominator of the fraction $\\\\frac{21}{\\\\sqrt{21}}$. To do this, we can multiply both the numerator and denominator by $\\\\sqrt{21}$. This gives us \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21 \\\\cdot \\\\sqrt{21}}{\\\\sqrt{21} \\\\cdot \\\\sqrt{21}}\\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21}\\\\\\\\\\n&= \\\\sqrt{21}.\\n\\\\end{align*} Therefore, $\\\\boxed{\\\\sqrt{21}}$ is our final answer.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 42,\n", - " \"message\": {\n", - " \"content\": \"We start by multiplying the numerator and denominator of the fraction by $\\\\sqrt{21}$:\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21 \\\\cdot \\\\sqrt{21}}{\\\\sqrt{21} \\\\cdot \\\\sqrt{21}} \\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\sqrt{21}\\n\\\\end{align*}\\nTherefore, $\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 43,\n", - " \"message\": {\n", - " \"content\": \"We can begin by simplifying the denominator using the property $\\\\sqrt{n^2}=n$ for any positive real number $n$. We have:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3^2\\\\cdot 7}}=\\\\frac{21}{3\\\\sqrt{7}}$$\\n\\nWe can simplify this fraction further by dividing both the numerator and denominator by the greatest common factor, which is 3. We obtain:\\n\\n$$\\\\frac{21}{3\\\\sqrt{7}}=\\\\frac{7}{\\\\sqrt{7}}=\\\\frac{7\\\\cdot\\\\sqrt{7}}{7}=\\\\boxed{\\\\sqrt{7}}$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 44,\n", - " \"message\": {\n", - " \"content\": \"We want to get rid of the square root in the denominator. 
To do this, we can multiply both the numerator and the denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}}=\\\\frac{21\\\\cdot \\\\sqrt{21}}{21}.$$Simplifying the fraction, we have $\\\\frac{21}{21}=1$, so our final answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 45,\n", - " \"message\": {\n", - " \"content\": \"Since $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$, we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{1}{\\\\sqrt{7}}=\\\\boxed{3\\\\sqrt{7}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 46,\n", - " \"message\": {\n", - " \"content\": \"We have \\\\[\\n\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\cdot \\\\sqrt{21}}{21}=\\\\sqrt{21}.\\n\\\\] Thus, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}= \\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 47,\n", - " \"message\": {\n", - " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\frac{\\\\cancel{21}\\\\sqrt{\\\\cancel{21}\\\\cdot3}}{\\\\cancel{21}}=\\\\boxed{\\\\sqrt{3}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 48,\n", - " \"message\": {\n", - " \"content\": \"We want to find a way to eliminate the radical in the denominator. One way to do this is to multiply the numerator and denominator by the radical, since $(\\\\sqrt{21})^2=21$. 
Doing this gives us: $$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\frac{\\\\cancel{21}\\\\sqrt{3}\\\\cdot\\\\cancel{7}}{\\\\cancel{21}} = \\\\boxed{\\\\sqrt{3}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"length\",\n", - " \"index\": 49,\n", - " \"message\": {\n", - " \"content\": \"We begin by noticing that $\\\\sqrt{21}=21^{\\\\frac12}$, so we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{21^{\\\\frac12}}.$$ Using the rule $a^{m}\\\\cdot a^{n}=a^{m+n}$, we can write $21=3\\\\cdot7$ as $$21^{\\\\frac12}=3^{\\\\frac12}\\\\cdot7^{\\\\frac12}.$$ Thus, we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{21^{\\\\frac12}}=\\\\frac{21}{3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}.$$ To rationalize the denominator, we multiply the top and bottom of the fraction by $3^{\\\\frac12}\\\\cdot7^{\\\\frac12}$: $$\\\\frac{21}{3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}\\\\cdot\\\\frac{3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}{3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}=\\\\frac{21\\\\cdot3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}{3\\\\cdot7}.$$ Simplifying the numerator, we have $$\\\\frac{21\\\\cdot3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}{3\\\\cdot7}=\\\\frac{3^{\\\\frac12}\\\\cdot7^{\\\\frac32}}{3}.$$ Now, using the rule $a^{m}/a^{n}=a^{m-n}$, we simplify further: $$\\\\frac{3^{\\\\frac12}\\\\cdot7^{\\\\frac32}}{3}=3^{-\\\\frac12}\\\\cdot7^{\\\\frac32}=\\\\frac{7^{\\\\frac32}}{\\\\sqrt{3}}}.$$ Finally, we rationalize the denominator by multiplying top and bottom by $\\\\sqrt{3}$: $$\\\\frac{7^{\\\\frac32}}{\\\\sqrt{3}}\\\\cdot\\\\frac{\\\\sqrt{3}}{\\\\sqrt{3}}=\\\\boxed{\\\\\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 50,\n", - " \"message\": {\n", - " \"content\": \"To rationalize the denominator, we need to get rid of the square root. We can do this by multiplying both the numerator and denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}.$$ Therefore, the answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 51,\n", - " \"message\": {\n", - " \"content\": \"We start with $\\\\frac{21}{\\\\sqrt{21}}$. To rationalize the denominator, we want to get rid of the square root. We know that $\\\\sqrt{21}$ can be simplified as $\\\\sqrt{3 \\\\cdot 7} = \\\\sqrt{3} \\\\cdot \\\\sqrt{7}$. 
Therefore, we have \\\\[\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}}.\\\\] We can now use the property $\\\\frac{a}{b \\\\cdot c} = \\\\frac{a}{b} \\\\cdot \\\\frac{1}{c}$ to split up the denominator: \\\\[\\\\frac{21}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}} = \\\\frac{21}{\\\\sqrt{3}} \\\\cdot \\\\frac{1}{\\\\sqrt{7}} = \\\\frac{21\\\\sqrt{7}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7} \\\\cdot \\\\sqrt{7}} = \\\\frac{21\\\\sqrt{7}}{7\\\\sqrt{3}}.\\\\] Finally, we simplify by dividing both the numerator and the denominator by $7$: \\\\[\\\\frac{21\\\\sqrt{7}}{7\\\\sqrt{3}} = \\\\frac{3\\\\sqrt{7}}{\\\\sqrt{3}} = \\\\frac{3\\\\sqrt{7}}{\\\\sqrt{3}} \\\\cdot \\\\frac{\\\\sqrt{3}}{\\\\sqrt{3}} = \\\\boxed{3\\\\sqrt{21}}.\\\\]\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 52,\n", - " \"message\": {\n", - " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\frac{3\\\\cdot 7\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot 7}.$$ Simplifying this, we get $$\\\\frac{3\\\\cdot 7\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot 7} = \\\\frac{\\\\cancel{3}\\\\cdot \\\\cancel{7}\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\cancel{3}\\\\cdot \\\\cancel{7}} = \\\\sqrt{3}\\\\sqrt{7}.$$ Therefore, we have $$\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{3}\\\\sqrt{7}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 53,\n", - " \"message\": {\n", - " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}=\\\\frac{\\\\cancel{21}\\\\cdot\\\\sqrt{\\\\cancel{21}\\\\cdot 3}}{\\\\cancel{21}}=\\\\boxed{\\\\sqrt{3}}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " }\n", - " ],\n", - " \"created\": 1679622917,\n", - " \"id\": \"chatcmpl-6xQw98DIHC3S1iQAacY3vjL6TLPRL\",\n", - " \"model\": \"gpt-3.5-turbo-0301\",\n", - " \"object\": \"chat.completion\",\n", - " \"usage\": {\n", - " \"completion_tokens\": 7762,\n", - " \"prompt_tokens\": 50,\n", - " \"total_tokens\": 7812\n", - " }\n", - "}\n", - "{'expected_success': 1.0, 'success': True, 'success_vote': 1.0, 'voted_answer': 'We want to get rid of the square root in the denominator. We can do this by multiplying both the numerator and denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\sqrt{21}.$$ Thus, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}=\\\\boxed{\\\\sqrt{21}}$.'}\n" - ] - } - ], - "source": [ - "responses = oai.ChatCompletion.create(context=tune_data[1], **config)\n", - "metric_results = success_metrics([response[\"message\"][\"content\"].rstrip() for response in responses[\"choices\"]], **tune_data[1])\n", - "print(\"response on an example data instance:\", responses)\n", - "print(\"metric_results on the example data instance:\", metric_results)\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluate the success rate on the test data\n", - "\n", - "You can use flaml's `oai.ChatCompletion.test` to evaluate the performance of an entire dataset with the tuned config. The following code will take a while (30 mins to 1 hour) to evaluate all the test data instances if uncommented and run. It will cost roughly $3. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:56.042764Z", - "iopub.status.busy": "2023-02-13T23:41:56.042086Z", - "iopub.status.idle": "2023-02-13T23:53:05.597643Z", - "shell.execute_reply": "2023-02-13T23:53:05.596603Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'expected_success': 0.9878128576084944, 'success': 0.9950248756218906, 'success_vote': 0.9203980099502488, 'voted_answer': \"We have that $1$ kilowatt is equivalent to $1.36$ horsepower. Therefore, we can set up the proportion $\\\\frac{1\\\\text{ kW}}{1.36\\\\text{ hp}} = \\\\frac{x\\\\text{ kW}}{500\\\\text{ hp}}$, where $x$ is the number of kilowatts that Eric's car's engine can generate. Solving for $x$, we get $x = \\\\frac{(1\\\\text{ kW})(500\\\\text{ hp})}{1.36\\\\text{ hp}} \\\\approx \\\\boxed{368 \\\\text{ kW}}$.\", 'total_cost': 4.194939999999996, 'cost': 3.1735039999999994, 'inference_cost': 0.01577204825870647}\n" - ] - } - ], - "source": [ - "# result = oai.Completion.test(test_data, config, success_metrics)\n", - "# print(\"performance on test data with the tuned config:\", result)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What about the default, untuned gpt-4 config (with the same prompt as the tuned config)? We can evaluate it and compare:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'expected_success': 0.6965174129353234, 'success': 0.6965174129353234, 'success_vote': 0.6965174129353234, 'voted_answer': \"If we let $x$ be the number of kilowatts, then we can set up the proportion $\\\\frac{x\\\\text{ kW}}{500\\\\text{ hp}}=\\\\frac{1\\\\text{ kW}}{1.36\\\\text{ hp}}$. Solving for $x$, we get $x=\\\\frac{500}{1.36} = 367.65$. Rounding to the nearest integer, we get that Eric's car's engine has $\\\\boxed{368}$ kilowatts.\", 'total_cost': 6.009489999999993, 'cost': 1.8145500000000006, 'inference_cost': 0.008809679104477611}\n" - ] - } - ], - "source": [ - "# assuming you have access to gpt-4; otherwise use gpt-3.5-turbo\n", - "# the following code will cost roughly $2 if uncommented and run.\n", - "\n", - "# default_config = {\"model\": 'gpt-4', \"prompt\": prompts[0]}\n", - "# default_result = oai.Completion.test(test_data, default_config, success_metrics)\n", - "# print(\"performance on test data from gpt-4 with a default config:\", default_result)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tuned config succeeds in 92.0% test cases\n", - "untuned config succeeds in 69.7% test cases\n" - ] - } - ], - "source": [ - "# print(\"tuned config succeeds in {:.1f}% test cases\".format(result[\"success_vote\"] * 100))\n", - "# print(\"untuned config succeeds in {:.1f}% test cases\".format(default_result[\"success_vote\"] * 100))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the untuned config has a lower inference cost. What if we heuristically increase the number of responses n to 5?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'expected_success': 0.9181755223880596, 'success': 0.9552238805970149, 'success_vote': 0.8756218905472637, 'voted_answer': \"To figure out how many kilowatts of power Eric's car can generate, we need to find the conversion factor for metric horsepower to kilowatts. To do this, we start by dividing the power in Eric's car in horsepower by the number of kilowatts per horsepower: $$\\\\frac{500\\\\text{ hp}}{1.36\\\\text{ hp/kW}}$$Now, to get to kilowatts, we divide by 1 hp, which gives us $$\\\\frac{500}{1.36}\\\\approx \\\\boxed{368}\\\\text{ kW}$$\", 'total_cost': 14.071600000000004, 'cost': 8.06211, 'inference_cost': 0.039892067164179104}\n" - ] - } - ], - "source": [ - "# config_larger = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"n\": 5}\n", - "# default_result = oai.ChatCompletion.test(test_data, config_larger, success_metrics)\n", - "# print(\"performance on test data from gpt-4 with a default config and n=5:\", default_result)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We find that the 'success_vote' metric is increased at the cost of exceeding the inference budget. But the tuned configuration has both higher 'success_vote' (92% vs. 87%) and lower average inference cost ($0.016 vs. $0.04 per instance).\n", - "\n", - "A developer could use flaml to tune the configuration to satisfy the target inference budget while maximizing the value out of it." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - }, - "vscode": { - "interpreter": { - "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" - } - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "2d910cfd2d2a4fc49fc30fbbdc5576a7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "454146d0f7224f038689031002906e6f": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", - "IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", - "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" - ], - "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", - "tabbable": null, - "tooltip": null - } - }, - "577e1e3cc4db4942b0883577b3b52755": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", - "tabbable": null, - "tooltip": null, - "value": 1 - } - }, - "6086462a12d54bafa59d3c4566f06cb2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "74a6ba0c3cbc4051be0a83e152fe1e62": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "7d3f3d9e15894d05a4d188ff4f466554": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - 
"_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "b40bdfb1ac1d4cffb7cefcb870c64d45": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", - "placeholder": "​", - "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", - "tabbable": null, - "tooltip": null, - "value": " 1/1 [00:00<00:00, 44.69it/s]" - } - }, - "ca245376fd9f4354af6b2befe4af4466": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "dc83c7bff2f241309537a8119dfc7555": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e4ae2b6f5a974fd4bafb6abb9d12ff26": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", - "placeholder": "​", - "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", - "tabbable": null, - "tooltip": null, - "value": "100%" - } - }, - "f1355871cc6f4dd4b50d9df5af20e5c8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - 
"_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebook/integrate_openai.ipynb b/notebook/integrate_openai.ipynb deleted file mode 100644 index 0d4d2ff208..0000000000 --- a/notebook/integrate_openai.ipynb +++ /dev/null @@ -1,1232 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved. \n", - "\n", - "Licensed under the MIT License.\n", - "\n", - "# Use FLAML to Tune OpenAI Models\n", - "\n", - "FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of LLMs.\n", - "\n", - "In this notebook, we tune OpenAI models for code generation. We use [the HumanEval benchmark](https://huggingface.co/datasets/openai_humaneval) released by OpenAI for synthesizing programs from docstrings. \n", - "\n", - "## Requirements\n", - "\n", - "FLAML requires `Python>=3.7`. 
To run this notebook example, please install flaml with the [openai] option:\n", - "```bash\n", - "pip install flaml[openai]==1.1.3\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:36.910966Z", - "iopub.status.busy": "2023-02-24T23:25:36.910473Z", - "iopub.status.idle": "2023-02-24T23:25:36.914554Z", - "shell.execute_reply": "2023-02-24T23:25:36.914030Z" - } - }, - "outputs": [], - "source": [ - "# %pip install flaml[openai]==1.1.3 datasets" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set your OpenAI key:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:36.917301Z", - "iopub.status.busy": "2023-02-24T23:25:36.917011Z", - "iopub.status.idle": "2023-02-24T23:25:36.923156Z", - "shell.execute_reply": "2023-02-24T23:25:36.922619Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you use Azure OpenAI, uncomment the following:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:36.925804Z", - "iopub.status.busy": "2023-02-24T23:25:36.925423Z", - "iopub.status.idle": "2023-02-24T23:25:36.928191Z", - "shell.execute_reply": "2023-02-24T23:25:36.927673Z" - } - }, - "outputs": [], - "source": [ - "# openai.api_type = \"azure\"\n", - "# openai.api_base = \"https://.openai.azure.com/\"\n", - "# openai.api_version = \"2022-12-01\" # change if necessary" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dataset\n", - "\n", - "First, we load the humaneval dataset. The dataset contains 164 examples. We use the first 20 for tuning the generation hyperparameters and the remaining for evaluation. In each example, the \"prompt\" is the prompt string for eliciting the code generation, \"test\" is the Python code of the unit test for the example, and \"entry_point\" is the function name to be tested."
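To see how these three fields fit together, here is a minimal illustrative sketch; `example` stands for one record of the dataset and `completion` for a hypothetical model-generated function body (neither name appears in the notebook), and the success metric defined later in this notebook assembles programs in exactly this way:

```python
# Hypothetical names for illustration: `example` is one dataset record,
# `completion` a model-generated function body.
program = (
    example["prompt"]                        # function signature + docstring
    + completion                             # generated function body
    + "\n" + example["test"]                 # unit-test code defining check()
    + f"\ncheck({example['entry_point']})"   # invoke the tests on the function
)
# Executing `program` without error means the generated code passes the unit tests.
```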
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:36.931255Z", - "iopub.status.busy": "2023-02-24T23:25:36.930838Z", - "iopub.status.idle": "2023-02-24T23:25:39.148799Z", - "shell.execute_reply": "2023-02-24T23:25:39.148113Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset openai_humaneval (/home/vscode/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d025d7cf0bc3438ba290e24d97855d8f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 [0,0,0,0,3,3]\n", - " compare([0,5,0,0,0,4],[4,1,1,0,0,-2]) -> [4,4,1,0,0,6]\n", - " \"\"\"\n", - "\n" - ] - } - ], - "source": [ - "print(tune_data[1][\"prompt\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is one example of the unit test code for verifying the correctness of the generated code:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:39.158398Z", - "iopub.status.busy": "2023-02-24T23:25:39.157766Z", - "iopub.status.idle": "2023-02-24T23:25:39.161396Z", - "shell.execute_reply": "2023-02-24T23:25:39.160797Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def check(candidate):\n", - "\n", - " # Check some simple cases\n", - " assert candidate([1,2,3,4,5,1],[1,2,3,4,2,-2])==[0,0,0,0,3,3], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([0,0,0,0,0,0],[0,0,0,0,0,0])==[0,0,0,0,0,0], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([1,2,3],[-1,-2,-3])==[2,4,6], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([1,2,3,5],[-1,2,3,4])==[2,0,0,1], \"This prints if this assert fails 1 (good for debugging!)\"\n", - "\n", - " # Check some edge cases that are easy to work out by hand.\n", - " assert True, \"This prints if this assert fails 2 (also good for debugging!)\"\n", - "\n", - "\n" - ] - } - ], - "source": [ - "print(tune_data[1][\"test\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Success Metric\n", - "\n", - "Before we start tuning, we need to define the success metric we want to optimize. For each code generation task, if one of the returned responses can pass the test, we consider the task successfully solved. Then we can define the mean success rate of a collection of tasks.\n", - "\n", - "### Define a code executor\n", - "\n", - "First, we write a simple code executor. The code executor takes the generated code and the test code as input, and executes them under a time limit."
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:39.164187Z", - "iopub.status.busy": "2023-02-24T23:25:39.163867Z", - "iopub.status.idle": "2023-02-24T23:25:39.169009Z", - "shell.execute_reply": "2023-02-24T23:25:39.168427Z" - } - }, - "outputs": [], - "source": [ - "import signal\n", - "import subprocess\n", - "import sys\n", - "\n", - "def timeout_handler(signum, frame):\n", - " raise TimeoutError(\"Timed out!\")\n", - "\n", - "signal.signal(signal.SIGALRM, timeout_handler)\n", - "max_exec_time = 3 # seconds\n", - "\n", - "def execute_code(code):\n", - " code = code.strip()\n", - " with open(\"codetest.py\", \"w\") as fout:\n", - " fout.write(code)\n", - " try:\n", - " signal.alarm(max_exec_time)\n", - " result = subprocess.run(\n", - " [sys.executable, \"codetest.py\"],\n", - " stdout=subprocess.DEVNULL,\n", - " stderr=subprocess.PIPE,\n", - " )\n", - " signal.alarm(0)\n", - " except TimeoutError:\n", - " return 0\n", - " return int(result.returncode == 0)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function will create a temp file \"codetest.py\" and execute it in a separate process. It allows the code up to 3 seconds to finish.\n", - "\n", - "### Define a function to evaluate the success of a given program synthesis task" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:39.171752Z", - "iopub.status.busy": "2023-02-24T23:25:39.171347Z", - "iopub.status.idle": "2023-02-24T23:25:39.176343Z", - "shell.execute_reply": "2023-02-24T23:25:39.175510Z" - } - }, - "outputs": [], - "source": [ - "def success_metrics(responses, prompt, test, entry_point):\n", - " \"\"\"Check if the task is successful.\n", - "\n", - " Args:\n", - " responses (list): The list of responses.\n", - " prompt (str): The input prompt.\n", - " test (str): The test code.\n", - " entry_point (str): The name of the function.\n", - "\n", - " Returns:\n", - " dict: The success metrics.\n", - " \"\"\"\n", - " success_list = []\n", - " n = len(responses)\n", - " for i in range(n):\n", - " response = responses[i]\n", - " code = f\"{prompt}{response}\\n{test}\\ncheck({entry_point})\"\n", - " succeed = execute_code(code)\n", - " success_list.append(succeed)\n", - " return {\n", - " # 1 - (1 - success_rate)**n: chance that at least one of n responses passes\n", - " \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n", - " \"success\": any(s for s in success_list),\n", - " }\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Use the tuning data to find a good configuration\n", - "\n", - "### Import the oai and tune subpackages from flaml.\n", - "\n", - "FLAML provides an API for hyperparameter optimization of OpenAI models, `oai.Completion.tune`, and an API for making a request with the tuned config, `oai.Completion.create`.
First, we import oai from flaml:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:39.179030Z", - "iopub.status.busy": "2023-02-24T23:25:39.178624Z", - "iopub.status.idle": "2023-02-24T23:25:40.584410Z", - "shell.execute_reply": "2023-02-24T23:25:40.583802Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "from flaml import oai, tune" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:40.587815Z", - "iopub.status.busy": "2023-02-24T23:25:40.587283Z", - "iopub.status.idle": "2023-02-24T23:25:40.590826Z", - "shell.execute_reply": "2023-02-24T23:25:40.590158Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "oai.Completion.set_cache(seed)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. The caches for different seeds are stored separately.\n", - "\n", - "### Perform tuning\n", - "\n", - "The tuning will take a while to finish, depending on the optimization budget. It is performed under the budgets specified below.\n", - "\n", - "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.02 means the target inference budget is 0.02 dollars, which translates to 1000 tokens (input + output combined) if the text Davinci model is used.\n", - "* `optimization_budget` is the total budget allowed to perform the tuning. For example, 5 means 5 dollars are allowed in total, which translates to 250K tokens for the text Davinci model.\n", - "* `num_samples` is the number of different hyperparameter configurations allowed to try. The tuning will stop after either `num_samples` trials are run or `optimization_budget` dollars are spent, whichever happens first. -1 means no hard restriction on the number of trials, and the actual number is decided by `optimization_budget`.\n", - "\n", - "Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces, etc. The default search space is:\n", - "\n", - "```python\n", - "default_search_space = {\n", - " \"model\": tune.choice([\n", - " \"text-ada-001\",\n", - " \"text-babbage-001\",\n", - " \"text-davinci-003\",\n", - " \"gpt-3.5-turbo\",\n", - " \"gpt-4\",\n", - " ]),\n", - " \"temperature_or_top_p\": tune.choice(\n", - " [\n", - " {\"temperature\": tune.uniform(0, 1)},\n", - " {\"top_p\": tune.uniform(0, 1)},\n", - " ]\n", - " ),\n", - " \"max_tokens\": tune.lograndint(50, 1000),\n", - " \"n\": tune.randint(1, 100),\n", - " \"prompt\": \"{prompt}\",\n", - "}\n", - "```\n", - "\n", - "The default search space can be overridden by users' input.\n", - "For example, the following code specifies four choices for the prompt and a fixed list of stop sequences. For hyperparameters which don't appear in users' input, the default search space will be used. If you don't have access to gpt-4 or would like to modify the choice of models, you can provide a different search space for `model`."
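As a concrete illustration, a minimal sketch of such an override might look as follows. This sketch assumes the `tune_data` and `success_metrics` defined in this notebook, and that keyword arguments passed to `oai.Completion.tune` (e.g., `model`) replace the corresponding entries of the default search space; it is not the exact tuning cell run below:

```python
# A minimal sketch, not the notebook's actual tuning cell.
config, analysis = oai.Completion.tune(
    data=tune_data,              # tuning data loaded earlier
    metric="expected_success",   # metric returned by success_metrics
    mode="max",                  # maximize the metric
    eval_func=success_metrics,   # evaluation function defined above
    inference_budget=0.02,       # target average $ per instance
    optimization_budget=5,       # total $ allowed for tuning
    num_samples=-1,              # let optimization_budget bound the number of trials
    model=tune.choice(["text-davinci-003", "gpt-3.5-turbo"]),  # narrowed model choice
)
```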
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:25:40.593603Z", - "iopub.status.busy": "2023-02-24T23:25:40.593269Z", - "iopub.status.idle": "2023-02-24T23:26:38.349191Z", - "shell.execute_reply": "2023-02-24T23:26:38.348392Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m[I 2023-03-26 02:53:26,384]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", - "\u001b[32m[I 2023-03-26 02:53:26,387]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 03-26 02:53:26] {811} INFO - trial 1 config: {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:29] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.09264000000000001, 'cost': 0.09264000000000001, 'inference_cost': 0.004632, 'training_iteration': 0, 'config': {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-davinci-003', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 3.5772321224212646}\n", - "[flaml.tune.tune: 03-26 02:53:29] {811} INFO - trial 2 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:30] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.09429879999999999, 'cost': 0.0016588, 'inference_cost': 7.264e-05, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5873167514801025}\n", - "[flaml.tune.tune: 03-26 02:53:30] {811} INFO - trial 3 config: {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:31] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.09782479999999999, 'cost': 0.003526, 'inference_cost': 0.00016342499999999997, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-babbage-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6068365573883057}\n", - "[flaml.tune.tune: 03-26 02:53:31] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:31] {215} INFO - result: {'expected_success': 0.2, 'success': 0.2, 'total_cost': 0.10643079999999999, 'cost': 0.008606, 'inference_cost': 
0.0004394, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5878369808197021}\n", - "[flaml.tune.tune: 03-26 02:53:31] {811} INFO - trial 5 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:32] {215} INFO - result: {'expected_success': 0.8, 'success': 0.8, 'total_cost': 0.2603308, 'cost': 0.15389999999999998, 'inference_cost': 0.007861499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6071126461029053}\n", - "[flaml.tune.tune: 03-26 02:53:32] {811} INFO - trial 6 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.2629064, 'cost': 0.0025756000000000004, 'inference_cost': 0.00011848, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.7605307121989587}, 'config/max_tokens': 82, 'config/n': 9, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.4761645793914795}\n", - "[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 7 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.14217004760152696}, 'max_tokens': 152, 'n': 67, 'prompt': 2, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.4132364, 'cost': 0.15033000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.14217004760152696}, 'max_tokens': 152, 'n': 67, 'prompt': 2, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.14217004760152696}, 'config/max_tokens': 152, 'config/n': 67, 'config/prompt': 2, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0022079944610595703}\n", - "[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 8 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.30070005663620336}, 'max_tokens': 70, 'n': 83, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.6260264, 'cost': 0.21278999999999998, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.30070005663620336}, 'max_tokens': 70, 'n': 83, 'prompt': 3, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.30070005663620336}, 'config/max_tokens': 70, 'config/n': 83, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0022161006927490234}\n", - 
"[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 9 config: {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.16501589771914849}, 'max_tokens': 161, 'n': 10, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:53:43] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.6310854, 'cost': 0.005059, 'inference_cost': 0.00023457499999999997, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.16501589771914849}, 'max_tokens': 161, 'n': 10, 'prompt': 3, 'stop': 0}, 'config/model': 'text-babbage-001', 'config/temperature_or_top_p': {'temperature': 0.16501589771914849}, 'config/max_tokens': 161, 'config/n': 10, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.868851661682129}\n", - "[flaml.tune.tune: 03-26 02:53:43] {811} INFO - trial 10 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.5902013629854229}, 'max_tokens': 56, 'n': 36, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:54:05] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.6344234000000001, 'cost': 0.003338, 'inference_cost': 0.0001522, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.5902013629854229}, 'max_tokens': 56, 'n': 36, 'prompt': 3, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.5902013629854229}, 'config/max_tokens': 56, 'config/n': 36, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 21.348156690597534}\n", - "[flaml.tune.tune: 03-26 02:54:05] {811} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.763240587143681}, 'max_tokens': 693, 'n': 42, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:54:33] {215} INFO - result: {'expected_success': 0.3476191678990812, 'success': 0.35, 'total_cost': 0.7530034000000003, 'cost': 0.11858000000000002, 'inference_cost': 0.005490999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.763240587143681}, 'max_tokens': 693, 'n': 42, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.763240587143681}, 'config/max_tokens': 693, 'config/n': 42, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 28.24349284172058}\n", - "[flaml.tune.tune: 03-26 02:54:33] {811} INFO - trial 12 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.2927979762895091}, 'max_tokens': 60, 'n': 97, 'prompt': 2, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:54:33] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.9340534000000004, 'cost': 0.18105, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.2927979762895091}, 'max_tokens': 60, 'n': 97, 'prompt': 2, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.2927979762895091}, 'config/max_tokens': 60, 'config/n': 97, 'config/prompt': 2, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.002497434616088867}\n", - "[flaml.tune.tune: 03-26 02:54:33] {811} INFO - trial 13 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.7186028103822503}, 'max_tokens': 288, 'n': 4, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:54:35] {215} INFO - result: {'expected_success': 0.28359375, 'success': 0.35, 'total_cost': 0.9496594000000004, 
'cost': 0.015605999999999998, 'inference_cost': 0.0007894, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.7186028103822503}, 'max_tokens': 288, 'n': 4, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.7186028103822503}, 'config/max_tokens': 288, 'config/n': 4, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 2.29030704498291}\n", - "[flaml.tune.tune: 03-26 02:54:35] {811} INFO - trial 14 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'top_p': 0.3653649712141158}, 'max_tokens': 96, 'n': 75, 'prompt': 1, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:55:20] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.9550898000000005, 'cost': 0.0054304, 'inference_cost': 0.00026122, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'top_p': 0.3653649712141158}, 'max_tokens': 96, 'n': 75, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'top_p': 0.3653649712141158}, 'config/max_tokens': 96, 'config/n': 75, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 44.837317943573}\n", - "[flaml.tune.tune: 03-26 02:55:20] {811} INFO - trial 15 config: {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.3814115349046321}, 'max_tokens': 791, 'n': 92, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:55:20] {215} INFO - result: {'expected_success': 0, 'total_cost': 1.0798498000000005, 'cost': 0.12475999999999998, 'training_iteration': 0, 'config': {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.3814115349046321}, 'max_tokens': 791, 'n': 92, 'prompt': 3, 'stop': 0}, 'config/model': 'text-davinci-003', 'config/temperature_or_top_p': {'temperature': 0.3814115349046321}, 'config/max_tokens': 791, 'config/n': 92, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0024149417877197266}\n", - "[flaml.tune.tune: 03-26 02:55:20] {811} INFO - trial 16 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4284507389678964}, 'max_tokens': 398, 'n': 11, 'prompt': 3, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:55:29] {215} INFO - result: {'expected_success': 0.5484931390416686, 'success': 0.55, 'total_cost': 1.1118038000000003, 'cost': 0.031954, 'inference_cost': 0.0015885000000000003, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4284507389678964}, 'max_tokens': 398, 'n': 11, 'prompt': 3, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4284507389678964}, 'config/max_tokens': 398, 'config/n': 11, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 9.271101951599121}\n", - "[flaml.tune.tune: 03-26 02:55:29] {811} INFO - trial 17 config: {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}\n", - "[flaml.tune.tune: 03-26 02:55:46] {215} INFO - result: {'expected_success': 0.8822303234803123, 'success': 0.9, 'total_cost': 2.1304238, 'cost': 1.0186199999999999, 'inference_cost': 0.0484995, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}, 'config/model': 'gpt-4', 'config/max_tokens': 211, 'config/n': 13, 'config/prompt': 1, 
'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.25447895557126815}, 'experiment_tag': 'exp', 'time_total_s': 16.604310512542725}\n", - "[flaml.tune.tune: 03-26 02:55:46] {811} INFO - trial 18 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9761031076386442}, 'max_tokens': 349, 'n': 23, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:56:05] {215} INFO - result: {'expected_success': 0.3551828400470255, 'success': 0.4, 'total_cost': 2.1919698000000003, 'cost': 0.061546, 'inference_cost': 0.0030944, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9761031076386442}, 'max_tokens': 349, 'n': 23, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9761031076386442}, 'config/max_tokens': 349, 'config/n': 23, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 19.451276063919067}\n", - "[flaml.tune.tune: 03-26 02:56:05] {811} INFO - trial 19 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9822374507369328}, 'max_tokens': 393, 'n': 22, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-26 02:56:18] {215} INFO - result: {'expected_success': 0.2898979473186428, 'success': 0.35, 'total_cost': 2.2507018000000003, 'cost': 0.058732, 'inference_cost': 0.0029537, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9822374507369328}, 'max_tokens': 393, 'n': 22, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9822374507369328}, 'config/max_tokens': 393, 'config/n': 22, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 13.075204372406006}\n", - "[flaml.tune.tune: 03-26 02:56:18] {811} INFO - trial 20 config: {'model': 'gpt-4', 'max_tokens': 348, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.36865945026811975}}\n", - "[flaml.tune.tune: 03-26 02:56:19] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 2.4012418000000006, 'cost': 0.15053999999999995, 'inference_cost': 0.007693499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 348, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.36865945026811975}}, 'config/model': 'gpt-4', 'config/max_tokens': 348, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'experiment_tag': 'exp', 'time_total_s': 0.6143312454223633}\n", - "[flaml.tune.tune: 03-26 02:56:19] {811} INFO - trial 21 config: {'model': 'text-ada-001', 'max_tokens': 130, 'n': 22, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.22084263211180838}}\n", - "[flaml.tune.tune: 03-26 02:56:32] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.4043414000000016, 'cost': 0.0030996000000000005, 'inference_cost': 0.00014468, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'max_tokens': 130, 'n': 22, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.22084263211180838}}, 'config/model': 'text-ada-001', 'config/max_tokens': 130, 'config/n': 22, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.22084263211180838}, 'experiment_tag': 'exp', 'time_total_s': 13.137321710586548}\n", - "[flaml.tune.tune: 03-26 02:56:32] {811} INFO - trial 22 config: {'model': 'text-ada-001', 
'max_tokens': 342, 'n': 4, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.2881152790307279}}\n", - "[flaml.tune.tune: 03-26 02:56:35] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.4061918000000024, 'cost': 0.0018504, 'inference_cost': 8.222e-05, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'max_tokens': 342, 'n': 4, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.2881152790307279}}, 'config/model': 'text-ada-001', 'config/max_tokens': 342, 'config/n': 4, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.2881152790307279}, 'experiment_tag': 'exp', 'time_total_s': 2.4484035968780518}\n", - "[flaml.tune.tune: 03-26 02:56:35] {811} INFO - trial 23 config: {'model': 'gpt-4', 'max_tokens': 253, 'n': 23, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.41254458573656}}\n", - "[flaml.tune.tune: 03-26 02:56:35] {215} INFO - result: {'expected_success': 0, 'total_cost': 2.618831800000003, 'cost': 0.21264, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 253, 'n': 23, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.41254458573656}}, 'config/model': 'gpt-4', 'config/max_tokens': 253, 'config/n': 23, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.41254458573656}, 'experiment_tag': 'exp', 'time_total_s': 0.003139972686767578}\n", - "[flaml.tune.tune: 03-26 02:56:35] {811} INFO - trial 24 config: {'model': 'gpt-4', 'max_tokens': 176, 'n': 3, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.0964133254059763}}\n", - "[flaml.tune.tune: 03-26 02:56:36] {215} INFO - result: {'expected_success': 0.8185185185185185, 'success': 0.85, 'total_cost': 2.912231800000003, 'cost': 0.29339999999999994, 'inference_cost': 0.014836499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 176, 'n': 3, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.0964133254059763}}, 'config/model': 'gpt-4', 'config/max_tokens': 176, 'config/n': 3, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.0964133254059763}, 'experiment_tag': 'exp', 'time_total_s': 1.8556303977966309}\n", - "[flaml.tune.tune: 03-26 02:56:36] {811} INFO - trial 25 config: {'model': 'text-babbage-001', 'max_tokens': 343, 'n': 27, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24286268913046594}}\n", - "[flaml.tune.tune: 03-26 02:56:55] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.9569863000000023, 'cost': 0.04475450000000001, 'inference_cost': 0.00222485, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'max_tokens': 343, 'n': 27, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24286268913046594}}, 'config/model': 'text-babbage-001', 'config/max_tokens': 343, 'config/n': 27, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.24286268913046594}, 'experiment_tag': 'exp', 'time_total_s': 19.013901472091675}\n", - "[flaml.tune.tune: 03-26 02:56:55] {811} INFO - trial 26 config: {'model': 'text-babbage-001', 'max_tokens': 130, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.26609522201207036}}\n", - "[flaml.tune.tune: 03-26 02:56:56] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.9595088000000023, 'cost': 0.0025224999999999996, 'inference_cost': 0.00011325, 'training_iteration': 0, 
'config': {'model': 'text-babbage-001', 'max_tokens': 130, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.26609522201207036}}, 'config/model': 'text-babbage-001', 'config/max_tokens': 130, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.26609522201207036}, 'experiment_tag': 'exp', 'time_total_s': 0.5786199569702148}\n", - "[flaml.tune.tune: 03-26 02:56:56] {811} INFO - trial 27 config: {'model': 'gpt-4', 'max_tokens': 212, 'n': 29, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24802150727233283}}\n", - "[flaml.tune.tune: 03-26 02:56:56] {215} INFO - result: {'expected_success': 0, 'total_cost': 3.0123088000000022, 'cost': 0.05279999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 212, 'n': 29, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24802150727233283}}, 'config/model': 'gpt-4', 'config/max_tokens': 212, 'config/n': 29, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.24802150727233283}, 'experiment_tag': 'exp', 'time_total_s': 0.0019483566284179688}\n", - "[flaml.tune.tune: 03-26 02:56:56] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" - ] - } - ], - "source": [ - "config, analysis = oai.Completion.tune(\n", - " data=tune_data, # the data for tuning\n", - " metric=\"expected_success\", # the metric to optimize\n", - " mode=\"max\", # the optimization mode\n", - " eval_func=success_metrics, # the evaluation function to return the success metrics\n", - " # log_file_name=\"logs/humaneval.log\", # the log file name\n", - " inference_budget=0.05, # the inference budget (dollar)\n", - " optimization_budget=3, # the optimization budget (dollar)\n", - " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", - " # -1 means decided by the optimization budget only\n", - " num_samples=-1,\n", - " prompt=[\n", - " \"{prompt}\",\n", - " \"# Python 3{prompt}\",\n", - " \"Complete the following Python function:{prompt}\",\n", - " \"Complete the following Python function while including necessary import statements inside the function:{prompt}\",\n", - " ], # the prompt templates to choose from\n", - " stop=[\"\\nclass\", \"\\ndef\", \"\\nif\", \"\\nprint\"], # the stop sequence\n", - ")\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output tuning results\n", - "\n", - "After the tuning, we can print out the config and the result found by FLAML:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:26:38.352710Z", - "iopub.status.busy": "2023-02-24T23:26:38.352378Z", - "iopub.status.idle": "2023-02-24T23:26:38.356939Z", - "shell.execute_reply": "2023-02-24T23:26:38.356217Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "optimized config {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': '# Python 3{prompt}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'temperature': 0.25447895557126815}\n", - "best result on tuning data {'expected_success': 0.8822303234803123, 'success': 0.9, 'total_cost': 2.1304238, 'cost': 1.0186199999999999, 'inference_cost': 0.0484995, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}, 'config/model': 'gpt-4', 
'config/max_tokens': 211, 'config/n': 13, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.25447895557126815}, 'experiment_tag': 'exp', 'time_total_s': 16.604310512542725}\n" - ] - } - ], - "source": [ - "print(\"optimized config\", config)\n", - "print(\"best result on tuning data\", analysis.best_result)\n", - "\n", - "# save results to notebook_output.txt\n", - "from flaml.version import __version__ as flaml_version\n", - "import datetime\n", - "results = {\"optimized config\": config, \"best result on tuning data\": analysis.best_result,}\n", - "result_info_dict = {\"result_name\": \"integrate_openai.ipynb + optimized config and best result on tuning data\",\n", - " \"flaml_version\": flaml_version, \n", - " \"time\": datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\"),\n", - " \"results\": results}\n", - "result_info = \"result name: {result_name}, flaml version: {flaml_version}, time: {time}, results: {results}\".format(**result_info_dict)\n", - "with open(\"notebook_output.txt\", \"a\") as f:\n", - " f.write(\"\\n\")\n", - " f.write(result_info)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Make a request with the tuned config\n", - "\n", - "We can apply the tuned config on the request for an example task:" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:26:38.359902Z", - "iopub.status.busy": "2023-02-24T23:26:38.359506Z", - "iopub.status.idle": "2023-02-24T23:26:39.343921Z", - "shell.execute_reply": "2023-02-24T23:26:39.343051Z" - }, - "slideshow": { - "slide_type": "subslide" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 1,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 2,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 3,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n return [abs(game[i] - guess[i]) for i in range(len(game))]\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 4,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 5,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n 
result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 6,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 7,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n return [abs(game[i] - guess[i]) for i in range(len(game))]\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 8,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n return [abs(a - b) for a, b in zip(game, guess)]\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 9,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 10,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 11,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 12,\n", - " \"message\": {\n", - " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " }\n", - " ],\n", - " \"created\": 1679636800,\n", - " \"id\": \"chatcmpl-6xUY4niTRrpJ5UShayb9QncgjS8rg\",\n", - " \"model\": \"gpt-4-0314\",\n", - " \"object\": \"chat.completion\",\n", - " \"usage\": {\n", - " \"completion_tokens\": 440,\n", - " \"prompt_tokens\": 236,\n", - " \"total_tokens\": 676\n", - " }\n", - "}\n", - "{'expected_success': 1.0, 'success': True}\n" - ] - } - ], - "source": [ - "responses = oai.Completion.create(context=tune_data[1], **config)\n", - "metric_results = success_metrics([response[\"message\"][\"content\"] if config[\"model\"] in oai.Completion.chat_models else response[\"text\"] for response in responses[\"choices\"]], **tune_data[1])\n", - "print(\"response on an example data instance:\", responses)\n", - "print(\"metric_results on the example data instance:\", metric_results)\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluate the success rate on the test data\n", - "\n", - "You can use flaml's `oai.Completion.test` to evaluate the performance of an entire dataset with the tuned config. The following code will take a while to evaluate all the 144 test data instances. The cost is about $7 if you uncomment it and run it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-24T23:26:39.347295Z", - "iopub.status.busy": "2023-02-24T23:26:39.346994Z", - "iopub.status.idle": "2023-02-24T23:29:27.160335Z", - "shell.execute_reply": "2023-02-24T23:29:27.159519Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'expected_success': 0.8326778348739547, 'success': 0.8472222222222222, 'total_cost': 10.024478799999999, 'cost': 7.01217, 'inference_cost': 0.049131249999999994}\n" - ] - } - ], - "source": [ - "result = oai.Completion.test(test_data, config, success_metrics)\n", - "print(\"performance on test data with the tuned config:\", result)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result will vary with the inference budget and optimization budget.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "tutorial", - "language": "python", - "name": "tutorial" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "vscode": { - "interpreter": { - "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" - } - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "24dd93300e0442788ee6cc1310e5bf14": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "35cd066a31b242bb87b2c106ee72e5f2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_8e7ee7687a99410d88a98a74ecfcea99", - "IPY_MODEL_421e02a11a974b40b3ddb75382b3b640", - "IPY_MODEL_77db9797e78b49438d21c5c8da34b4cb" - ], - "layout": "IPY_MODEL_47d3046236a54b0e8f9ae455a82c7e0b", - "tabbable": null, - "tooltip": null - } - }, - "3d5d106a38954af2bb3bde5777702f4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "3e1ebb31412443b0bca86a301cbdac11": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "421e02a11a974b40b3ddb75382b3b640": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_e6398d4027c9459a97965b9d91ae484f", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3e1ebb31412443b0bca86a301cbdac11", - "tabbable": null, - "tooltip": null, - "value": 1 - } - }, - "47d3046236a54b0e8f9ae455a82c7e0b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "754800f7feb04acea977696e4787d1ff": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "77db9797e78b49438d21c5c8da34b4cb": { - 
"model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_7b6c4e1c11e249409a1edcd63be450d8", - "placeholder": "​", - "style": "IPY_MODEL_3d5d106a38954af2bb3bde5777702f4e", - "tabbable": null, - "tooltip": null, - "value": " 1/1 [00:00<00:00, 44.40it/s]" - } - }, - "7b6c4e1c11e249409a1edcd63be450d8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8e7ee7687a99410d88a98a74ecfcea99": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_754800f7feb04acea977696e4787d1ff", - "placeholder": "​", - "style": "IPY_MODEL_24dd93300e0442788ee6cc1310e5bf14", - "tabbable": null, - "tooltip": null, - "value": "100%" - } - }, - "e6398d4027c9459a97965b9d91ae484f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - 
"height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebook/research/autogen_code.ipynb b/notebook/research/autogen_code.ipynb new file mode 100644 index 0000000000..a796761eb2 --- /dev/null +++ b/notebook/research/autogen_code.ipynb @@ -0,0 +1,787 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License.\n", + "\n", + "# Use FLAML to Optimize Code Generation Performance\n", + "\n", + "In this notebook, we optimize OpenAI models for code generation. We use [the HumanEval benchmark](https://huggingface.co/datasets/openai_humaneval) released by OpenAI for synthesizing programs from docstrings. \n", + "\n", + "## Requirements\n", + "\n", + "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the [openai] option:\n", + "```bash\n", + "pip install flaml[openai]==1.2.0\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.910966Z", + "iopub.status.busy": "2023-02-24T23:25:36.910473Z", + "iopub.status.idle": "2023-02-24T23:25:36.914554Z", + "shell.execute_reply": "2023-02-24T23:25:36.914030Z" + } + }, + "outputs": [], + "source": [ + "# %pip install flaml[openai]==1.2.0 datasets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set your OpenAI key:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.917301Z", + "iopub.status.busy": "2023-02-24T23:25:36.917011Z", + "iopub.status.idle": "2023-02-24T23:25:36.923156Z", + "shell.execute_reply": "2023-02-24T23:25:36.922619Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you use Azure OpenAI, uncomment the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.925804Z", + "iopub.status.busy": "2023-02-24T23:25:36.925423Z", + "iopub.status.idle": "2023-02-24T23:25:36.928191Z", + "shell.execute_reply": "2023-02-24T23:25:36.927673Z" + } + }, + "outputs": [], + "source": [ + "# import openai\n", + "# openai.api_type = \"azure\"\n", + "# openai.api_base = \"https://.openai.azure.com/\"\n", + "# openai.api_version = \"2023-03-15-preview\" # change if necessary" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset\n", + "\n", + "First, we load the humaneval dataset. The dataset contains 164 examples. 
In each example, the \"prompt\" is the prompt string used to elicit the code generation (renamed to \"definition\"), \"test\" is the Python code of the unit test for the example, and \"entry_point\" is the name of the function to be tested." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-24T23:25:36.931255Z", + "iopub.status.busy": "2023-02-24T23:25:36.930838Z", + "iopub.status.idle": "2023-02-24T23:25:39.148799Z", + "shell.execute_reply": "2023-02-24T23:25:39.148113Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset openai_humaneval (/home/vscode/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1fdc8853bf2a4aecaa2cd024ad99b5a2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00=3.7`. To run this notebook example, please install flaml with the [openai] option:\n", + "```bash\n", + "pip install flaml[openai]==1.2.0\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.317406Z", + "iopub.status.busy": "2023-02-13T23:40:52.316561Z", + "iopub.status.idle": "2023-02-13T23:40:52.321193Z", + "shell.execute_reply": "2023-02-13T23:40:52.320628Z" + } + }, + "outputs": [], + "source": [ + "# %pip install flaml[openai]==1.2.0 datasets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set your OpenAI key:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.324240Z", + "iopub.status.busy": "2023-02-13T23:40:52.323783Z", + "iopub.status.idle": "2023-02-13T23:40:52.330570Z", + "shell.execute_reply": "2023-02-13T23:40:52.329750Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment the following to use Azure OpenAI:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.333547Z", + "iopub.status.busy": "2023-02-13T23:40:52.333249Z", + "iopub.status.idle": "2023-02-13T23:40:52.336508Z", + "shell.execute_reply": "2023-02-13T23:40:52.335858Z" + } + }, + "outputs": [], + "source": [ + "# import openai\n", + "# openai.api_type = \"azure\"\n", + "# openai.api_base = \"https://.openai.azure.com/\"\n", + "# openai.api_version = \"2023-03-15-preview\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset\n", + "\n", + "First, we load the competition_math dataset. We use a random sample of 50 examples for testing."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.339977Z", + "iopub.status.busy": "2023-02-13T23:40:52.339556Z", + "iopub.status.idle": "2023-02-13T23:40:54.603349Z", + "shell.execute_reply": "2023-02-13T23:40:54.602630Z" + } + }, + "outputs": [], + "source": [ + "import datasets\n", + "\n", + "seed = 41\n", + "data = datasets.load_dataset(\"competition_math\")\n", + "train_data = data[\"train\"].shuffle(seed=seed)\n", + "test_data = data[\"test\"].shuffle(seed=seed)\n", + "n_tune_data = 20\n", + "tune_data = [\n", + " {\n", + " \"problem\": train_data[x][\"problem\"],\n", + " \"solution\": train_data[x][\"solution\"],\n", + " }\n", + " for x in range(len(train_data)) if train_data[x][\"level\"] == \"Level 5\" and train_data[x][\"type\"] == \"Counting & Probability\"\n", + "][:n_tune_data]\n", + "test_data = [\n", + " {\n", + " \"problem\": test_data[x][\"problem\"],\n", + " \"solution\": test_data[x][\"solution\"],\n", + " }\n", + " for x in range(len(test_data)) if test_data[x][\"level\"] == \"Level 5\" and test_data[x][\"type\"] == \"Counting & Probability\"\n", + "]\n", + "print(len(tune_data), len(test_data))\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Check a tuning example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.607152Z", + "iopub.status.busy": "2023-02-13T23:40:54.606441Z", + "iopub.status.idle": "2023-02-13T23:40:54.610504Z", + "shell.execute_reply": "2023-02-13T23:40:54.609759Z" + }, + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(tune_data[1][\"problem\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is one example of the canonical solution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.613590Z", + "iopub.status.busy": "2023-02-13T23:40:54.613168Z", + "iopub.status.idle": "2023-02-13T23:40:54.616873Z", + "shell.execute_reply": "2023-02-13T23:40:54.616193Z" + } + }, + "outputs": [], + "source": [ + "print(tune_data[1][\"solution\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Success Metric\n", + "\n", + "For each math task, we generate multiple responses and use majority voting: we select the response whose final answer is the most common among all the generated responses. If that answer is equivalent to the answer of the canonical solution, we consider the task successfully solved. We can then optimize the mean success rate over a collection of tasks."
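+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a minimal, illustrative sketch of that voting step (using a hypothetical helper name, `majority_vote_success`; the actual metric is implemented by `eval_math_responses`, imported below):\n", + "\n", + "```python\n", + "from collections import Counter\n", + "\n", + "def majority_vote_success(answers, canonical_answer):\n", + "    # count how often each extracted final answer appears among the responses\n", + "    voted_answer, votes = Counter(answers).most_common(1)[0]\n", + "    # the task counts as solved if the most-voted answer matches the canonical one\n", + "    return int(voted_answer == canonical_answer), votes\n", + "```"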
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.626998Z", + "iopub.status.busy": "2023-02-13T23:40:54.626593Z", + "iopub.status.idle": "2023-02-13T23:40:54.631383Z", + "shell.execute_reply": "2023-02-13T23:40:54.630770Z" + } + }, + "outputs": [], + "source": [ + "from flaml.autogen.math_utils import eval_math_responses" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Import the oai subpackage from flaml\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.634335Z", + "iopub.status.busy": "2023-02-13T23:40:54.633929Z", + "iopub.status.idle": "2023-02-13T23:40:56.105700Z", + "shell.execute_reply": "2023-02-13T23:40:56.105085Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "from flaml import oai" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.109177Z", + "iopub.status.busy": "2023-02-13T23:40:56.108624Z", + "iopub.status.idle": "2023-02-13T23:40:56.112651Z", + "shell.execute_reply": "2023-02-13T23:40:56.112076Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "oai.ChatCompletion.set_cache(seed)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. Caches for different seeds are stored separately." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.115383Z", + "iopub.status.busy": "2023-02-13T23:40:56.114975Z", + "iopub.status.idle": "2023-02-13T23:41:55.045654Z", + "shell.execute_reply": "2023-02-13T23:41:55.044973Z" + } + }, + "outputs": [], + "source": [ + "# the prompt template; {problem} is replaced by each task's problem statement\n", + "prompt = \"{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\\\boxed{{}}.\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate the success rate on the test data\n", + "\n", + "You can use flaml's `oai.ChatCompletion.test` to evaluate the performance of a given config on an entire dataset."
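+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "During testing, each task is passed as `context`, and its fields are substituted into the prompt template. A rough, illustrative sketch of that substitution for a single task (this mirrors, but is not, the library's internal code):\n", + "\n", + "```python\n", + "# hypothetical illustration: fill the {problem} placeholder for one task\n", + "example = test_data[0]\n", + "print(prompt.format(problem=example[\"problem\"]))\n", + "```"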
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "config_n1 = {\"model\": 'gpt-4', \"prompt\": prompt, \"max_tokens\": 600, \"n\": 1}\n", + "n1_result = oai.ChatCompletion.test(test_data[:50], config_n1, eval_math_responses)\n", + "print(n1_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# allow a longer timeout per request, since n=10 responses are generated per task\n", + "oai.ChatCompletion.request_timeout = 120\n", + "config_n10 = {\"model\": 'gpt-4', \"prompt\": prompt, \"max_tokens\": 600, \"n\": 10}\n", + "n10_result = oai.ChatCompletion.test(test_data[:50], config_n10, eval_math_responses, logging_level=logging.INFO)\n", + "print(n10_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_n30 = {\"model\": 'gpt-4', \"prompt\": prompt, \"max_tokens\": 600, \"n\": 30}\n", + "n30_result = oai.ChatCompletion.test(test_data[:50], config_n30, eval_math_responses, logging_level=logging.INFO)\n", + "print(n30_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "import matplotlib.pyplot as plt\n", + "\n", + "prompts = [\"{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\\\boxed{{}}.\"]\n", + "markers = [\"o\", \"s\", \"D\", \"v\", \"p\", \"h\", \"d\", \"P\", \"X\", \"H\", \"8\", \"4\", \"3\", \"2\", \"1\", \"x\", \"+\", \">\", \"<\", \"^\", \"v\", \"1\", \"2\", \"3\", \"4\", \"8\", \"s\", \"p\", \"*\", \"h\", \"H\", \"d\", \"D\", \"|\", \"_\"]\n", + "for j, n in enumerate([10, 30]):\n", + " config = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"max_tokens\": 600, \"n\": n}\n", + " metrics = []\n", + " x, y = [], []\n", + " votes_success = defaultdict(lambda: [0, 0])\n", + " for i, data_i in enumerate(test_data[:50]):\n", + " response = oai.ChatCompletion.create(context=data_i, **config)\n", + " responses = oai.ChatCompletion.extract_text(response)\n", + " metrics.append(eval_math_responses(responses, **data_i))\n", + " votes = metrics[-1][\"votes\"]\n", + " success = metrics[-1][\"success_vote\"]\n", + " votes_success[votes][0] += 1\n", + " votes_success[votes][1] += success\n", + " for votes in votes_success:\n", + " x.append(votes)\n", + " y.append(votes_success[votes][1] / votes_success[votes][0])\n", + "\n", + " plt.scatter(x, y, marker=markers[j])\n", + " plt.xlabel(\"top vote\")\n", + " plt.ylabel(\"success rate\")\n", + "plt.legend([\"n=10\", \"n=30\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "2d910cfd2d2a4fc49fc30fbbdc5576a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "454146d0f7224f038689031002906e6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", + "IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", + "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" + ], + "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", + "tabbable": null, + "tooltip": null + } + }, + "577e1e3cc4db4942b0883577b3b52755": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", + "tabbable": null, + "tooltip": null, + "value": 1 + } + }, + "6086462a12d54bafa59d3c4566f06cb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74a6ba0c3cbc4051be0a83e152fe1e62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d3f3d9e15894d05a4d188ff4f466554": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "b40bdfb1ac1d4cffb7cefcb870c64d45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", + "placeholder": "​", + "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", + "tabbable": null, + "tooltip": null, + "value": " 1/1 [00:00<00:00, 44.69it/s]" + } + }, + "ca245376fd9f4354af6b2befe4af4466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "dc83c7bff2f241309537a8119dfc7555": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": 
null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4ae2b6f5a974fd4bafb6abb9d12ff26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", + "placeholder": "​", + "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", + "tabbable": null, + "tooltip": null, + "value": "100%" + } + }, + "f1355871cc6f4dd4b50d9df5af20e5c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index b576486606..60e6e8de20 100644 --- a/setup.py +++ b/setup.py @@ -120,7 +120,7 @@ "pytorch-forecasting>=0.9.0", ], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], - "openai": ["openai==0.27.0", "diskcache", "optuna==2.8.0"], + "openai": ["openai==0.27.4", "diskcache", "optuna==2.8.0"], "synapse": ["joblibspark>=0.5.0", "optuna==2.8.0", "pyspark>=3.2.0"], }, classifiers=[ diff --git a/test/openai/test_completion.py b/test/openai/test_completion.py index d20f0f63c9..1d04ab4435 100644 --- a/test/openai/test_completion.py +++ b/test/openai/test_completion.py @@ -1,10 +1,15 @@ import datasets -import signal -import subprocess import sys import numpy as np import pytest +from functools import partial from flaml import oai +from flaml.autogen.code_utils import ( + eval_function_completions, + generate_assertions, + implement, +) +from flaml.autogen.math_utils import eval_math_responses @pytest.mark.skipif( @@ -12,58 +17,16 @@ reason="do not run on windows", ) def test_humaneval(num_samples=1): - def timeout_handler(signum, frame): - raise TimeoutError("Timed out!") - - signal.signal(signal.SIGALRM, timeout_handler) - max_exec_time = 3 # seconds - - def execute_code(code): - code = code.strip() - with open("codetest.py", "w") as fout: - fout.write(code) - try: - 
signal.alarm(max_exec_time) - result = subprocess.run( - [sys.executable, "codetest.py"], - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - ) - signal.alarm(0) - except TimeoutError: - return 0 - return int(result.returncode == 0) - - def success_metrics(responses, prompt, test, entry_point): - """Check if the response is correct. - - Args: - responses (list): The list of responses. - prompt (str): The input prompt. - test (str): The test code. - entry_point (str): The name of the function. - - Returns: - dict: The success metrics. - """ - success_list = [] - n = len(responses) - for i in range(n): - response = responses[i] - code = f"{prompt}{response}\n{test}\ncheck({entry_point})" - succeed = execute_code(code) - success_list.append(succeed) - return { - "expected_success": 1 - pow(1 - np.mean(success_list), n), - "success": any(s for s in success_list), - } + eval_with_generated_assertions = partial( + eval_function_completions, assertions=generate_assertions + ) seed = 41 data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed) n_tune_data = 20 tune_data = [ { - "prompt": data[x]["prompt"], + "definition": data[x]["prompt"], "test": data[x]["test"], "entry_point": data[x]["entry_point"], } @@ -71,7 +34,7 @@ def success_metrics(responses, prompt, test, entry_point): ] test_data = [ { - "prompt": data[x]["prompt"], + "definition": data[x]["prompt"], "test": data[x]["test"], "entry_point": data[x]["entry_point"], } @@ -79,335 +42,80 @@ def success_metrics(responses, prompt, test, entry_point): ] oai.Completion.set_cache(seed) try: - # a minimal tuning example - config, _ = oai.Completion.tune( - data=tune_data, - metric="success", - mode="max", - eval_func=success_metrics, - n=1, - ) - responses = oai.Completion.create(context=test_data[0], **config) - # a minimal tuning example for tuning chat completion models using the Completion class - config, _ = oai.Completion.tune( - data=tune_data, - metric="success", - mode="max", - eval_func=success_metrics, - n=1, - model="gpt-3.5-turbo", - ) - responses = oai.Completion.create(context=test_data[0], **config) - # a minimal tuning example for tuning chat completion models using the Completion class - config, _ = oai.ChatCompletion.tune( - data=tune_data, - metric="success", - mode="max", - eval_func=success_metrics, - n=1, - messages=[{"role": "user", "content": "{prompt}"}], - ) - responses = oai.ChatCompletion.create(context=test_data[0], **config) - print(responses) - # a more comprehensive tuning example - config, analysis = oai.Completion.tune( - data=tune_data, - metric="expected_success", - mode="max", - eval_func=success_metrics, - log_file_name="logs/humaneval.log", - inference_budget=0.002, - optimization_budget=2, - num_samples=num_samples, - prompt=[ - "{prompt}", - "# Python 3{prompt}", - "Complete the following Python function:{prompt}", - "Complete the following Python function while including necessary import statements inside the function:{prompt}", - ], - stop=["\nclass", "\ndef", "\nif", "\nprint"], - ) - print(config) - print(analysis.best_result) - print(test_data[0]) - responses = oai.Completion.create(context=test_data[0], **config) - print(responses) - oai.Completion.data = test_data[:num_samples] - result = oai.Completion._eval(analysis.best_config, prune=False, eval_only=True) - print("result without pruning", result) - result = oai.Completion.test(test_data[:num_samples], config=config) - print(result) + import openai + import diskcache except ImportError as exc: print(exc) + return + # a 
minimal tuning example + config, _ = oai.Completion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=eval_function_completions, + n=1, + prompt="{definition}", + ) + responses = oai.Completion.create(context=test_data[0], **config) + # a minimal tuning example for tuning chat completion models using the Completion class + config, _ = oai.Completion.tune( + data=tune_data, + metric="succeed_assertions", + mode="max", + eval_func=eval_with_generated_assertions, + n=1, + model="gpt-3.5-turbo", + prompt="{definition}", + ) + responses = oai.Completion.create(context=test_data[0], **config) + # a minimal tuning example for tuning chat completion models using the Completion class + config, _ = oai.ChatCompletion.tune( + data=tune_data, + metric="expected_success", + mode="max", + eval_func=eval_function_completions, + n=1, + messages=[{"role": "user", "content": "{definition}"}], + ) + responses = oai.ChatCompletion.create(context=test_data[0], **config) + print(responses) + code, cost, _ = implement(tune_data[1], [config]) + print(code) + print(cost) + print(eval_function_completions([code], **tune_data[1])) + # a more comprehensive tuning example + config2, analysis = oai.Completion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=eval_with_generated_assertions, + log_file_name="logs/humaneval.log", + inference_budget=0.002, + optimization_budget=2, + num_samples=num_samples, + prompt=[ + "{definition}", + "# Python 3{definition}", + "Complete the following Python function:{definition}", + ], + stop=[["\nclass", "\ndef", "\nif", "\nprint"], None], # the stop sequences + ) + print(config2) + print(analysis.best_result) + print(test_data[0]) + responses = oai.Completion.create(context=test_data[0], **config2) + print(responses) + oai.Completion.data = test_data[:num_samples] + result = oai.Completion._eval(analysis.best_config, prune=False, eval_only=True) + print("result without pruning", result) + result = oai.Completion.test(test_data[:num_samples], config=config2) + print(result) + code, cost, selected = implement(tune_data[1], [config2, config]) + print(selected) + print(eval_function_completions([code], **tune_data[1])) def test_math(num_samples=-1): - from typing import Optional - - def remove_boxed(string: str) -> Optional[str]: - """Source: https://github.com/hendrycks/math - Extract the text within a \\boxed{...} environment. - Example: - >>> remove_boxed(\\boxed{\\frac{2}{3}}) - \\frac{2}{3} - """ - left = "\\boxed{" - try: - assert string[: len(left)] == left - assert string[-1] == "}" - return string[len(left) : -1] - except Exception: - return None - - def last_boxed_only_string(string: str) -> Optional[str]: - """Source: https://github.com/hendrycks/math - Extract the last \\boxed{...} or \\fbox{...} element from a string. - """ - idx = string.rfind("\\boxed") - if idx < 0: - idx = string.rfind("\\fbox") - if idx < 0: - return None - - i = idx - right_brace_idx = None - num_left_braces_open = 0 - while i < len(string): - if string[i] == "{": - num_left_braces_open += 1 - if string[i] == "}": - num_left_braces_open -= 1 - if num_left_braces_open == 0: - right_brace_idx = i - break - i += 1 - - if right_brace_idx is None: - retval = None - else: - retval = string[idx : right_brace_idx + 1] - - return retval - - def _fix_fracs(string: str) -> str: - """Source: https://github.com/hendrycks/math - Reformat fractions. 
- Examples: - >>> _fix_fracs("\\frac1b") - \frac{1}{b} - >>> _fix_fracs("\\frac12") - \frac{1}{2} - >>> _fix_fracs("\\frac1{72}") - \frac{1}{72} - """ - substrs = string.split("\\frac") - new_str = substrs[0] - if len(substrs) > 1: - substrs = substrs[1:] - for substr in substrs: - new_str += "\\frac" - if substr[0] == "{": - new_str += substr - else: - try: - assert len(substr) >= 2 - except Exception: - return string - a = substr[0] - b = substr[1] - if b != "{": - if len(substr) > 2: - post_substr = substr[2:] - new_str += "{" + a + "}{" + b + "}" + post_substr - else: - new_str += "{" + a + "}{" + b + "}" - else: - if len(substr) > 2: - post_substr = substr[2:] - new_str += "{" + a + "}" + b + post_substr - else: - new_str += "{" + a + "}" + b - string = new_str - return string - - def _fix_a_slash_b(string: str) -> str: - """Source: https://github.com/hendrycks/math - Reformat fractions formatted as a/b to \\frac{a}{b}. - Example: - >>> _fix_a_slash_b("2/3") - \frac{2}{3} - """ - if len(string.split("/")) != 2: - return string - a_str = string.split("/")[0] - b_str = string.split("/")[1] - try: - a = int(a_str) - b = int(b_str) - assert string == "{}/{}".format(a, b) - new_string = "\\frac{" + str(a) + "}{" + str(b) + "}" - return new_string - except Exception: - return string - - def _remove_right_units(string: str) -> str: - """Source: https://github.com/hendrycks/math""" - if "\\text{ " in string: - splits = string.split("\\text{ ") - assert len(splits) == 2 - return splits[0] - else: - return string - - def _fix_sqrt(string: str) -> str: - """Source: https://github.com/hendrycks/math""" - if "\\sqrt" not in string: - return string - splits = string.split("\\sqrt") - new_string = splits[0] - for split in splits[1:]: - if split[0] != "{": - a = split[0] - new_substr = "\\sqrt{" + a + "}" + split[1:] - else: - new_substr = "\\sqrt" + split - new_string += new_substr - return new_string - - def _strip_string(string: str) -> str: - """Source: https://github.com/hendrycks/math - Apply the reformatting helper functions above. - """ - # linebreaks - string = string.replace("\n", "") - # print(string) - - # remove inverse spaces - string = string.replace("\\!", "") - # print(string) - - # replace \\ with \ - string = string.replace("\\\\", "\\") - # print(string) - - # replace tfrac and dfrac with frac - string = string.replace("tfrac", "frac") - string = string.replace("dfrac", "frac") - # print(string) - - # remove \left and \right - string = string.replace("\\left", "") - string = string.replace("\\right", "") - # print(string) - - # Remove circ (degrees) - string = string.replace("^{\\circ}", "") - string = string.replace("^\\circ", "") - - # remove dollar signs - string = string.replace("\\$", "") - - # remove units (on the right) - string = _remove_right_units(string) - - # remove percentage - string = string.replace("\\%", "") - string = string.replace(r"\%", "") - - # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string - string = string.replace(" .", " 0.") - string = string.replace("{.", "{0.") - # if empty, return empty string - if len(string) == 0: - return string - if string[0] == ".": - string = "0" + string - - # to consider: get rid of e.g. 
"k = " or "q = " at beginning - if len(string.split("=")) == 2: - if len(string.split("=")[0]) <= 2: - string = string.split("=")[1] - - # fix sqrt3 --> sqrt{3} - string = _fix_sqrt(string) - - # remove spaces - string = string.replace(" ", "") - - # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. - # Even works with \frac1{72} (but not \frac{72}1). - # Also does a/b --> \\frac{a}{b} - string = _fix_fracs(string) - - # manually change 0.5 --> \frac{1}{2} - if string == "0.5": - string = "\\frac{1}{2}" - - # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y - string = _fix_a_slash_b(string) - - return string - - def get_answer(solution: Optional[str]) -> Optional[str]: - if solution is None: - return None - last_boxed = last_boxed_only_string(solution) - if last_boxed is None: - return None - answer = remove_boxed(last_boxed) - if answer is None: - return None - return answer - - def is_equiv(str1: Optional[str], str2: Optional[str]) -> float: - """Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in - - units - - fractions - - square roots - - superfluous LaTeX. - Source: https://github.com/hendrycks/math - """ - if str1 is None and str2 is None: - print("WARNING: Both None") - return 1.0 - if str1 is None or str2 is None: - return 0.0 - - try: - ss1 = _strip_string(str1) - ss2 = _strip_string(str2) - return float(ss1 == ss2) - except Exception: - return float(str1 == str2) - - def is_equiv_chain_of_thought(str1: str, str2: str) -> float: - """Strips the solution first before calling `is_equiv`.""" - ans1 = get_answer(str1) - ans2 = get_answer(str2) - - return is_equiv(ans1, ans2) - - def success_metrics(responses, solution, **args): - """Check if each response is correct. - - Args: - responses (list): The list of responses. - solution (str): The canonical solution. - - Returns: - dict: The success metrics. - """ - success_list = [] - n = len(responses) - for i in range(n): - response = responses[i] - succeed = is_equiv_chain_of_thought(response, solution) - success_list.append(succeed) - return { - "expected_success": 1 - pow(1 - sum(success_list) / n, n), - "success": any(s for s in success_list), - } - seed = 41 data = datasets.load_dataset("competition_math") train_data = data["train"].shuffle(seed=seed) @@ -436,78 +144,87 @@ def success_metrics(responses, solution, **args): print(len(tune_data), len(test_data)) # prompt template prompts = [ - lambda data: "Given a mathematics problem, determine the answer. Simplify your answer as much as possible.\n###\nProblem: What is the value of $\\sqrt{3! \\cdot 3!}$ expressed as a positive integer?\nAnswer: $\\sqrt{3!\\cdot3!}$ is equal to $\\sqrt{(3!)^2}=3!=3\\cdot2\\cdot1=\\boxed{6}$.\n###\nProblem: %s\nAnswer:" - + data["problem"] + lambda data: "%s Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\boxed{}." 
+ % data["problem"] ] try: - oai.ChatCompletion.set_cache(seed) - vanilla_config = { - "model": "gpt-3.5-turbo", - "temperature": 1, - "max_tokens": 2048, - "n": 1, - "prompt": prompts[0], - "stop": "###", - } - test_data_sample = test_data[0:3] - result = oai.ChatCompletion.test( - test_data_sample, vanilla_config, success_metrics - ) - test_data_sample = test_data[3:6] - result = oai.ChatCompletion.test( - test_data_sample, - vanilla_config, - success_metrics, - use_cache=False, - agg_method="median", - ) - - def my_median(results): - return np.median(results) + import openai + import diskcache + except ImportError as exc: + print(exc) + return + + oai.ChatCompletion.set_cache(seed) + vanilla_config = { + "model": "gpt-3.5-turbo", + "temperature": 1, + "max_tokens": 2048, + "n": 1, + "prompt": prompts[0], + "stop": "###", + } + test_data_sample = test_data[0:3] + result = oai.ChatCompletion.test( + test_data_sample, vanilla_config, eval_math_responses + ) + test_data_sample = test_data[3:6] + result = oai.ChatCompletion.test( + test_data_sample, + vanilla_config, + eval_math_responses, + use_cache=False, + agg_method="median", + ) - def my_average(results): - return np.mean(results) + def my_median(results): + return np.median(results) - result = oai.ChatCompletion.test( - test_data_sample, - vanilla_config, - success_metrics, - use_cache=False, - agg_method=my_median, - ) - result = oai.ChatCompletion.test( - test_data_sample, - vanilla_config, - success_metrics, - use_cache=False, - agg_method={"expected_success": my_median, "success": my_average}, - ) + def my_average(results): + return np.mean(results) - print(result) + result = oai.ChatCompletion.test( + test_data_sample, + vanilla_config, + eval_math_responses, + use_cache=False, + agg_method=my_median, + ) + result = oai.ChatCompletion.test( + test_data_sample, + vanilla_config, + eval_math_responses, + use_cache=False, + agg_method={ + "expected_success": my_median, + "success": my_average, + "success_vote": my_average, + "votes": np.mean, + }, + ) - config, _ = oai.ChatCompletion.tune( - data=tune_data, # the data for tuning - metric="expected_success", # the metric to optimize - mode="max", # the optimization mode - eval_func=success_metrics, # the evaluation function to return the success metrics - # log_file_name="logs/math.log", # the log file name - inference_budget=0.002, # the inference budget (dollar) - optimization_budget=0.01, # the optimization budget (dollar) - num_samples=num_samples, - prompt=prompts, # the prompt templates to choose from - stop="###", # the stop sequence - ) - print("tuned config", config) - result = oai.ChatCompletion.test(test_data_sample, config) - print("result from tuned config:", result) - except (ImportError, NameError) as exc: - print(exc) + print(result) + + config, _ = oai.ChatCompletion.tune( + data=tune_data, # the data for tuning + metric="expected_success", # the metric to optimize + mode="max", # the optimization mode + eval_func=eval_math_responses, # the evaluation function to return the success metrics + # log_file_name="logs/math.log", # the log file name + inference_budget=0.002, # the inference budget (dollar) + optimization_budget=0.01, # the optimization budget (dollar) + num_samples=num_samples, + prompt=prompts, # the prompt templates to choose from + stop="###", # the stop sequence + ) + print("tuned config", config) + result = oai.ChatCompletion.test(test_data_sample, config) + print("result from tuned config:", result) if __name__ == "__main__": import openai 
    openai.api_key_path = "test/openai/key.txt"
-    test_humaneval(-1)
-    test_math(-1)
+    test_humaneval(1)
+    # test_math(1)
diff --git a/test/openai/test_notebook.py b/test/openai/test_notebook.py
index fa68001675..a759543838 100644
--- a/test/openai/test_notebook.py
+++ b/test/openai/test_notebook.py
@@ -45,18 +45,18 @@ def run_notebook(input_nb, output_nb="executed_openai_notebook.ipynb", save=Fals
     skip,
     reason="do not run openai test if openai is not installed",
 )
-def test_integrate_openai(save=False):
-    run_notebook("integrate_openai.ipynb", save=save)
+def test_autogen_openai(save=False):
+    run_notebook("autogen_openai.ipynb", save=save)


 @pytest.mark.skipif(
     skip,
     reason="do not run openai test if openai is not installed",
 )
-def test_integrate_chatgpt(save=False):
-    run_notebook("integrate_chatgpt.ipynb", save=save)
+def test_autogen_chatgpt(save=False):
+    run_notebook("autogen_chatgpt.ipynb", save=save)


 if __name__ == "__main__":
-    test_integrate_chatgpt(save=True)
-    test_integrate_openai(save=True)
+    test_autogen_chatgpt(save=True)
+    test_autogen_openai(save=True)
diff --git a/website/docs/Examples/Integrate - OpenAI.md b/website/docs/Examples/AutoGen-OpenAI.md
similarity index 56%
rename from website/docs/Examples/Integrate - OpenAI.md
rename to website/docs/Examples/AutoGen-OpenAI.md
index a3f83c9792..19e35f992a 100644
--- a/website/docs/Examples/Integrate - OpenAI.md
+++ b/website/docs/Examples/AutoGen-OpenAI.md
@@ -1,9 +1,11 @@
-FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of the OpenAI API.
+# AutoGen - OpenAI
+
+FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve their utility.
 In this example, we will tune several hyperparameters for OpenAI's completion API, including the temperature, prompt and n (number of completions), to optimize the inference performance for a code generation task.

 ### Prerequisites

-Install the [openai] option. The OpenAI integration is in preview. ChaptGPT support is available since version 1.2.0.
+Install the [openai] option. The OpenAI integration is in preview.
 ```bash
 pip install "flaml[openai]==1.2.0"
 ```
@@ -19,9 +21,11 @@ if "OPENAI_API_KEY" not in os.environ:
 If you use Azure OpenAI, set up Azure using the following code:

 ```python
+import openai
+
 openai.api_type = "azure"
 openai.api_base = "https://<your_endpoint>.openai.azure.com/"
-openai.api_version = "2022-12-01"  # change if necessary
+openai.api_version = "2023-03-15-preview"  # change if necessary
 ```

 ### Load the dataset
@@ -36,7 +40,7 @@ data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed)
 n_tune_data = 20
 tune_data = [
     {
-        "prompt": data[x]["prompt"],
+        "definition": data[x]["prompt"],
         "test": data[x]["test"],
         "entry_point": data[x]["entry_point"],
     }
@@ -44,7 +48,7 @@ tune_data = [
 ]
 test_data = [
     {
-        "prompt": data[x]["prompt"],
+        "definition": data[x]["prompt"],
         "test": data[x]["test"],
         "entry_point": data[x]["entry_point"],
     }
@@ -54,72 +58,17 @@ test_data = [

 ### Defining the metric

-Before starting tuning, you need to define the metric for the optimization. For the HumanEval dataset, we use the success rate as the metric. So if one of the returned responses can pass the test, we consider the task as successfully solved. Then we can define the mean success rate of a collection of tasks.
-
-#### Define a code executor
-
-First, we write a simple code executor. The code executor takes the generated code and the test code as the input, and executes them with a timer.
+Before starting tuning, you need to define the metric for the optimization. For each code generation task, we can use the model to generate multiple candidate responses, and then select one from them. If the final selected response can pass a unit test, we consider the task as successfully solved. Then we can define the average success rate on a collection of tasks as the optimization metric.

 ```python
-import signal
-import subprocess
-import sys
-
-def timeout_handler(signum, frame):
-    raise TimeoutError("Timed out!")
-
-signal.signal(signal.SIGALRM, timeout_handler)
-max_exec_time = 3  # seconds
-
-def execute_code(code):
-    code = code.strip()
-    with open("codetest.py", "w") as fout:
-        fout.write(code)
-    try:
-        signal.alarm(max_exec_time)
-        result = subprocess.run(
-            [sys.executable, "codetest.py"],
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.PIPE,
-        )
-        signal.alarm(0)
-    except TimeoutError:
-        return 0
-    return int(result.returncode == 0)
-```
-
-This function will create a temp file "codetest.py" and execute it in a separate process. It allows for 3 seconds to finish that code.
+from functools import partial
+from flaml.autogen.code_utils import eval_function_completions, generate_assertions

-#### Define a function to evaluate the success for a given program synthesis task
-
-Now we define the success metric.
-
-```python
-def success_metrics(responses, prompt, test, entry_point):
-    """Check if the task is successful.
-
-    Args:
-        responses (list): The list of responses.
-        prompt (str): The input prompt.
-        test (str): The test code.
-        entry_point (str): The name of the function.
-
-    Returns:
-        dict: The success metrics.
-    """
-    success_list = []
-    n = len(responses)
-    for i in range(n):
-        response = responses[i]
-        code = f"{prompt}{response}\n{test}\ncheck({entry_point})"
-        succeed = execute_code(code)
-        success_list.append(succeed)
-    return {
-        "expected_success": 1 - pow(1 - sum(success_list) / n, n),
-        "success": any(s for s in success_list),
-    }
+eval_with_generated_assertions = partial(eval_function_completions, assertions=generate_assertions)
 ```

+This function will first generate assertion statements for each problem. Then, it uses the assertions to select among the generated responses.
+
 ### Tuning Hyperparameters for OpenAI

 The tuning will be performed under the specified optimization budgets.
@@ -131,24 +80,25 @@
 Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces, etc.
 ```python
+from flaml import oai
+
 config, analysis = oai.Completion.tune(
     data=tune_data,  # the data for tuning
-    metric="expected_success",  # the metric to optimize
+    metric="success",  # the metric to optimize
     mode="max",  # the optimization mode
-    eval_func=success_metrics,  # the evaluation function to return the success metrics
+    eval_func=eval_with_generated_assertions,  # the evaluation function to return the success metrics
     # log_file_name="logs/humaneval.log",  # the log file name
-    inference_budget=0.1,  # the inference budget (dollar)
-    optimization_budget=4,  # the optimization budget (dollar)
+    inference_budget=0.05,  # the inference budget (dollar per instance)
+    optimization_budget=3,  # the optimization budget (dollar in total)
     # num_samples can further limit the number of trials for different hyperparameter configurations;
     # -1 means decided by the optimization budget only
     num_samples=-1,
     prompt=[
-        "{prompt}",
-        "# Python 3{prompt}",
-        "Complete the following Python function:{prompt}",
-        "Complete the following Python function while including necessary import statements inside the function:{prompt}",
+        "{definition}",
+        "# Python 3{definition}",
+        "Complete the following Python function:{definition}",
     ],  # the prompt templates to choose from
-    stop=["\nclass", "\ndef", "\nif", "\nprint"],  # the stop sequence
+    stop=[["\nclass", "\ndef", "\nif", "\nprint"], None],  # the stop sequences
 )
 ```

@@ -168,7 +118,7 @@ We can apply the tuned config to the request for an instance:

 ```python
 responses = oai.Completion.create(context=tune_data[1], **config)
 print(responses)
-print(success_metrics([response["text"].rstrip() for response in responses["choices"]], **tune_data[1]))
+print(eval_with_generated_assertions(oai.Completion.extract_text(responses), **tune_data[1]))
 ```

 #### Evaluate the success rate on the test data
@@ -177,9 +127,9 @@ You can use flaml's `oai.Completion.test` to evaluate the performance of an enti
 ```python
 result = oai.Completion.test(test_data, config)
-print(result)
+print("performance on test data with the tuned config:", result)
 ```

 The result will vary with the inference budget and optimization budget.

-[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_openai.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_openai.ipynb)
+[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/autogen_openai.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/autogen_openai.ipynb)
diff --git a/website/docs/Getting-Started.md b/website/docs/Getting-Started.md
index c2d498ae9f..3fe35ca1ca 100644
--- a/website/docs/Getting-Started.md
+++ b/website/docs/Getting-Started.md
@@ -7,10 +7,8 @@ learning models automatically, efficiently and economically. It frees users from

 ### Main Features

-1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large language models such as the OpenAI GPT-3 models.
-
+1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as the GPT series.
 2. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). Users can customize only when and what they need to, and leave the rest to the library.
-
 3. It supports fast and economical automatic tuning, capable of handling large search spaces with heterogeneous evaluation cost and complex constraints/guidance/early stopping. FLAML is powered by a new, [cost-effective hyperparameter optimization](Use-Cases/Tune-User-Defined-Function#hyperparameter-optimization-algorithm)
 and model selection method invented by Microsoft Research, and many followup [research studies](Research).
@@ -88,6 +86,26 @@ from flaml.default import LGBMClassifier

 Then, you can use it just like you use the original `LGBMClassifier`. Your other code can remain unchanged. When you call the `fit()` function from `flaml.default.LGBMClassifier`, it will automatically instantiate a good data-dependent hyperparameter configuration for your dataset, which is expected to work better than the default configuration.

+#### (New) [Auto Generation](Use-Cases/Auto-Generation)
+
+You can optimize generations by ChatGPT or GPT-4 etc. with your own tuning data, success metrics and budgets.
+
+```python
+from flaml import oai
+
+config, analysis = oai.Completion.tune(
+    data=tune_data,
+    metric="success",
+    mode="max",
+    eval_func=eval_func,
+    inference_budget=0.05,
+    optimization_budget=3,
+    num_samples=-1,
+)
+```
+
+The optimization can help you maximize the utility of these expensive models.
+
 ### Where to Go Next?

 * Understand the use cases for [Task-oriented AutoML](Use-Cases/task-oriented-automl), [Tune user-defined function](Use-Cases/Tune-User-Defined-Function) and [Zero-shot AutoML](Use-Cases/Zero-Shot-AutoML).
diff --git a/website/docs/Use-Cases/Auto-Generation.md b/website/docs/Use-Cases/Auto-Generation.md
new file mode 100644
index 0000000000..3158ed7909
--- /dev/null
+++ b/website/docs/Use-Cases/Auto-Generation.md
@@ -0,0 +1,117 @@
+# Auto Generation
+
+`flaml.autogen` is a subpackage for automating generation tasks. It uses [`flaml.tune`](../reference/tune/tune) to find good hyperparameter configurations under budget constraints.
+Such optimization has several benefits:
+* Maximize the utility of expensive foundation models.
+* Reduce the inference cost by using cheaper models or configurations that achieve equal or better performance.
+
+## Choices to Optimize
+
+The cost of using foundation models for text generation is typically measured in terms of the number of tokens in the input and output combined. From the perspective of an application builder using foundation models, the use case is to maximize the utility of the generated text under an inference budget constraint (e.g., measured by the average dollar cost needed to solve a coding problem). This can be achieved by optimizing the hyperparameters of the inference,
+which can significantly affect both the utility and the cost of the generated text.
+
+The tunable hyperparameters include:
+1. model - this is a required input, specifying the model ID to use.
+1. prompt - the input prompt to the model, which provides the context for the text generation task.
+1. max_tokens - the maximum number of tokens (words or word pieces) to generate in the output.
+1. temperature - a value between 0 and 1 that controls the randomness of the generated text. A higher temperature will result in more random and diverse text, while a lower temperature will result in more predictable text.
+1. top_p - a value between 0 and 1 that controls the sampling probability mass for each token generation. A lower top_p value will make it more likely to generate text based on the most likely tokens, while a higher value will allow the model to explore a wider range of possible tokens.
+1. n - the number of responses to generate for a given prompt. Generating multiple responses can provide more diverse and potentially more useful output, but it also increases the cost of the request.
+1. stop - a list of strings that, when encountered in the generated text, will cause the generation to stop. This can be used to control the length or the validity of the output.
+1. presence_penalty, frequency_penalty - values that control the relative importance of the presence and frequency of certain words or phrases in the generated text.
+1. best_of - the number of responses to generate server-side when selecting the "best" (the one with the highest log probability per token) response for a given prompt.
+
+The cost and utility of text generation are intertwined with the joint effect of these hyperparameters.
+There are also complex interactions among subsets of the hyperparameters. For example,
+altering both temperature and top_p from their default values is not recommended, because they both control the randomness of the generated text and changing the two together can produce conflicting effects; n and best_of are rarely tuned together, because if the application can process multiple outputs, filtering on the server side causes unnecessary information loss; and both n and max_tokens affect the total number of tokens generated, which in turn affects the cost of the request.
+These interactions and trade-offs make it difficult to manually determine the optimal hyperparameter settings for a given text generation task.
+
+## Tune Hyperparameters
+
+The tuning can be performed with the following information:
+1. Validation data.
+1. Evaluation function.
+1. Metric to optimize.
+1. Search space.
+1. Budgets: inference and optimization respectively.
+
+### Validation data
+
+Collect a diverse set of instances. They can be stored in an iterable of dicts. For example, each instance dict can contain "problem" as a key and the description str of a math problem as the value; and "solution" as a key and the solution str as the value.
+
+### Evaluation function
+
+The evaluation function should take a list of responses, plus other keyword arguments corresponding to the keys in each validation data instance, as input, and output a dict of metrics. For example,
+
+```python
+from typing import Dict, List
+
+def success_metrics(responses: List[str], problem: str, solution: str) -> Dict:
+    # select a response from the list of responses
+    # check whether the answer is correct
+    return {"success": True or False}
+```
+
+`flaml.autogen` offers some example evaluation functions for common tasks such as code generation and math problem solving.
+
+### Metric to optimize
+
+The metric to optimize is usually an aggregated metric over all the tuning data instances. For example, users can specify "success" as the metric and "max" as the optimization mode. By default, the aggregation function takes the average. Users can provide a customized aggregation function if needed, as sketched below.
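+
+For example, a custom aggregation can be plugged into the `test` utility (used later on this page) via its `agg_method` argument. A minimal sketch, where `tune_data`, `config` and `eval_func` are assumed to be defined as in the surrounding examples:
+
+```python
+import numpy as np
+
+from flaml import oai
+
+
+# a hypothetical custom aggregator: summarize a metric by its median
+def my_median(results):
+    return np.median(results)
+
+
+# `agg_method` accepts a single function or a dict mapping metric names
+# to aggregators; `tune_data`, `config` and `eval_func` are assumptions
+result = oai.ChatCompletion.test(
+    tune_data,
+    config,
+    eval_func,
+    agg_method={"success": my_median},
+)
+print(result)
+```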
+
+### Search space
+
+Users can specify the (optional) search range for each hyperparameter.
+
+1. model. Either a constant str, or multiple choices specified by `flaml.tune.choice`.
+1. prompt. Either a str or a list of strs, of the prompt templates.
+Each prompt template will be formatted with each data instance. For example, the prompt template can be:
+"{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\boxed{{}}."
+And `{problem}` will be replaced by the "problem" field of each data instance.
+1. max_tokens, n, best_of. They can be constants, or specified by `flaml.tune.randint`, `flaml.tune.qrandint`, `flaml.tune.lograndint` or `flaml.tune.qlograndint`. By default, max_tokens is searched in [50, 1000); n is searched in [1, 100); and best_of is fixed to 1.
+1. stop. It can be a str or a list of strs, or a list of lists of strs or None. Default is None.
+1. temperature or top_p. One of them can be specified as a constant or by `flaml.tune.uniform` or `flaml.tune.loguniform` etc. Please don't provide both. By default, each configuration will choose either a temperature or a top_p in [0, 1] uniformly.
+1. presence_penalty, frequency_penalty. They can be constants or specified by `flaml.tune.uniform` etc. Not tuned by default.
+
+### Budgets
+
+One can specify an inference budget and an optimization budget.
+The inference budget refers to the average inference cost per data instance.
+The optimization budget refers to the total budget allowed in the tuning process. Both are measured in dollars, based on the price per 1000 tokens.
+
+### Perform tuning
+
+Now, you can use [`flaml.oai.Completion.tune`](../reference/autogen/oai/completion#tune) for tuning. For example,
+
+```python
+from flaml import oai
+
+config, analysis = oai.Completion.tune(
+    data=tune_data,
+    metric="success",
+    mode="max",
+    eval_func=eval_func,
+    inference_budget=0.05,
+    optimization_budget=3,
+    num_samples=-1,
+)
+```
+
+`num_samples` is the number of configurations to sample. -1 means unlimited (until the optimization budget is exhausted).
+The returned `config` contains the optimized configuration, and `analysis` contains an [ExperimentAnalysis](../reference/tune/analysis#experimentanalysis-objects) object for all the tried configurations and results.
+
+### Perform inference with the tuned config
+
+One can use [`flaml.oai.Completion.create`](../reference/autogen/oai/completion#create) to perform inference. It materializes a prompt using a given context. For example,
+
+```python
+response = oai.Completion.create(problem=problem, **config)
+# extract a list of str responses
+responses = oai.Completion.extract_text(response)
+```
+
+`flaml.oai.Completion` is compatible with both `openai.Completion` and `openai.ChatCompletion`. So models such as "text-davinci-003", "gpt-3.5-turbo" and "gpt-4" can share a common API. When only tuning the chat-based models, `flaml.oai.ChatCompletion` can be used.
+
+`flaml.oai.Completion` also offers some additional utilities, including a `test` function to conveniently evaluate the configuration over test data (see the sketch at the end of this page), a `cost` function to calculate the cost of an API call, and caching and error handling. It also supports both the OpenAI API and the Azure OpenAI API.
+
+Interested in trying it yourself? Please check the following notebook examples:
+* [Optimize for Code Gen](https://github.com/microsoft/FLAML/blob/main/notebook/autogen_openai.ipynb)
+* [Optimize for Math](https://github.com/microsoft/FLAML/blob/main/notebook/autogen_chatgpt.ipynb)
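+
+Finally, a minimal end-to-end sketch of evaluating a tuned config on held-out data with the `test` utility mentioned above; `test_data` and `eval_func` are assumed to be prepared in the same format as the tuning data and evaluation function described earlier:
+
+```python
+from flaml import oai
+
+# `config` is the tuned configuration returned by `oai.Completion.tune`;
+# `test_data` and `eval_func` are assumptions following the formats above
+result = oai.Completion.test(test_data, config, eval_func)
+print("performance on test data with the tuned config:", result)
+```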