-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updates on existing solvers and bugged tool eval (#1506)
@JunShern will review this.
- Wrap solvers with completion functions for compatibility with pre-solver Evals. This means you can execute all evals using solvers. [49fd9ef](49fd9ef)
- Add context length information about gpt-4-turbo-preview and gpt-4-0125-preview. [9a0ab1c](9a0ab1c)
- Move oai and together solvers into providers / subdir. [063bf4f](063bf4f)
- Update the default task descriptions for bugged tools. We added more information when using gemini + OS models, since they got confused. [0523dd4](0523dd4)
- Modified the default solver chain-of-thought prompt, as well as other custom chain-of-thought prompts used in some evals. The default CoTSolver prompts were a bit misleading in some cases; we observed GeminiSolver working too hard to arrive at a final answer for the whole eval when it's in fact supposed to give just a response for the next turn. [287f3cf](287f3cf)
---------
Co-authored-by: johny-b <33967107+johny-b@users.noreply.github.com>
Co-authored-by: Chan Jun Shern <JunShern@users.noreply.github.com>
Co-authored-by: Giulio Starace <giulio.starace@gmail.com>
- Loading branch information
1 parent
d9d2f5f
commit 2420c62
Showing
39 changed files
with
533 additions
and
283 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
from typing import Any, Union | ||
|
||
from evals.api import CompletionFn, CompletionResult | ||
from evals.prompt.base import OpenAICreateChatPrompt | ||
from evals.solvers.nested.cot_solver import CoTSolver | ||
from evals.solvers.solver import Solver, SolverSpec, create_solver | ||
from evals.task_state import Message, TaskState | ||
|
||
|
||
class SolverCompletionFnResult(CompletionResult):
    """Completion result that carries the single output produced by a solver."""

    def __init__(self, msg):
        # The value is stored untouched; no conversion or validation happens here.
        self.msg = msg

    def get_completions(self):
        """Return the stored output wrapped in a one-element list."""
        completions = [self.msg]
        return completions
|
||
|
||
class SolverCompletionFn(CompletionFn):
    """
    Wraps a solver into a completion function, such that the completion function's
    __call__ method invokes the internal solver, mapping the input
    completion function `prompt` to the solver's `task_state` input.
    Useful for using Solvers with eval.Eval classes, which would normally require a CompletionFn.
    Current limitations:
    - Stateful solvers are not supported: Solver state is not maintained between
    calls.
    - Prompts with more than `role` and `content` keys are not supported.
    """

    def __init__(self, solver: Union[SolverSpec, Solver], registry: Any = None):
        """
        Store the solver to wrap, building it from a spec when necessary.

        Args:
            solver: Either an already constructed `Solver`, which is used as-is,
                or a `SolverSpec`, which is passed to `create_solver`.
            registry: Accepted for signature compatibility; not used by this
                constructor.
        """
        if isinstance(solver, Solver):
            self.solver = solver
        else:
            self.solver = create_solver(solver)

    def __call__(
        self, prompt: Union[str, OpenAICreateChatPrompt], **kwargs
    ) -> SolverCompletionFnResult:
        """
        Run the wrapped solver on `prompt` and return its output.

        A string prompt is treated as a single system message. For a list
        prompt, the first message must have the `system` role; its content
        becomes the task description and the remaining messages become the
        task state's message history.

        Args:
            prompt: A string, or a list of chat messages (dicts with `role`
                and `content`).
            **kwargs: Forwarded unchanged to the solver call.

        Returns:
            A `SolverCompletionFnResult` wrapping the `output` of the solver's
            result.

        Raises:
            ValueError: If the wrapped solver is a `CoTSolver` whose
                `interaction_cache` is set, if `prompt` is neither a string
                nor a list, if it is an empty list, if its first message is
                not a system message, or if its first message has keys other
                than exactly `role` and `content`.
        """
        # We have this check here rather than __init__ since the solver may be unwrapped and used in a SolverEval
        if isinstance(self.solver, CoTSolver) and self.solver.interaction_cache is not None:
            raise ValueError(
                "`CoTSolver` with persistent memory is incompatible with "
                "CompletionFn-based `Eval` classes. "
                "Please set `CoTSolver(persistent_memory=False)` or update the eval to a `SolverEval`."
            )

        # Validation raises explicitly instead of using `assert`, so that the
        # checks are still enforced when Python runs with assertions disabled (-O).
        if isinstance(prompt, str):
            messages = [{"role": "system", "content": prompt}]
        elif isinstance(prompt, list):
            if not prompt:
                # Without this guard, indexing the first message below would
                # fail with an uninformative IndexError.
                raise ValueError("Unexpected empty prompt: at least one message is required")
            if prompt[0]["role"] != "system":
                raise ValueError("Unexpected prompt role ordering")
            messages = prompt
        else:
            raise ValueError(
                "Unexpected prompt type: "
                f"string or OpenAICreateChatPrompt expected, got {type(prompt)}"
            )

        # Only the first message's keys are validated; later messages contribute
        # just their `role` and `content` entries to the task state.
        first_message_keys = set(messages[0].keys())
        if first_message_keys != {"role", "content"}:
            raise ValueError(
                "Unexpected keys in prompt: "
                f"expected exactly {{'role', 'content'}}, got {first_message_keys}"
            )

        task_state = TaskState(
            messages[0]["content"],
            [Message(msg["role"], msg["content"]) for msg in messages[1:]],
        )

        # use a copy to avoid task state surviving across samples
        pure_solver = self.solver.copy()

        result = pure_solver(task_state, **kwargs)
        return SolverCompletionFnResult(result.output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.