Skip to content

Commit

Permalink
update docs and examples
Browse files Browse the repository at this point in the history
  • Loading branch information
bugsz committed Dec 7, 2024
1 parent 6e5754b commit 0368a0c
Show file tree
Hide file tree
Showing 4 changed files with 230 additions and 80 deletions.
44 changes: 43 additions & 1 deletion docs/pages/concepts/evaluation_dimension.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ In original Sotopia paper, there are 7 dimensions to evaluate the quality of soc
- goal


However we observe under many use cases people may want to evaluate with customized evaluation metrics.
However, we observe that in many use cases people may want to evaluate with customized evaluation metrics, so we provide a way to build custom evaluation dimensions.
For a quick reference, you can directly check out the `examples/use_custom_dimensions.py`.

### CustomEvaluationDimension
The [`CustomEvaluationDimension`](/python_API/database/evaluation_dimensions) is a class that can be used to create a custom evaluation dimension.
Expand All @@ -32,6 +33,31 @@ The [`EvaluationDimensionBuilder`](/python_API/database/evaluation_dimensions) i


## Usage
### Initialize the database
The default evaluation metric is still `SotopiaDimensions` in `sotopia.env.evaluators`. There is no `CustomEvaluationDimension` in the database by default. To initialize the database, please refer to `examples/use_custom_dimensions.py`.

### Use the custom evaluation dimensions
After you initialize your customized evaluation dimensions, you can use any one of the methods provided below:

#### Method 1: Choose dimensions by names
```python
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_name(
["transactivity", "verbal_equity"]
)
)
```

#### Method 2: Directly choose the grouped evaluation dimension list
```python
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_list_name(
"sotopia"
)
)
```

#### Method 3: Build a custom evaluation dimension model temporarily
We provide multiple ways to build a custom evaluation dimension model with `EvaluationDimensionBuilder`, specifically:
- `generate_dimension_model`: build an evaluation dimension from existing dimension primary keys.
- `generate_dimension_model_from_dict`: build an evaluation dimension from a dictionary that specifies the parameters of the `CustomEvaluationDimension`. For example
Expand All @@ -48,3 +74,19 @@ We provide multiple ways to build a custom evaluation dimension model with `Eval
```
- `select_existing_dimension_model_by_name`: build an evaluation dimension from existing dimension names. For example `['believability', 'goal']`
- `select_existing_dimension_model_by_list_name`: build an evaluation dimension from existing `CustomEvaluationDimensionList` list names. For example, directly use `sotopia`.


After you get the evaluation dimension model, you can pass it as a parameter to the `Evaluator`, for example:
```python
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_list_name(
"sotopia"
)
)
terminal_evaluators=[
ReachGoalLLMEvaluator(
model_names["env"],
EvaluationForTwoAgents[evaluation_dimensions], # type: ignore
),
],
```
24 changes: 12 additions & 12 deletions examples/experiment_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
RuleBasedTerminatedEvaluator,
SotopiaDimensions,
)
from sotopia.envs.parallel import ParallelSotopiaEnv
from sotopia.generation_utils.generate import LLM_Name
Expand All @@ -34,6 +35,7 @@
)
from sotopia.server import run_async_server
from sotopia_conf.gin_utils import parse_gin_flags, run
# from sotopia.database import EvaluationDimensionBuilder

_DEFAULT_GIN_SEARCH_PATHS = [
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
Expand Down Expand Up @@ -108,21 +110,19 @@ def _iterate_env_agent_combo_not_in_db(
env_ids: list[str] = [],
tag: str | None = None,
) -> Generator[EnvAgentCombo[Observation, AgentAction], None, None]:
# method 1 for loading evaluation metric
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_name(
["transactivity", "verbal_equity"]
)
)

# method 2 for loading evaluation metric
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_list_name(
"""We iterate over each environment and return the **first** env-agent combo that is not in the database."""
# loading evaluation metric
try:
evaluation_dimensions = EvaluationDimensionBuilder.select_existing_dimension_model_by_list_name(
"sotopia"
) # Initialize your customized dimension, please refer to `examples/use_custom_dimensions.py`
except Exception as e:
print(
"No customized evaluation dimensions found, using default SotopiaDimensions",
e,
)
)
evaluation_dimensions = SotopiaDimensions

"""We iterate over each environment and return the **first** env-agent combo that is not in the database."""
if not env_ids:
env_ids = list(EnvironmentProfile.all_pks())
for env_id in env_ids:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,73 @@
)
from typing import Type, Union
from redis_om import Migrator
from sotopia.envs.evaluators import (
ReachGoalLLMEvaluator,
EvaluationForTwoAgents,
RuleBasedTerminatedEvaluator,
)
from sotopia.server import arun_one_episode
from typing import Optional, cast
from sotopia.envs import ParallelSotopiaEnv
from sotopia.agents import LLMAgent
from sotopia.database import AgentProfile, EnvironmentProfile
import asyncio


def save_dimensions(dimensions: list[dict[str, Union[str, int]]]) -> None:
    """Persist each evaluation dimension to the database.

    :param dimensions: dictionaries whose keys match the fields of
        `CustomEvaluationDimension` (e.g. name, description, range_high, range_low).

    A dimension is saved only if no `CustomEvaluationDimension` with the same
    name already exists; otherwise the existing record is printed instead.
    """
    # Rebuild Redis-OM indices so the `find` queries below see current data.
    Migrator().run()
    for spec in dimensions:
        # Look the name up once and reuse the result for both branches
        # (the original queried twice in the duplicate case).
        existing = CustomEvaluationDimension.find(
            CustomEvaluationDimension.name == spec["name"]
        ).all()
        if existing:
            print(existing[0], "already exists")
            continue
        print("No existing dimension found, creating a new one")
        CustomEvaluationDimension(**spec).save()
        print("Saved {}".format(spec["name"]))


def save_dimension_list(
    dimensions: list[dict[str, Union[str, int]]], list_name: str
) -> None:
    """Group already-saved dimensions into a named `CustomEvaluationDimensionList`.

    :param dimensions: dictionaries naming dimensions that must already be
        saved in the database (each name must match exactly one record).
    :param list_name: name under which to store the list; if a list with this
        name exists, nothing new is saved.
    """
    # Rebuild Redis-OM indices so the `find` queries below see current data.
    Migrator().run()
    existing_lists = CustomEvaluationDimensionList.find(
        CustomEvaluationDimensionList.name == list_name
    ).all()
    if existing_lists:
        print(existing_lists[0], "already exists")
        return

    collected_pks = []
    for spec in dimensions:
        matches = CustomEvaluationDimension.find(
            CustomEvaluationDimension.name == spec["name"]
        ).all()
        assert (
            len(matches) == 1
        ), f"Expected 1 dimension for {spec['name']}, but found {len(matches)}"
        collected_pks.append(matches[0].pk)
    CustomEvaluationDimensionList(
        name=list_name, dimension_pks=collected_pks
    ).save()
    print("Saved {}".format(list_name))


def generate_sotopia_dimensions() -> Type[BaseModel]:
# let's first insert some of the data
def build_sotopia_dimensions() -> Type[BaseModel]:
"""
Build the default Sotopia dimensions, the same as
`sotopia.env.evaluators.SotopiaDimensions`
"""
sotopia_dimensions: list[dict[str, Union[str, int]]] = [
{
"name": "believability",
Expand Down Expand Up @@ -55,7 +118,7 @@ def generate_sotopia_dimensions() -> Type[BaseModel]:
},
]

dimensions = EvaluationDimensionBuilder.generate_dimension_model_from_dict(
dimensions = EvaluationDimensionBuilder.build_dimension_model_from_dict(
dimensions=sotopia_dimensions
)
save_dimensions(sotopia_dimensions)
Expand All @@ -64,7 +127,88 @@ def generate_sotopia_dimensions() -> Type[BaseModel]:
return dimensions


def generate_custom_dimensions() -> Type[BaseModel]:
def build_custom_dimensions(
    custom_dimensions: list[dict[str, Union[str, int]]], list_name: Optional[str] = None
) -> Type[BaseModel]:
    """Build (and persist) a custom evaluation dimension model.

    : param custom_dimensions: a list of dictionaries that specify the parameters of the `CustomEvaluationDimension`.
    : param list_name: the name of the list to save the custom dimensions to. If None, no list will be saved.
    """
    # Build the pydantic model first, then persist the raw specs so they can
    # be reused by name/list later.
    dimension_model = EvaluationDimensionBuilder.build_dimension_model_from_dict(
        dimensions=custom_dimensions
    )
    save_dimensions(custom_dimensions)
    if list_name is not None:
        save_dimension_list(custom_dimensions, list_name=list_name)
    return dimension_model


def run_simple_sample_with_custom_samples(
    custom_dimensions: list[dict[str, Union[str, int]]],
) -> None:
    """Run a single sample episode evaluated with the given custom dimensions.

    :param custom_dimensions: dimension specs passed to `build_custom_dimensions`;
        they are saved under the list name "custom".

    Samples two agent profiles and one environment profile from the database,
    runs one episode with gpt-4o-mini, and prints the result (not pushed to DB).
    """
    dimension_model = build_custom_dimensions(
        custom_dimensions, list_name="custom"
    )
    turn_evaluator = RuleBasedTerminatedEvaluator(max_turn_number=10, max_stale_turn=2)
    goal_evaluator = ReachGoalLLMEvaluator(
        model_name="gpt-4o-mini",
        response_format_class=EvaluationForTwoAgents[dimension_model],  # type: ignore
    )

    sampled_agents: list[AgentProfile] = cast(
        list[AgentProfile], AgentProfile.find().page(0, 2)
    )  # type: ignore[attr-defined]
    sampled_envs: list[EnvironmentProfile] = cast(
        list[EnvironmentProfile], EnvironmentProfile.find().page(0, 1)
    )  # type: ignore[attr-defined]
    env: ParallelSotopiaEnv = ParallelSotopiaEnv(
        env_profile=sampled_envs[0],
        model_name="gpt-4o-mini",
        action_order="round-robin",
        evaluators=[turn_evaluator],
        terminal_evaluators=[goal_evaluator],
    )
    episode_agents: list[LLMAgent] = [
        LLMAgent(agent_profile=profile, model_name="gpt-4o-mini")
        for profile in sampled_agents[:2]
    ]

    episode_result = asyncio.run(
        arun_one_episode(
            env=env,
            agent_list=episode_agents,
            omniscient=False,
            script_like=False,
            tag=None,
            push_to_db=False,
        )
    )

    print(episode_result)


if __name__ == "__main__":
"""
A sample dimension:
custom_dimensions: list[dict[str, Union[str, int]]] = [
{
"name": "transactivity",
"description": "Analyze the provided social interaction episode between the given pair/team, focusing on identifying instances of transactive exchanges. Evaluate the level of transactivity by considering the following aspects: elaboration, building upon ideas, questioning, argumentation. Analyze whether these transactive patterns persist consistently across the entire interaction or if there are notable variations throughout the exchange. In the 'reasoning' field, provide a comprehensive account of the logic and thought process that led to your conclusion. Consider how the observed instances of transactivity contribute to or detract from the overall quality and depth of the interaction. In the 'score' field, provide an integer score ranging from 0 to 10, where a higher score indicates a higher level of transactivity.",
"range_high": 10,
"range_low": 0,
},
{
"name": "verbal_equity",
"description": "Analyze the script and measure the level of verbal equity reflected in the interaction between the agents. And then analyze the extent to which the interaction shows a balanced distribution of speaking opportunities among team members. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. A higher score indicates a higher level of verbal equity.",
"range_high": 10,
"range_low": 0,
},
]
"""

custom_dimensions: list[dict[str, Union[str, int]]] = [
{
"name": "transactivity",
Expand All @@ -80,64 +224,9 @@ def generate_custom_dimensions() -> Type[BaseModel]:
},
]

dimensions = EvaluationDimensionBuilder.generate_dimension_model_from_dict(
dimensions=custom_dimensions
)

save_dimensions(custom_dimensions)
save_dimension_list(custom_dimensions, "custom_example")

return dimensions

# Only build evaluation dimensions
build_sotopia_dimensions()
build_custom_dimensions(custom_dimensions=custom_dimensions, list_name="custom")

def save_dimensions(dimensions: list[dict[str, Union[str, int]]]) -> None:
    """Save each dimension spec to the database, skipping names that already exist.

    :param dimensions: dictionaries whose keys match the fields of
        `CustomEvaluationDimension`.
    """
    for dimension in dimensions:
        # Create the dimension only when no record with this name exists yet.
        if (
            len(
                CustomEvaluationDimension.find(
                    CustomEvaluationDimension.name == dimension["name"]
                ).all()
            )
            == 0
        ):
            print("No existing dimension found, creating a new one")
            CustomEvaluationDimension(**dimension).save()
            print("Saved {}".format(dimension["name"]))
        else:
            # NOTE(review): this repeats the same query to fetch the existing
            # record for display.
            print(
                CustomEvaluationDimension.find(
                    CustomEvaluationDimension.name == dimension["name"]
                ).all()[0],
                "already exists",
            )


def save_dimension_list(
    dimensions: list[dict[str, Union[str, int]]], list_name: str
) -> None:
    """Save a named `CustomEvaluationDimensionList` referencing existing dimensions.

    :param dimensions: specs whose names must each match exactly one
        already-saved `CustomEvaluationDimension`.
    :param list_name: name for the list; if it already exists, nothing is saved.
    """
    # Rebuild Redis-OM indices so the `find` queries below see current data.
    Migrator().run()
    dimension_list = CustomEvaluationDimensionList.find(
        CustomEvaluationDimensionList.name == list_name
    ).all()

    if len(dimension_list) == 0:
        # Resolve each dimension name to its primary key; each must be unique.
        all_dimensions_pks = []
        for dimension in dimensions:
            find_dimension = CustomEvaluationDimension.find(
                CustomEvaluationDimension.name == dimension["name"]
            ).all()
            assert (
                len(find_dimension) == 1
            ), f"Expected 1 dimension for {dimension['name']}, but found {len(find_dimension)}"
            all_dimensions_pks.append(find_dimension[0].pk)
        CustomEvaluationDimensionList(
            name=list_name, dimension_pks=all_dimensions_pks
        ).save()
        print("Saved {}".format(list_name))
    else:
        print(dimension_list[0], "already exists")


if __name__ == "__main__":
    # Insert the default Sotopia dimensions and the sample custom dimensions
    # into the database.
    generate_sotopia_dimensions()
    generate_custom_dimensions()
# Build and use evaluation dimensions
run_simple_sample_with_custom_samples(custom_dimensions=custom_dimensions)
Loading

0 comments on commit 0368a0c

Please sign in to comment.