Skip to content

Commit

Permalink
update docs and examples
Browse files Browse the repository at this point in the history
  • Loading branch information
bugsz committed Dec 7, 2024
1 parent 6e5754b commit 0368a0c
Show file tree
Hide file tree
Showing 4 changed files with 230 additions and 80 deletions.
44 changes: 43 additions & 1 deletion docs/pages/concepts/evaluation_dimension.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ In original Sotopia paper, there are 7 dimensions to evaluate the quality of soc
- goal


However we observe under many use cases people may want to evaluate with customized evaluation metrics.
However, we observe that in many use cases people may want to evaluate with customized evaluation metrics, so we provide a way to build custom evaluation dimensions.
For a quick reference, you can directly check out the `examples/use_custom_dimensions.py`.

### CustomEvaluationDimension
The [`CustomEvaluationDimension`](/python_API/database/evaluation_dimensions) is a class that can be used to create a custom evaluation dimension.
Expand All @@ -32,6 +33,31 @@ The [`EvaluationDimensionBuilder`](/python_API/database/evaluation_dimensions) i


## Usage
### Initialize the database
The default evaluation metric is still `SotopiaDimensions` in `sotopia.env.evaluators`. There is no `CustomEvaluationDimension` in the database by default. To initialize the database, please refer to `examples/use_custom_dimensions.py`.

### Use the custom evaluation dimensions
After you initialize your customized evaluation dimensions, you can use any one of the methods provided below:

#### Method 1: Choose dimensions by names
```python
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_name(
["transactivity", "verbal_equity"]
)
)
```

#### Method 2: Directly choose the grouped evaluation dimension list
```python
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_list_name(
"sotopia"
)
)
```

#### Method 3: Build a custom evaluation dimension model temporarily
We provide multiple ways to build a custom evaluation dimension model with `EvaluationDimensionBuilder`, specifically:
- `generate_dimension_model`: build an evaluation dimension from existing dimension primary keys.
- `generate_dimension_model_from_dict`: build an evaluation dimension from a dictionary that specifies the parameters of the `CustomEvaluationDimension`. For example
Expand All @@ -48,3 +74,19 @@ We provide multiple ways to build a custom evaluation dimension model with `Eval
```
- `select_existing_dimension_model_by_name`: build an evaluation dimension from existing dimension names. For example `['believability', 'goal']`
- `select_existing_dimension_model_by_list_name`: build an evaluation dimension from existing `CustomEvaluationDimensionList` list names. For example, directly use `sotopia`.


After you get the evaluation dimension model, you can pass it as a parameter to the `Evaluator`, for example:
```python
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_list_name(
"sotopia"
)
)
terminal_evaluators=[
ReachGoalLLMEvaluator(
model_names["env"],
EvaluationForTwoAgents[evaluation_dimensions], # type: ignore
),
],
```
24 changes: 12 additions & 12 deletions examples/experiment_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
RuleBasedTerminatedEvaluator,
SotopiaDimensions,
)
from sotopia.envs.parallel import ParallelSotopiaEnv
from sotopia.generation_utils.generate import LLM_Name
Expand All @@ -34,6 +35,7 @@
)
from sotopia.server import run_async_server
from sotopia_conf.gin_utils import parse_gin_flags, run
# from sotopia.database import EvaluationDimensionBuilder

_DEFAULT_GIN_SEARCH_PATHS = [
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
Expand Down Expand Up @@ -108,21 +110,19 @@ def _iterate_env_agent_combo_not_in_db(
env_ids: list[str] = [],
tag: str | None = None,
) -> Generator[EnvAgentCombo[Observation, AgentAction], None, None]:
# method 1 for loading evaluation metric
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_name(
["transactivity", "verbal_equity"]
)
)

# method 2 for loading evaluation metric
evaluation_dimensions = (
EvaluationDimensionBuilder.select_existing_dimension_model_by_list_name(
"""We iterate over each environment and return the **first** env-agent combo that is not in the database."""
# loading evaluation metric
try:
evaluation_dimensions = EvaluationDimensionBuilder.select_existing_dimension_model_by_list_name(
"sotopia"
) # Initialize your customized dimension, please refer to `examples/use_custom_dimensions.py`
except Exception as e:
print(
"No customized evaluation dimensions found, using default SotopiaDimensions",
e,
)
)
evaluation_dimensions = SotopiaDimensions

"""We iterate over each environment and return the **first** env-agent combo that is not in the database."""
if not env_ids:
env_ids = list(EnvironmentProfile.all_pks())
for env_id in env_ids:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,73 @@
)
from typing import Type, Union
from redis_om import Migrator
from sotopia.envs.evaluators import (
ReachGoalLLMEvaluator,
EvaluationForTwoAgents,
RuleBasedTerminatedEvaluator,
)
from sotopia.server import arun_one_episode
from typing import Optional, cast
from sotopia.envs import ParallelSotopiaEnv
from sotopia.agents import LLMAgent
from sotopia.database import AgentProfile, EnvironmentProfile
import asyncio


def save_dimensions(dimensions: list[dict[str, Union[str, int]]]) -> None:
    """Persist each evaluation dimension to the database.

    :param dimensions: dictionaries whose keys match the fields of
        `CustomEvaluationDimension` (e.g. name, description, range_high, range_low).

    A dimension is saved only if no `CustomEvaluationDimension` with the same
    name already exists; otherwise the existing record is printed instead.
    """
    # Rebuild Redis-OM indices so the `find` queries below see current data.
    Migrator().run()
    for spec in dimensions:
        # Look the name up once and reuse the result for both branches
        # (the original queried twice in the duplicate case).
        existing = CustomEvaluationDimension.find(
            CustomEvaluationDimension.name == spec["name"]
        ).all()
        if existing:
            print(existing[0], "already exists")
            continue
        print("No existing dimension found, creating a new one")
        CustomEvaluationDimension(**spec).save()
        print("Saved {}".format(spec["name"]))


def save_dimension_list(
    dimensions: list[dict[str, Union[str, int]]], list_name: str
) -> None:
    """Group already-saved dimensions into a named `CustomEvaluationDimensionList`.

    :param dimensions: dictionaries naming dimensions that must already be
        saved in the database (each name must match exactly one record).
    :param list_name: name under which to store the list; if a list with this
        name exists, nothing new is saved.
    """
    # Rebuild Redis-OM indices so the `find` queries below see current data.
    Migrator().run()
    existing_lists = CustomEvaluationDimensionList.find(
        CustomEvaluationDimensionList.name == list_name
    ).all()
    if existing_lists:
        print(existing_lists[0], "already exists")
        return

    collected_pks = []
    for spec in dimensions:
        matches = CustomEvaluationDimension.find(
            CustomEvaluationDimension.name == spec["name"]
        ).all()
        assert (
            len(matches) == 1
        ), f"Expected 1 dimension for {spec['name']}, but found {len(matches)}"
        collected_pks.append(matches[0].pk)
    CustomEvaluationDimensionList(
        name=list_name, dimension_pks=collected_pks
    ).save()
    print("Saved {}".format(list_name))


def generate_sotopia_dimensions() -> Type[BaseModel]:
# let's first insert some of the data
def build_sotopia_dimensions() -> Type[BaseModel]:
"""
Build the default Sotopia dimensions, the same as
`sotopia.env.evaluators.SotopiaDimensions`
"""
sotopia_dimensions: list[dict[str, Union[str, int]]] = [
{
"name": "believability",
Expand Down Expand Up @@ -55,7 +118,7 @@ def generate_sotopia_dimensions() -> Type[BaseModel]:
},
]

dimensions = EvaluationDimensionBuilder.generate_dimension_model_from_dict(
dimensions = EvaluationDimensionBuilder.build_dimension_model_from_dict(
dimensions=sotopia_dimensions
)
save_dimensions(sotopia_dimensions)
Expand All @@ -64,7 +127,88 @@ def generate_sotopia_dimensions() -> Type[BaseModel]:
return dimensions


def generate_custom_dimensions() -> Type[BaseModel]:
def build_custom_dimensions(
    custom_dimensions: list[dict[str, Union[str, int]]], list_name: Optional[str] = None
) -> Type[BaseModel]:
    """Build (and persist) a custom evaluation dimension model.

    : param custom_dimensions: a list of dictionaries that specify the parameters of the `CustomEvaluationDimension`.
    : param list_name: the name of the list to save the custom dimensions to. If None, no list will be saved.
    """
    # Build the pydantic model first, then persist the raw specs so they can
    # be reused by name/list later.
    dimension_model = EvaluationDimensionBuilder.build_dimension_model_from_dict(
        dimensions=custom_dimensions
    )
    save_dimensions(custom_dimensions)
    if list_name is not None:
        save_dimension_list(custom_dimensions, list_name=list_name)
    return dimension_model


def run_simple_sample_with_custom_samples(
    custom_dimensions: list[dict[str, Union[str, int]]],
) -> None:
    """Run a single sample episode evaluated with the given custom dimensions.

    :param custom_dimensions: dimension specs passed to `build_custom_dimensions`;
        they are saved under the list name "custom".

    Samples two agent profiles and one environment profile from the database,
    runs one episode with gpt-4o-mini, and prints the result (not pushed to DB).
    """
    dimension_model = build_custom_dimensions(
        custom_dimensions, list_name="custom"
    )
    turn_evaluator = RuleBasedTerminatedEvaluator(max_turn_number=10, max_stale_turn=2)
    goal_evaluator = ReachGoalLLMEvaluator(
        model_name="gpt-4o-mini",
        response_format_class=EvaluationForTwoAgents[dimension_model],  # type: ignore
    )

    sampled_agents: list[AgentProfile] = cast(
        list[AgentProfile], AgentProfile.find().page(0, 2)
    )  # type: ignore[attr-defined]
    sampled_envs: list[EnvironmentProfile] = cast(
        list[EnvironmentProfile], EnvironmentProfile.find().page(0, 1)
    )  # type: ignore[attr-defined]
    env: ParallelSotopiaEnv = ParallelSotopiaEnv(
        env_profile=sampled_envs[0],
        model_name="gpt-4o-mini",
        action_order="round-robin",
        evaluators=[turn_evaluator],
        terminal_evaluators=[goal_evaluator],
    )
    episode_agents: list[LLMAgent] = [
        LLMAgent(agent_profile=profile, model_name="gpt-4o-mini")
        for profile in sampled_agents[:2]
    ]

    episode_result = asyncio.run(
        arun_one_episode(
            env=env,
            agent_list=episode_agents,
            omniscient=False,
            script_like=False,
            tag=None,
            push_to_db=False,
        )
    )

    print(episode_result)


if __name__ == "__main__":
"""
A sample dimension:
custom_dimensions: list[dict[str, Union[str, int]]] = [
{
"name": "transactivity",
"description": "Analyze the provided social interaction episode between the given pair/team, focusing on identifying instances of transactive exchanges. Evaluate the level of transactivity by considering the following aspects: elaboration, building upon ideas, questioning, argumentation. Analyze whether these transactive patterns persist consistently across the entire interaction or if there are notable variations throughout the exchange. In the 'reasoning' field, provide a comprehensive account of the logic and thought process that led to your conclusion. Consider how the observed instances of transactivity contribute to or detract from the overall quality and depth of the interaction. In the 'score' field, provide an integer score ranging from 0 to 10, where a higher score indicates a higher level of transactivity.",
"range_high": 10,
"range_low": 0,
},
{
"name": "verbal_equity",
"description": "Analyze the script and measure the level of verbal equity reflected in the interaction between the agents. And then analyze the extent to which the interaction shows a balanced distribution of speaking opportunities among team members. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. A higher score indicates a higher level of verbal equity.",
"range_high": 10,
"range_low": 0,
},
]
"""

custom_dimensions: list[dict[str, Union[str, int]]] = [
{
"name": "transactivity",
Expand All @@ -80,64 +224,9 @@ def generate_custom_dimensions() -> Type[BaseModel]:
},
]

dimensions = EvaluationDimensionBuilder.generate_dimension_model_from_dict(
dimensions=custom_dimensions
)

save_dimensions(custom_dimensions)
save_dimension_list(custom_dimensions, "custom_example")

return dimensions

# Only build evaluation dimensions
build_sotopia_dimensions()
build_custom_dimensions(custom_dimensions=custom_dimensions, list_name="custom")

def save_dimensions(dimensions: list[dict[str, Union[str, int]]]) -> None:
    """Save each dimension spec to the database, skipping names that already exist.

    :param dimensions: dictionaries whose keys match the fields of
        `CustomEvaluationDimension`.
    """
    for dimension in dimensions:
        # Create the dimension only when no record with this name exists yet.
        if (
            len(
                CustomEvaluationDimension.find(
                    CustomEvaluationDimension.name == dimension["name"]
                ).all()
            )
            == 0
        ):
            print("No existing dimension found, creating a new one")
            CustomEvaluationDimension(**dimension).save()
            print("Saved {}".format(dimension["name"]))
        else:
            # NOTE(review): this repeats the same query to fetch the existing
            # record for display.
            print(
                CustomEvaluationDimension.find(
                    CustomEvaluationDimension.name == dimension["name"]
                ).all()[0],
                "already exists",
            )


def save_dimension_list(
    dimensions: list[dict[str, Union[str, int]]], list_name: str
) -> None:
    """Save a named `CustomEvaluationDimensionList` referencing existing dimensions.

    :param dimensions: specs whose names must each match exactly one
        already-saved `CustomEvaluationDimension`.
    :param list_name: name for the list; if it already exists, nothing is saved.
    """
    # Rebuild Redis-OM indices so the `find` queries below see current data.
    Migrator().run()
    dimension_list = CustomEvaluationDimensionList.find(
        CustomEvaluationDimensionList.name == list_name
    ).all()

    if len(dimension_list) == 0:
        # Resolve each dimension name to its primary key; each must be unique.
        all_dimensions_pks = []
        for dimension in dimensions:
            find_dimension = CustomEvaluationDimension.find(
                CustomEvaluationDimension.name == dimension["name"]
            ).all()
            assert (
                len(find_dimension) == 1
            ), f"Expected 1 dimension for {dimension['name']}, but found {len(find_dimension)}"
            all_dimensions_pks.append(find_dimension[0].pk)
        CustomEvaluationDimensionList(
            name=list_name, dimension_pks=all_dimensions_pks
        ).save()
        print("Saved {}".format(list_name))
    else:
        print(dimension_list[0], "already exists")


if __name__ == "__main__":
    # Insert the default Sotopia dimensions and the sample custom dimensions
    # into the database.
    generate_sotopia_dimensions()
    generate_custom_dimensions()
# Build and use evaluation dimensions
run_simple_sample_with_custom_samples(custom_dimensions=custom_dimensions)
Loading

0 comments on commit 0368a0c

Please sign in to comment.