Working on autometrics

deeppavlov · Nov 8, 2024 · df82bf2 · df82bf2
1 parent 25da331
commit df82bf2
Show file tree

Hide file tree

Showing 8 changed files with 541 additions and 122 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -39,7 +39,11 @@ from chatsky_llm_autoconfig.autometrics.registry import AlgorithmRegistry
 class DialogueSampler(DialogueGenerator)
 ```
 2. Make sure that `input_type` and `output_type` match the signature of the `.invoke()` method
-
+3. Run
+```bash
+poetry run python dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/autometrics/run_autometrics.py
+```
+4. If the metrics did not drop compared to the previous run, everything is fine: add, commit and push as usual.
 
 ## How to Contribute
 1. Make your changes and test hypothesis in the `./experiments` folder as it is described in **Conducting experiments** section

diff --git a/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/algorithms/base.py b/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/algorithms/base.py
@@ -73,8 +73,3 @@ def __init__(self):
 
     def invoke(self, dialogue: Dialogue = None, graph: BaseGraph = None, topic: str = "") -> BaseGraph:
         raise NotImplementedError
-
-
-class CycleGraphGenerator(GraphGenerator):
-    def invoke(self, dialogue: Dialogue = None, graph: BaseGraph = None, topic: str = "") -> BaseGraph:
-
diff --git a/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/algorithms/dialogue_generation.py b/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/algorithms/dialogue_generation.py
@@ -5,7 +5,7 @@
 from chatsky_llm_autoconfig.autometrics.registry import AlgorithmRegistry
 
 
-@AlgorithmRegistry.register(input_type=BaseGraph, output_type=list[Dialogue])
+@AlgorithmRegistry.register(input_type=BaseGraph, output_type=Dialogue)
 class DialogueSampler(DialogueGenerator):
 
     def invoke(self, graph: BaseGraph, start_node: int = 1, end_node: int = -1, topic="") -> list[Dialogue]:

diff --git a/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/autometrics/results/results.json b/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/autometrics/results/results.json
@@ -1,24 +1,66 @@
 {
-  "2024-11-06 01:03:16.054569": {
+  "2024-11-08 08:36:09.361006": {
     "DialogueSampler": {
-      "all_paths_sampled": true,
-      "all_utterances_present": true
+      "all_paths_sampled": [
+        true,
+        true,
+        true
+      ],
+      "all_utterances_present": [
+        true,
+        false,
+        false
+      ],
+      "all_paths_sampled_avg": 1.0,
+      "all_utterances_present_avg": 0.3333333333333333
     }
   },
-  "2024-11-06 12:50:44.905469": {
+  "2024-11-08 08:46:12.796705": {
     "DialogueSampler": {
-      "all_paths_sampled": true,
-      "all_utterances_present": true
+      "all_paths_sampled": [
+        true,
+        true,
+        true
+      ],
+      "all_utterances_present": [
+        true,
+        true,
+        false
+      ],
+      "all_paths_sampled_avg": 1.0,
+      "all_utterances_present_avg": 0.6666666666666666
     }
   },
-  "2024-11-06 14:24:21.022803": {
+  "2024-11-08 08:59:00.654385": {
     "DialogueSampler": {
-      "all_paths_sampled": true,
-      "all_utterances_present": true
-    },
-    "DialogueSampler2": {
-      "all_paths_sampled": true,
-      "all_utterances_present": true
+      "all_paths_sampled": [
+        true,
+        true,
+        true
+      ],
+      "all_utterances_present": [
+        true,
+        true,
+        false
+      ],
+      "all_paths_sampled_avg": 1.0,
+      "all_utterances_present_avg": 0.6666666666666666
+    }
+  },
+  "2024-11-08 09:00:40.185875": {
+    "DialogueSampler": {
+      "all_paths_sampled": [
+        true,
+        true,
+        true
+      ],
+      "all_utterances_present": [
+        true,
+        true,
+        false
+      ],
+      "all_paths_sampled_avg": 1.0,
+      "all_utterances_present_avg": 0.6666666666666666
     }
   }
 }
diff --git a/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/autometrics/run_autometrics.py b/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/autometrics/run_autometrics.py
@@ -1,5 +1,8 @@
 from chatsky_llm_autoconfig.autometrics.registry import AlgorithmRegistry
-import chatsky_llm_autoconfig.sample_dialogue
+import chatsky_llm_autoconfig.algorithms.dialogue_generation
+import chatsky_llm_autoconfig.algorithms.dialogue_augmentation
+import chatsky_llm_autoconfig.algorithms.graph_generation
+
 import json
 from chatsky_llm_autoconfig.graph import Graph, BaseGraph
 from chatsky_llm_autoconfig.dialogue import Dialogue
@@ -9,11 +12,8 @@
 
 with open("dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/autometrics/test_data/data.json") as f:
     test_data = json.load(f)
-    test_dialogue = test_data["dialogue"]
-    test_dialogue = Dialogue(dialogue=test_dialogue)
-    test_graph = test_data["graph"]
-    test_graph = Graph(graph_dict=test_graph)
-
+    graph_to_dialogue = test_data["graph_to_dialogue"]
+    dialogue_to_dialogue = test_data["dialogue_to_dialogue"]
 
 def run_all_algorithms():
     # Get all registered classes
@@ -23,28 +23,86 @@ def run_all_algorithms():
     for class_ in algorithms:
         class_instance = algorithms[class_]["type"]()
         metrics = {}
-        if algorithms[class_]["input_type"] is BaseGraph:
-            result = class_instance.invoke(test_graph)
+
+        if algorithms[class_]["input_type"] is BaseGraph and algorithms[class_]["output_type"] is Dialogue:
             metrics = {
-                "all_paths_sampled": all_paths_sampled(test_graph, result[0]),
-                "all_utterances_present": all_utterances_present(test_graph, result),
+                    "all_paths_sampled": [],
+                    "all_utterances_present": [],
+                }
+            for case in graph_to_dialogue:
+                test_dialogue = Dialogue(dialogue=case["dialogue"])
+                test_graph = Graph(graph_dict=case["graph"])
+                result = class_instance.invoke(test_graph)
+
+                metrics["all_paths_sampled"].append(all_paths_sampled(test_graph, result[0]))
+                metrics["all_utterances_present"].append(all_utterances_present(test_graph, result))
+
+            metrics["all_paths_sampled_avg"] = sum(metrics["all_paths_sampled"])/len(metrics["all_paths_sampled"])
+            metrics["all_utterances_present_avg"] = sum(metrics["all_utterances_present"])/len(metrics["all_utterances_present"])
+
+        elif algorithms[class_]["input_type"] is Dialogue and algorithms[class_]["output_type"] is Dialogue:
+            metrics = {
+                "all_roles_correct": [],
+                "is_correct_lenght": [],
             }
+            for case in dialogue_to_dialogue:
+                test_dialogue = Dialogue(dialogue=case["dialogue"])
+                result = class_instance.invoke(test_dialogue)
 
-        elif algorithms[class_]["input_type"] is list[Dialogue]:
+                metrics["all_roles_correct"].append(all_roles_correct(test_dialogue, result))
+                metrics["is_correct_lenght"].append(is_correct_lenght(test_dialogue, result))
+
+            metrics["all_roles_correct_avg"] = sum(metrics["all_roles_correct"])/len(metrics["all_roles_correct"])
+            metrics["is_correct_lenght_avg"] = sum(metrics["is_correct_lenght"])/len(metrics["is_correct_lenght"])
+
+        elif algorithms[class_]["input_type"] is str and algorithms[class_]["output_type"] is BaseGraph:
             result = class_instance.invoke(test_dialogue)
+            metrics = {
+
+            }
 
         total_metrics[class_] = metrics
 
     return total_metrics
 
+def compare_results(date, old_data):
+    """Compare the metrics stored under ``date`` with the run recorded just before it.
+
+    Args:
+        date: Key of the run to inspect inside ``old_data``.
+        old_data: Mapping of run key -> {algorithm name -> metrics dict}, in
+            chronological (insertion) order.
+
+    Returns:
+        A dict mapping every algorithm present in both runs to its per-metric
+        differences (current minus previous) plus a ``total_diff`` entry.
+        A metric missing from a run counts as 0. An empty dict is returned
+        when ``date`` is not in ``old_data`` or has no preceding run.
+    """
+    # (output key, stored average key); the "lenght" spelling matches the keys
+    # that are actually written to the results, so it must stay as is.
+    metric_keys = (
+        ("all_paths_sampled_diff", "all_paths_sampled_avg"),
+        ("all_utterances_present_diff", "all_utterances_present_avg"),
+        ("all_roles_correct_diff", "all_roles_correct_avg"),
+        ("is_correct_length_diff", "is_correct_lenght_avg"),
+    )
+
+    if date not in old_data:
+        return {}
+
+    dates = list(old_data)
+    position = dates.index(date)
+    if position == 0:
+        # First recorded run: there is nothing to compare against. Indexing
+        # with [-2] here used to raise IndexError.
+        return {}
+
+    current_metrics = old_data[date]
+    previous_metrics = old_data[dates[position - 1]]
+
+    differences = {}
+    for algorithm, metrics in current_metrics.items():
+        if algorithm not in previous_metrics:
+            continue
+        previous = previous_metrics[algorithm]
+
+        entry = {}
+        current_total = 0
+        previous_total = 0
+        for diff_key, avg_key in metric_keys:
+            current_value = metrics.get(avg_key, 0)
+            previous_value = previous.get(avg_key, 0)
+            entry[diff_key] = current_value - previous_value
+            # Accumulate left to right so the float result matches a plain
+            # chained addition of the four averages.
+            current_total += current_value
+            previous_total += previous_value
+        entry["total_diff"] = current_total - previous_total
+        differences[algorithm] = entry
+
+    return differences
+
 
 if __name__ == "__main__":
     with open("dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/autometrics/results/results.json") as f:
         old_data = json.load(f)
 
-    new_metrics = {str(datetime.datetime.now()): run_all_algorithms()}
-
+    date = str(datetime.datetime.now())
+    new_metrics = {date: run_all_algorithms()}
     old_data.update(new_metrics)
+    print(compare_results(date, old_data))
 
     with open("dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/autometrics/results/results.json", "w") as f:
         f.write(json.dumps(old_data, indent=2, ensure_ascii=False))