From 4138b220ef1809db892d6d76ad724b7aeea74641 Mon Sep 17 00:00:00 2001 From: Kim Tran Date: Wed, 26 Jun 2024 22:59:38 -0400 Subject: [PATCH] Separate llm vs cli tests --- .github/workflows/test.yml | 14 +-- log10/cli/completions.py | 136 ++++++++++++++++++++++++++++- log10/completions/completions.py | 141 +------------------------------ tests/test_cli.py | 9 ++ 4 files changed, 152 insertions(+), 148 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 78a1940..036f606 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -63,7 +63,10 @@ jobs: - name: Install dependencies run: poetry install --all-extras - - name: Run dispatch tests + - name: Run cli tests + run: poetry run pytest -vv tests/test_cli.py + + - name: Run dispatch llm tests if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }} run: | echo "This is a dispatch event" @@ -113,16 +116,13 @@ jobs: if $empty_inputs; then echo "All variables are empty" - poetry run pytest -vv tests/ + poetry run pytest -vv tests/ --ignore=tests/test_cli.py poetry run pytest --llm_provider=anthropic -vv tests/test_magentic.py fi - - name: Run scheduled tests + - name: Run scheduled llm tests if: ${{ github.event_name == 'schedule' }} run: | echo "This is a schedule event" - poetry run pytest -vv tests/ + poetry run pytest -vv tests/ --ignore=tests/test_cli.py poetry run pytest --openai_model=gpt-4o -m chat -vv tests/test_openai.py - - - name: Test cli commands - run: poetry run pytest -vv tests/test_cli.py \ No newline at end of file diff --git a/log10/cli/completions.py b/log10/cli/completions.py index 9ac9db7..885447b 100644 --- a/log10/cli/completions.py +++ b/log10/cli/completions.py @@ -4,17 +4,16 @@ import pandas as pd import rich import tqdm +from rich.console import Console +from rich.table import Table -from log10._httpx_utils import _try_get +from log10._httpx_utils import _get_time_diff, _try_get from log10.cli_utils import generate_markdown_report, generate_results_table from log10.completions.completions import ( _check_model_support, _compare, - _create_dataframe_from_comparison_data, _get_completion, _get_completions_url, - _render_comparison_table, - _render_completions_table, _write_completions, ) from log10.llm import Log10Config @@ -24,6 +23,135 @@ _log10_config = Log10Config() +def _render_completions_table(completions_data, total_completions): + data_for_table = [] + for completion in completions_data: + prompt, response = "", "" + if completion.get("kind") == "completion": + prompt = completion.get("request", {}).get("prompt", "") + response_choices = completion.get("response", {}).get("choices", []) + if response_choices: + response = response_choices[0].get("text", "") + elif completion.get("kind") == "chat": + request_messages = completion.get("request", {}).get("messages", []) + prompt = request_messages[0].get("content", "") if request_messages else "" + + response_choices = completion.get("response", {}).get("choices", []) + if response_choices: + # Handle 'message' and 'function_call' within the first choice safely + first_choice = response_choices[0] + if "message" in first_choice: + message = first_choice["message"] + response = ( + message.get("content") + or message.get("tool_calls", [])[-1].get("function", {}).get("arguments", "") + if message.get("tool_calls") + else "" + ) + elif "function_call" in first_choice: + response = json.dumps(first_choice.get("function_call", {})) + else: + rich.print(f"Unknown completion kind: 
{completion['kind']} for id: {completion['id']}") + + data_for_table.append( + { + "id": completion["id"], + "status": "success" if completion["status"] == "finished" else completion["status"], + "created_at": _get_time_diff(completion["created_at"]), + "prompt": prompt, + "completion": response, + "tags": [t["name"] for t in completion["tagResolved"]], + } + ) + # render data_for_table with rich table + table = Table(show_header=True, header_style="bold magenta") + + table.add_column("ID", style="dim") + table.add_column("Status") + table.add_column("Created At") + table.add_column("Prompt", overflow="fold") + table.add_column("Completion", overflow="fold") + table.add_column("Tags", justify="right") + + max_len = 40 + for item in data_for_table: + tags = ", ".join(item["tags"]) if item["tags"] else "" + if isinstance(item["prompt"], list): + item["prompt"] = " ".join(item["prompt"]) + short_prompt = item["prompt"][:max_len] + "..." if len(item["prompt"]) > max_len else item["prompt"] + completion = item.get("completion", "") + short_completion = completion[:max_len] + "..." if len(completion) > max_len else completion + table.add_row(item["id"], item["status"], item["created_at"], short_prompt, short_completion, tags) + + console = Console() + console.print(table) + console.print(f"{total_completions=}") + + +def _render_comparison_table(model_response_raw_data): + rich.print(f"completion_id: {model_response_raw_data['completion_id']}") + rich.print("original_request:") + rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4)) + + table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True) + table.add_column("Model") + table.add_column("Content") + table.add_column("Total Token Usage (Input/Output)") + table.add_column("Duration (ms)") + + for model, data in model_response_raw_data.items(): + # only display model data + if model not in ["completion_id", "original_request"]: + usage = data["usage"] + formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})" + table.add_row(model, data["content"], formatted_usage, str(data["duration"])) + rich.print(table) + + +def _create_dataframe_from_comparison_data(model_response_raw_data): + completion_id = model_response_raw_data["completion_id"] + original_request = model_response_raw_data["original_request"] + rows = [] + for model, model_data in model_response_raw_data.items(): + # only display model data + if model not in ["completion_id", "original_request"]: + content = model_data["content"] + usage = model_data["usage"] + prompt_tokens = usage["prompt_tokens"] + completion_tokens = usage["completion_tokens"] + total_tokens = usage["total_tokens"] + duration = model_data["duration"] + prompt_messages = json.dumps(original_request["messages"]) + rows.append( + [ + completion_id, + prompt_messages, + model, + content, + prompt_tokens, + completion_tokens, + total_tokens, + duration, + ] + ) + + df = pd.DataFrame( + rows, + columns=[ + "Completion ID", + "Prompt Messages", + "Model", + "Content", + "Prompt Tokens", + "Completion Tokens", + "Total Tokens", + "Duration (ms)", + ], + ) + + return df + + @click.command() @click.option("--limit", default=25, help="Specify the maximum number of completions to retrieve.") @click.option("--offset", default=0, help="Set the starting point (offset) from where to begin fetching completions.") diff --git a/log10/completions/completions.py b/log10/completions/completions.py index 
b52c75b..ed479dd 100644 --- a/log10/completions/completions.py +++ b/log10/completions/completions.py @@ -3,12 +3,8 @@ import click import httpx -import pandas as pd -import rich -from rich.console import Console -from rich.table import Table -from log10._httpx_utils import _get_time_diff, _try_get +from log10._httpx_utils import _try_get from log10.llm import Log10Config @@ -52,11 +48,11 @@ def _get_tag_ids(tags): def _get_completions_url(limit, offset, tags, from_date, to_date, base_url, org_id, printout=True): tag_ids_str = _get_tag_ids(tags) if tags else "" if tag_ids_str and printout: - rich.print(f"Filter with tags: {tags}") + print(f"Filter with tags: {tags}") date_range = _get_valid_date_range(from_date, to_date) if date_range and printout: - rich.print(f"Filter with created date: {date_range['from'][:10]} to {date_range['to'][:10]}") + print(f"Filter with created date: {date_range['from'][:10]} to {date_range['to'][:10]}") url = f"{base_url}/api/completions?organization_id={org_id}&offset={offset}&limit={limit}&tagFilter={tag_ids_str}&createdFilter={json.dumps(date_range)}&sort=created_at&desc=true&ids=" return url @@ -79,71 +75,6 @@ def _get_valid_date_range(from_date, to_date): return date_range -def _render_completions_table(completions_data, total_completions): - data_for_table = [] - for completion in completions_data: - prompt, response = "", "" - if completion.get("kind") == "completion": - prompt = completion.get("request", {}).get("prompt", "") - response_choices = completion.get("response", {}).get("choices", []) - if response_choices: - response = response_choices[0].get("text", "") - elif completion.get("kind") == "chat": - request_messages = completion.get("request", {}).get("messages", []) - prompt = request_messages[0].get("content", "") if request_messages else "" - - response_choices = completion.get("response", {}).get("choices", []) - if response_choices: - # Handle 'message' and 'function_call' within the first choice safely - first_choice = response_choices[0] - if "message" in first_choice: - message = first_choice["message"] - response = ( - message.get("content") - or message.get("tool_calls", [])[-1].get("function", {}).get("arguments", "") - if message.get("tool_calls") - else "" - ) - elif "function_call" in first_choice: - response = json.dumps(first_choice.get("function_call", {})) - else: - rich.print(f"Unknown completion kind: {completion['kind']} for id: {completion['id']}") - - data_for_table.append( - { - "id": completion["id"], - "status": "success" if completion["status"] == "finished" else completion["status"], - "created_at": _get_time_diff(completion["created_at"]), - "prompt": prompt, - "completion": response, - "tags": [t["name"] for t in completion["tagResolved"]], - } - ) - # render data_for_table with rich table - table = Table(show_header=True, header_style="bold magenta") - - table.add_column("ID", style="dim") - table.add_column("Status") - table.add_column("Created At") - table.add_column("Prompt", overflow="fold") - table.add_column("Completion", overflow="fold") - table.add_column("Tags", justify="right") - - max_len = 40 - for item in data_for_table: - tags = ", ".join(item["tags"]) if item["tags"] else "" - if isinstance(item["prompt"], list): - item["prompt"] = " ".join(item["prompt"]) - short_prompt = item["prompt"][:max_len] + "..." if len(item["prompt"]) > max_len else item["prompt"] - completion = item.get("completion", "") - short_completion = completion[:max_len] + "..." 
if len(completion) > max_len else completion - table.add_row(item["id"], item["status"], item["created_at"], short_prompt, short_completion, tags) - - console = Console() - console.print(table) - console.print(f"{total_completions=}") - - def _write_completions(res, output_file, compact_mode): """Processes completions and appends them to the output file.""" with open(output_file, "a") as file: @@ -217,75 +148,11 @@ def _get_llm_repsone( return ret -def _render_comparison_table(model_response_raw_data): - rich.print(f"completion_id: {model_response_raw_data['completion_id']}") - rich.print("original_request:") - rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4)) - - table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True) - table.add_column("Model") - table.add_column("Content") - table.add_column("Total Token Usage (Input/Output)") - table.add_column("Duration (ms)") - - for model, data in model_response_raw_data.items(): - # only display model data - if model not in ["completion_id", "original_request"]: - usage = data["usage"] - formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})" - table.add_row(model, data["content"], formatted_usage, str(data["duration"])) - rich.print(table) - - -def _create_dataframe_from_comparison_data(model_response_raw_data): - completion_id = model_response_raw_data["completion_id"] - original_request = model_response_raw_data["original_request"] - rows = [] - for model, model_data in model_response_raw_data.items(): - # only display model data - if model not in ["completion_id", "original_request"]: - content = model_data["content"] - usage = model_data["usage"] - prompt_tokens = usage["prompt_tokens"] - completion_tokens = usage["completion_tokens"] - total_tokens = usage["total_tokens"] - duration = model_data["duration"] - prompt_messages = json.dumps(original_request["messages"]) - rows.append( - [ - completion_id, - prompt_messages, - model, - content, - prompt_tokens, - completion_tokens, - total_tokens, - duration, - ] - ) - - df = pd.DataFrame( - rows, - columns=[ - "Completion ID", - "Prompt Messages", - "Model", - "Content", - "Prompt Tokens", - "Completion Tokens", - "Total Tokens", - "Duration (ms)", - ], - ) - - return df - - def _compare(models: list[str], messages: dict, temperature: float = 0.2, max_tokens: float = 256, top_p: float = 1.0): ret = {} if models: for model in models: - rich.print(f"Running {model}") + print(f"Running {model}") response = _get_llm_repsone( model, messages, diff --git a/tests/test_cli.py b/tests/test_cli.py index acdda48..3758ead 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,6 +33,15 @@ def test_download_completions(runner): assert "Download total completions: 1/" in result.output +def test_benchmark_models(runner): + tag = "test_tag_c" + model = "gpt-3.5-turbo" + result = runner.invoke(cli, ["completions", "benchmark_models", "--models", model, "--limit", "1", "--tags", tag]) + assert result.exit_code == 0 + assert f"Filter with tags: {tag}" in result.output + assert f"Running {model}" in result.output + + def test_list_feedback(runner): result = runner.invoke(cli, ["feedback", "list"]) assert result.exit_code == 0
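
Local usage note: the CI split introduced above can be reproduced locally with the same commands the workflow runs. A minimal sketch, assuming a poetry-managed checkout and whatever provider credentials the LLM suite already expects:

    # CLI command tests only (the new "Run cli tests" step)
    poetry run pytest -vv tests/test_cli.py

    # LLM tests, i.e. everything except tests/test_cli.py (dispatch and scheduled steps)
    poetry run pytest -vv tests/ --ignore=tests/test_cli.py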