
Commit

Separate llm vs cli tests
kxtran committed Jun 27, 2024
1 parent 5f2c053 commit f462938
Showing 4 changed files with 150 additions and 146 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/test.yml
@@ -63,7 +63,7 @@ jobs:
- name: Install dependencies
run: poetry install --all-extras

- name: Run dispatch tests
- name: Run dispatch llm tests
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
run: |
echo "This is a dispatch event"
@@ -113,16 +113,16 @@ jobs:
if $empty_inputs; then
echo "All variables are empty"
poetry run pytest -vv tests/
poetry run pytest -vv tests/ --ignore=tests/test_cli.py
poetry run pytest --llm_provider=anthropic -vv tests/test_magentic.py
fi
- name: Run scheduled tests
- name: Run scheduled llm tests
if: ${{ github.event_name == 'schedule' }}
run: |
echo "This is a schedule event"
poetry run pytest -vv tests/
poetry run pytest --openai_model=gpt-4o -m chat -vv tests/test_openai.py
- name: Test cli commands
run: poetry run pytest -vv tests/test_cli.py
- name: Run cli tests
run: poetry run pytest -vv tests/test_cli.py
136 changes: 132 additions & 4 deletions log10/cli/completions.py
@@ -4,17 +4,16 @@
import pandas as pd
import rich
import tqdm
from rich.console import Console
from rich.table import Table

from log10._httpx_utils import _try_get
from log10._httpx_utils import _get_time_diff, _try_get
from log10.cli_utils import generate_markdown_report, generate_results_table
from log10.completions.completions import (
_check_model_support,
_compare,
_create_dataframe_from_comparison_data,
_get_completion,
_get_completions_url,
_render_comparison_table,
_render_completions_table,
_write_completions,
)
from log10.llm import Log10Config
@@ -24,6 +23,135 @@
_log10_config = Log10Config()


def _render_completions_table(completions_data, total_completions):
data_for_table = []
for completion in completions_data:
prompt, response = "", ""
if completion.get("kind") == "completion":
prompt = completion.get("request", {}).get("prompt", "")
response_choices = completion.get("response", {}).get("choices", [])
if response_choices:
response = response_choices[0].get("text", "")
elif completion.get("kind") == "chat":
request_messages = completion.get("request", {}).get("messages", [])
prompt = request_messages[0].get("content", "") if request_messages else ""

response_choices = completion.get("response", {}).get("choices", [])
if response_choices:
# Handle 'message' and 'function_call' within the first choice safely
first_choice = response_choices[0]
if "message" in first_choice:
message = first_choice["message"]
response = (
message.get("content")
or message.get("tool_calls", [])[-1].get("function", {}).get("arguments", "")
if message.get("tool_calls")
else ""
)
elif "function_call" in first_choice:
response = json.dumps(first_choice.get("function_call", {}))
else:
rich.print(f"Unknown completion kind: {completion['kind']} for id: {completion['id']}")

data_for_table.append(
{
"id": completion["id"],
"status": "success" if completion["status"] == "finished" else completion["status"],
"created_at": _get_time_diff(completion["created_at"]),
"prompt": prompt,
"completion": response,
"tags": [t["name"] for t in completion["tagResolved"]],
}
)
# render data_for_table with rich table
table = Table(show_header=True, header_style="bold magenta")

table.add_column("ID", style="dim")
table.add_column("Status")
table.add_column("Created At")
table.add_column("Prompt", overflow="fold")
table.add_column("Completion", overflow="fold")
table.add_column("Tags", justify="right")

max_len = 40
for item in data_for_table:
tags = ", ".join(item["tags"]) if item["tags"] else ""
if isinstance(item["prompt"], list):
item["prompt"] = " ".join(item["prompt"])
short_prompt = item["prompt"][:max_len] + "..." if len(item["prompt"]) > max_len else item["prompt"]
completion = item.get("completion", "")
short_completion = completion[:max_len] + "..." if len(completion) > max_len else completion
table.add_row(item["id"], item["status"], item["created_at"], short_prompt, short_completion, tags)

console = Console()
console.print(table)
console.print(f"{total_completions=}")


def _render_comparison_table(model_response_raw_data):
rich.print(f"completion_id: {model_response_raw_data['completion_id']}")
rich.print("original_request:")
rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4))

table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True)
table.add_column("Model")
table.add_column("Content")
table.add_column("Total Token Usage (Input/Output)")
table.add_column("Duration (ms)")

for model, data in model_response_raw_data.items():
# only display model data
if model not in ["completion_id", "original_request"]:
usage = data["usage"]
formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})"
table.add_row(model, data["content"], formatted_usage, str(data["duration"]))
rich.print(table)


def _create_dataframe_from_comparison_data(model_response_raw_data):
completion_id = model_response_raw_data["completion_id"]
original_request = model_response_raw_data["original_request"]
rows = []
for model, model_data in model_response_raw_data.items():
# only display model data
if model not in ["completion_id", "original_request"]:
content = model_data["content"]
usage = model_data["usage"]
prompt_tokens = usage["prompt_tokens"]
completion_tokens = usage["completion_tokens"]
total_tokens = usage["total_tokens"]
duration = model_data["duration"]
prompt_messages = json.dumps(original_request["messages"])
rows.append(
[
completion_id,
prompt_messages,
model,
content,
prompt_tokens,
completion_tokens,
total_tokens,
duration,
]
)

df = pd.DataFrame(
rows,
columns=[
"Completion ID",
"Prompt Messages",
"Model",
"Content",
"Prompt Tokens",
"Completion Tokens",
"Total Tokens",
"Duration (ms)",
],
)

return df


@click.command()
@click.option("--limit", default=25, help="Specify the maximum number of completions to retrieve.")
@click.option("--offset", default=0, help="Set the starting point (offset) from where to begin fetching completions.")
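The three helpers above are moved into this module from log10/completions/completions.py (see the deletions in that file below). A minimal sketch of how the two comparison helpers might be exercised, assuming a hand-built payload in the shape the code reads (completion_id, original_request, plus one entry per model carrying content, usage, and duration); the sample values are hypothetical, not taken from the diff:

# Hypothetical payload mirroring the fields the helpers above consume.
sample = {
    "completion_id": "123e4567-e89b-12d3-a456-426614174000",
    "original_request": {"messages": [{"role": "user", "content": "Say hello"}]},
    "gpt-4o": {
        "content": "Hello!",
        "usage": {"prompt_tokens": 9, "completion_tokens": 2, "total_tokens": 11},
        "duration": 412,
    },
    "claude-3-haiku-20240307": {
        "content": "Hi there!",
        "usage": {"prompt_tokens": 9, "completion_tokens": 3, "total_tokens": 12},
        "duration": 388,
    },
}

_render_comparison_table(sample)                     # prints the request JSON and a table with one row per model
df = _create_dataframe_from_comparison_data(sample)  # DataFrame with one row per model
df.to_csv("comparison.csv", index=False)             # e.g. persist the comparison for later inspection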
141 changes: 4 additions & 137 deletions log10/completions/completions.py
@@ -3,12 +3,8 @@

import click
import httpx
import pandas as pd
import rich
from rich.console import Console
from rich.table import Table

from log10._httpx_utils import _get_time_diff, _try_get
from log10._httpx_utils import _try_get
from log10.llm import Log10Config


@@ -52,11 +48,11 @@ def _get_tag_ids(tags):
def _get_completions_url(limit, offset, tags, from_date, to_date, base_url, org_id, printout=True):
tag_ids_str = _get_tag_ids(tags) if tags else ""
if tag_ids_str and printout:
rich.print(f"Filter with tags: {tags}")
print(f"Filter with tags: {tags}")

date_range = _get_valid_date_range(from_date, to_date)
if date_range and printout:
rich.print(f"Filter with created date: {date_range['from'][:10]} to {date_range['to'][:10]}")
print(f"Filter with created date: {date_range['from'][:10]} to {date_range['to'][:10]}")

url = f"{base_url}/api/completions?organization_id={org_id}&offset={offset}&limit={limit}&tagFilter={tag_ids_str}&createdFilter={json.dumps(date_range)}&sort=created_at&desc=true&ids="
return url
@@ -79,71 +75,6 @@ def _get_valid_date_range(from_date, to_date):
return date_range


def _render_completions_table(completions_data, total_completions):
data_for_table = []
for completion in completions_data:
prompt, response = "", ""
if completion.get("kind") == "completion":
prompt = completion.get("request", {}).get("prompt", "")
response_choices = completion.get("response", {}).get("choices", [])
if response_choices:
response = response_choices[0].get("text", "")
elif completion.get("kind") == "chat":
request_messages = completion.get("request", {}).get("messages", [])
prompt = request_messages[0].get("content", "") if request_messages else ""

response_choices = completion.get("response", {}).get("choices", [])
if response_choices:
# Handle 'message' and 'function_call' within the first choice safely
first_choice = response_choices[0]
if "message" in first_choice:
message = first_choice["message"]
response = (
message.get("content")
or message.get("tool_calls", [])[-1].get("function", {}).get("arguments", "")
if message.get("tool_calls")
else ""
)
elif "function_call" in first_choice:
response = json.dumps(first_choice.get("function_call", {}))
else:
rich.print(f"Unknown completion kind: {completion['kind']} for id: {completion['id']}")

data_for_table.append(
{
"id": completion["id"],
"status": "success" if completion["status"] == "finished" else completion["status"],
"created_at": _get_time_diff(completion["created_at"]),
"prompt": prompt,
"completion": response,
"tags": [t["name"] for t in completion["tagResolved"]],
}
)
# render data_for_table with rich table
table = Table(show_header=True, header_style="bold magenta")

table.add_column("ID", style="dim")
table.add_column("Status")
table.add_column("Created At")
table.add_column("Prompt", overflow="fold")
table.add_column("Completion", overflow="fold")
table.add_column("Tags", justify="right")

max_len = 40
for item in data_for_table:
tags = ", ".join(item["tags"]) if item["tags"] else ""
if isinstance(item["prompt"], list):
item["prompt"] = " ".join(item["prompt"])
short_prompt = item["prompt"][:max_len] + "..." if len(item["prompt"]) > max_len else item["prompt"]
completion = item.get("completion", "")
short_completion = completion[:max_len] + "..." if len(completion) > max_len else completion
table.add_row(item["id"], item["status"], item["created_at"], short_prompt, short_completion, tags)

console = Console()
console.print(table)
console.print(f"{total_completions=}")


def _write_completions(res, output_file, compact_mode):
"""Processes completions and appends them to the output file."""
with open(output_file, "a") as file:
@@ -217,75 +148,11 @@ def _get_llm_repsone(
return ret


def _render_comparison_table(model_response_raw_data):
rich.print(f"completion_id: {model_response_raw_data['completion_id']}")
rich.print("original_request:")
rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4))

table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True)
table.add_column("Model")
table.add_column("Content")
table.add_column("Total Token Usage (Input/Output)")
table.add_column("Duration (ms)")

for model, data in model_response_raw_data.items():
# only display model data
if model not in ["completion_id", "original_request"]:
usage = data["usage"]
formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})"
table.add_row(model, data["content"], formatted_usage, str(data["duration"]))
rich.print(table)


def _create_dataframe_from_comparison_data(model_response_raw_data):
completion_id = model_response_raw_data["completion_id"]
original_request = model_response_raw_data["original_request"]
rows = []
for model, model_data in model_response_raw_data.items():
# only display model data
if model not in ["completion_id", "original_request"]:
content = model_data["content"]
usage = model_data["usage"]
prompt_tokens = usage["prompt_tokens"]
completion_tokens = usage["completion_tokens"]
total_tokens = usage["total_tokens"]
duration = model_data["duration"]
prompt_messages = json.dumps(original_request["messages"])
rows.append(
[
completion_id,
prompt_messages,
model,
content,
prompt_tokens,
completion_tokens,
total_tokens,
duration,
]
)

df = pd.DataFrame(
rows,
columns=[
"Completion ID",
"Prompt Messages",
"Model",
"Content",
"Prompt Tokens",
"Completion Tokens",
"Total Tokens",
"Duration (ms)",
],
)

return df


def _compare(models: list[str], messages: dict, temperature: float = 0.2, max_tokens: float = 256, top_p: float = 1.0):
ret = {}
if models:
for model in models:
rich.print(f"Running {model}")
print(f"Running {model}")
response = _get_llm_repsone(
model,
messages,
9 changes: 9 additions & 0 deletions tests/test_cli.py
@@ -33,6 +33,15 @@ def test_download_completions(runner):
assert "Download total completions: 1/" in result.output


def test_benchmark_models(runner):
tag = "test_tag_c"
model = "gpt-3.5-turbo"
result = runner.invoke(cli, ["completions", "benchmark_models", "--models", model, "--limit", "1", "--tags", tag])
assert result.exit_code == 0
assert f"Filter with tags: {tag}" in result.output
assert f"Running {model}" in result.output


def test_list_feedback(runner):
result = runner.invoke(cli, ["feedback", "list"])
assert result.exit_code == 0
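For reference, a minimal sketch of the pattern the new test relies on, assuming Click's CliRunner and the same cli group that tests/test_cli.py already imports (the import path below is hypothetical; reuse the import from the test module):

from click.testing import CliRunner

from log10.cli.cli_commands import cli  # hypothetical import path; use the one in tests/test_cli.py

runner = CliRunner()
result = runner.invoke(
    cli,
    ["completions", "benchmark_models", "--models", "gpt-3.5-turbo", "--limit", "1", "--tags", "test_tag_c"],
)
assert result.exit_code == 0, result.output  # command exits cleanly
print(result.output)                         # expected to contain "Filter with tags: ..." and "Running gpt-3.5-turbo"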
