From 4138b220ef1809db892d6d76ad724b7aeea74641 Mon Sep 17 00:00:00 2001 From: Kim Tran Date: Wed, 26 Jun 2024 22:59:38 -0400 Subject: [PATCH] Separate llm vs cli tests --- .github/workflows/test.yml | 14 +-- log10/cli/completions.py | 136 ++++++++++++++++++++++++++++- log10/completions/completions.py | 141 +------------------------------ tests/test_cli.py | 9 ++ 4 files changed, 152 insertions(+), 148 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 78a1940..036f606 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -63,7 +63,10 @@ jobs: - name: Install dependencies run: poetry install --all-extras - - name: Run dispatch tests + - name: Run cli tests + run: poetry run pytest -vv tests/test_cli.py + + - name: Run dispatch llm tests if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }} run: | echo "This is a dispatch event" @@ -113,16 +116,13 @@ jobs: if $empty_inputs; then echo "All variables are empty" - poetry run pytest -vv tests/ + poetry run pytest -vv tests/ --ignore=tests/test_cli.py poetry run pytest --llm_provider=anthropic -vv tests/test_magentic.py fi - - name: Run scheduled tests + - name: Run scheduled llm tests if: ${{ github.event_name == 'schedule' }} run: | echo "This is a schedule event" - poetry run pytest -vv tests/ + poetry run pytest -vv tests/ --ignore=tests/test_cli.py poetry run pytest --openai_model=gpt-4o -m chat -vv tests/test_openai.py - - - name: Test cli commands - run: poetry run pytest -vv tests/test_cli.py \ No newline at end of file diff --git a/log10/cli/completions.py b/log10/cli/completions.py index 9ac9db7..885447b 100644 --- a/log10/cli/completions.py +++ b/log10/cli/completions.py @@ -4,17 +4,16 @@ import pandas as pd import rich import tqdm +from rich.console import Console +from rich.table import Table -from log10._httpx_utils import _try_get +from log10._httpx_utils import _get_time_diff, _try_get from log10.cli_utils import generate_markdown_report, generate_results_table from log10.completions.completions import ( _check_model_support, _compare, - _create_dataframe_from_comparison_data, _get_completion, _get_completions_url, - _render_comparison_table, - _render_completions_table, _write_completions, ) from log10.llm import Log10Config @@ -24,6 +23,135 @@ _log10_config = Log10Config() +def _render_completions_table(completions_data, total_completions): + data_for_table = [] + for completion in completions_data: + prompt, response = "", "" + if completion.get("kind") == "completion": + prompt = completion.get("request", {}).get("prompt", "") + response_choices = completion.get("response", {}).get("choices", []) + if response_choices: + response = response_choices[0].get("text", "") + elif completion.get("kind") == "chat": + request_messages = completion.get("request", {}).get("messages", []) + prompt = request_messages[0].get("content", "") if request_messages else "" + + response_choices = completion.get("response", {}).get("choices", []) + if response_choices: + # Handle 'message' and 'function_call' within the first choice safely + first_choice = response_choices[0] + if "message" in first_choice: + message = first_choice["message"] + response = ( + message.get("content") + or message.get("tool_calls", [])[-1].get("function", {}).get("arguments", "") + if message.get("tool_calls") + else "" + ) + elif "function_call" in first_choice: + response = json.dumps(first_choice.get("function_call", {})) + else: + rich.print(f"Unknown completion kind: 
{completion['kind']} for id: {completion['id']}") + + data_for_table.append( + { + "id": completion["id"], + "status": "success" if completion["status"] == "finished" else completion["status"], + "created_at": _get_time_diff(completion["created_at"]), + "prompt": prompt, + "completion": response, + "tags": [t["name"] for t in completion["tagResolved"]], + } + ) + # render data_for_table with rich table + table = Table(show_header=True, header_style="bold magenta") + + table.add_column("ID", style="dim") + table.add_column("Status") + table.add_column("Created At") + table.add_column("Prompt", overflow="fold") + table.add_column("Completion", overflow="fold") + table.add_column("Tags", justify="right") + + max_len = 40 + for item in data_for_table: + tags = ", ".join(item["tags"]) if item["tags"] else "" + if isinstance(item["prompt"], list): + item["prompt"] = " ".join(item["prompt"]) + short_prompt = item["prompt"][:max_len] + "..." if len(item["prompt"]) > max_len else item["prompt"] + completion = item.get("completion", "") + short_completion = completion[:max_len] + "..." if len(completion) > max_len else completion + table.add_row(item["id"], item["status"], item["created_at"], short_prompt, short_completion, tags) + + console = Console() + console.print(table) + console.print(f"{total_completions=}") + + +def _render_comparison_table(model_response_raw_data): + rich.print(f"completion_id: {model_response_raw_data['completion_id']}") + rich.print("original_request:") + rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4)) + + table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True) + table.add_column("Model") + table.add_column("Content") + table.add_column("Total Token Usage (Input/Output)") + table.add_column("Duration (ms)") + + for model, data in model_response_raw_data.items(): + # only display model data + if model not in ["completion_id", "original_request"]: + usage = data["usage"] + formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})" + table.add_row(model, data["content"], formatted_usage, str(data["duration"])) + rich.print(table) + + +def _create_dataframe_from_comparison_data(model_response_raw_data): + completion_id = model_response_raw_data["completion_id"] + original_request = model_response_raw_data["original_request"] + rows = [] + for model, model_data in model_response_raw_data.items(): + # only display model data + if model not in ["completion_id", "original_request"]: + content = model_data["content"] + usage = model_data["usage"] + prompt_tokens = usage["prompt_tokens"] + completion_tokens = usage["completion_tokens"] + total_tokens = usage["total_tokens"] + duration = model_data["duration"] + prompt_messages = json.dumps(original_request["messages"]) + rows.append( + [ + completion_id, + prompt_messages, + model, + content, + prompt_tokens, + completion_tokens, + total_tokens, + duration, + ] + ) + + df = pd.DataFrame( + rows, + columns=[ + "Completion ID", + "Prompt Messages", + "Model", + "Content", + "Prompt Tokens", + "Completion Tokens", + "Total Tokens", + "Duration (ms)", + ], + ) + + return df + + @click.command() @click.option("--limit", default=25, help="Specify the maximum number of completions to retrieve.") @click.option("--offset", default=0, help="Set the starting point (offset) from where to begin fetching completions.") diff --git a/log10/completions/completions.py b/log10/completions/completions.py index 
b52c75b..ed479dd 100644 --- a/log10/completions/completions.py +++ b/log10/completions/completions.py @@ -3,12 +3,8 @@ import click import httpx -import pandas as pd -import rich -from rich.console import Console -from rich.table import Table -from log10._httpx_utils import _get_time_diff, _try_get +from log10._httpx_utils import _try_get from log10.llm import Log10Config @@ -52,11 +48,11 @@ def _get_tag_ids(tags): def _get_completions_url(limit, offset, tags, from_date, to_date, base_url, org_id, printout=True): tag_ids_str = _get_tag_ids(tags) if tags else "" if tag_ids_str and printout: - rich.print(f"Filter with tags: {tags}") + print(f"Filter with tags: {tags}") date_range = _get_valid_date_range(from_date, to_date) if date_range and printout: - rich.print(f"Filter with created date: {date_range['from'][:10]} to {date_range['to'][:10]}") + print(f"Filter with created date: {date_range['from'][:10]} to {date_range['to'][:10]}") url = f"{base_url}/api/completions?organization_id={org_id}&offset={offset}&limit={limit}&tagFilter={tag_ids_str}&createdFilter={json.dumps(date_range)}&sort=created_at&desc=true&ids=" return url @@ -79,71 +75,6 @@ def _get_valid_date_range(from_date, to_date): return date_range -def _render_completions_table(completions_data, total_completions): - data_for_table = [] - for completion in completions_data: - prompt, response = "", "" - if completion.get("kind") == "completion": - prompt = completion.get("request", {}).get("prompt", "") - response_choices = completion.get("response", {}).get("choices", []) - if response_choices: - response = response_choices[0].get("text", "") - elif completion.get("kind") == "chat": - request_messages = completion.get("request", {}).get("messages", []) - prompt = request_messages[0].get("content", "") if request_messages else "" - - response_choices = completion.get("response", {}).get("choices", []) - if response_choices: - # Handle 'message' and 'function_call' within the first choice safely - first_choice = response_choices[0] - if "message" in first_choice: - message = first_choice["message"] - response = ( - message.get("content") - or message.get("tool_calls", [])[-1].get("function", {}).get("arguments", "") - if message.get("tool_calls") - else "" - ) - elif "function_call" in first_choice: - response = json.dumps(first_choice.get("function_call", {})) - else: - rich.print(f"Unknown completion kind: {completion['kind']} for id: {completion['id']}") - - data_for_table.append( - { - "id": completion["id"], - "status": "success" if completion["status"] == "finished" else completion["status"], - "created_at": _get_time_diff(completion["created_at"]), - "prompt": prompt, - "completion": response, - "tags": [t["name"] for t in completion["tagResolved"]], - } - ) - # render data_for_table with rich table - table = Table(show_header=True, header_style="bold magenta") - - table.add_column("ID", style="dim") - table.add_column("Status") - table.add_column("Created At") - table.add_column("Prompt", overflow="fold") - table.add_column("Completion", overflow="fold") - table.add_column("Tags", justify="right") - - max_len = 40 - for item in data_for_table: - tags = ", ".join(item["tags"]) if item["tags"] else "" - if isinstance(item["prompt"], list): - item["prompt"] = " ".join(item["prompt"]) - short_prompt = item["prompt"][:max_len] + "..." if len(item["prompt"]) > max_len else item["prompt"] - completion = item.get("completion", "") - short_completion = completion[:max_len] + "..." 
if len(completion) > max_len else completion - table.add_row(item["id"], item["status"], item["created_at"], short_prompt, short_completion, tags) - - console = Console() - console.print(table) - console.print(f"{total_completions=}") - - def _write_completions(res, output_file, compact_mode): """Processes completions and appends them to the output file.""" with open(output_file, "a") as file: @@ -217,75 +148,11 @@ def _get_llm_repsone( return ret -def _render_comparison_table(model_response_raw_data): - rich.print(f"completion_id: {model_response_raw_data['completion_id']}") - rich.print("original_request:") - rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4)) - - table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True) - table.add_column("Model") - table.add_column("Content") - table.add_column("Total Token Usage (Input/Output)") - table.add_column("Duration (ms)") - - for model, data in model_response_raw_data.items(): - # only display model data - if model not in ["completion_id", "original_request"]: - usage = data["usage"] - formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})" - table.add_row(model, data["content"], formatted_usage, str(data["duration"])) - rich.print(table) - - -def _create_dataframe_from_comparison_data(model_response_raw_data): - completion_id = model_response_raw_data["completion_id"] - original_request = model_response_raw_data["original_request"] - rows = [] - for model, model_data in model_response_raw_data.items(): - # only display model data - if model not in ["completion_id", "original_request"]: - content = model_data["content"] - usage = model_data["usage"] - prompt_tokens = usage["prompt_tokens"] - completion_tokens = usage["completion_tokens"] - total_tokens = usage["total_tokens"] - duration = model_data["duration"] - prompt_messages = json.dumps(original_request["messages"]) - rows.append( - [ - completion_id, - prompt_messages, - model, - content, - prompt_tokens, - completion_tokens, - total_tokens, - duration, - ] - ) - - df = pd.DataFrame( - rows, - columns=[ - "Completion ID", - "Prompt Messages", - "Model", - "Content", - "Prompt Tokens", - "Completion Tokens", - "Total Tokens", - "Duration (ms)", - ], - ) - - return df - - def _compare(models: list[str], messages: dict, temperature: float = 0.2, max_tokens: float = 256, top_p: float = 1.0): ret = {} if models: for model in models: - rich.print(f"Running {model}") + print(f"Running {model}") response = _get_llm_repsone( model, messages, diff --git a/tests/test_cli.py b/tests/test_cli.py index acdda48..3758ead 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,6 +33,15 @@ def test_download_completions(runner): assert "Download total completions: 1/" in result.output +def test_benchmark_models(runner): + tag = "test_tag_c" + model = "gpt-3.5-turbo" + result = runner.invoke(cli, ["completions", "benchmark_models", "--models", model, "--limit", "1", "--tags", tag]) + assert result.exit_code == 0 + assert f"Filter with tags: {tag}" in result.output + assert f"Running {model}" in result.output + + def test_list_feedback(runner): result = runner.invoke(cli, ["feedback", "list"]) assert result.exit_code == 0
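
Local usage note: the CI split introduced above can be reproduced locally with the same commands the workflow runs. A minimal sketch, assuming a poetry-managed checkout and whatever provider credentials the LLM suite already expects:

    # CLI command tests only (the new "Run cli tests" step)
    poetry run pytest -vv tests/test_cli.py

    # LLM tests, i.e. everything except tests/test_cli.py (dispatch and scheduled steps)
    poetry run pytest -vv tests/ --ignore=tests/test_cli.py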