From ac2e1778f1651d68e11ede309b7e24bcb503c22d Mon Sep 17 00:00:00 2001
From: wenzhe <145375501+wenzhe-log10@users.noreply.github.com>
Date: Tue, 16 Apr 2024 21:46:07 -0700
Subject: [PATCH] [feature] add cli `log10 completions benchmark_models` to
compare a logged completion with other models (#141)
* create a log10 completions report to compare models from ids or tags
* generate a report and also run auto-prompt analysis with --analyze_prompt
* minor: enable mistral models
* add and check supported models for gpt, claude-3, and mistral, and log the completions
* updates:
- cli docs
- skip failed completions
- remove double log10 patch
- add pandas in deps
- fix autofeedback import issue
* format
* allow benchmarking the same model
* add an example to test that loading log10 multiple times only logs once
* fix splitting models and completion ids with a trailing comma or repeated commas
* apply the same fix for tags
* minor: skip an id if its kind is not chat
* fix tags: if a tag is not found, don't return any completions
* minor: add a warning if log10 load already patched the module
* minor: clean up the example
* move log10_load_twice.py to tests
---
cli_docs.md | 93 +++++-
log10/__main__.py | 3 +-
log10/completions/completions.py | 340 +++++++++++++++++++++-
log10/feedback/_summary_feedback_utils.py | 6 +
log10/feedback/autofeedback.py | 11 +-
log10/load.py | 8 +-
log10/prompt_analyzer.py | 40 +++
log10/utils.py | 19 ++
pyproject.toml | 1 +
tests/log10_load_twice.py | 28 ++
10 files changed, 539 insertions(+), 10 deletions(-)
create mode 100644 tests/log10_load_twice.py
diff --git a/cli_docs.md b/cli_docs.md
index 5c1eea27..9e5e2ecb 100644
--- a/cli_docs.md
+++ b/cli_docs.md
@@ -82,6 +82,64 @@ output (only showing part of the full raw output):
}
```
+You can load the prompt messages from logged completions and compare them with other LLM models by using [`log10 completions benchmark_models`](#log10-completions-benchmark_models).
+For instance,
+```bash
+log10 completions benchmark_models --ids 25572f3c-c2f1-45b0-9de8-d96be4c4e544 --models=gpt-3.5-turbo,mistral-small-latest,claude-3-haiku-20240307
+```
+output
+```
+Running gpt-3.5-turbo
+Running mistral-small-latest
+Running claude-3-haiku-20240307
+completion_id: 25572f3c-c2f1-45b0-9de8-d96be4c4e544
+original_request:
+{
+ "model": "gpt-4-0125-preview",
+ "messages": [
+ {
+ "role": "system",
+ "content": "Summarize the article in 30 words."
+ },
+ {
+ "role": "user",
+ "content": "\"Story of Your Life\" is a science fiction novella by American writer Ted Chiang, first published in Starlight 2 in 1998, and in 2002 in Chiang's collection of short stories, Stories of Your Life and Others. Its major themes are language and determinism. \"Story of Your Life\" won the 2000 Nebula Award for Best Novella, as well as the 1999 Theodore Sturgeon Award. It was nominated for the 1999 Hugo Award for Best Novella. The novella has been translated into Italian, Japanese, French and German.[1] A film adaptation of the story, Arrival, was conceived and adapted by Eric Heisserer. Titled and directed by Denis Villeneuve, it was released in 2016. It stars Amy Adams, Jeremy Renner, and Forest Whitaker and was nominated for eight Academy Awards, including Best Picture and Best Adapted Screenplay; it won the award for Best Sound Editing.[2][3][4] The film also won the 2017 Ray Bradbury Award for Outstanding Dramatic Presentation and the Hugo Award for Best Dramatic Presentation."
+ }
+ ],
+ "temperature": 0.2
+}
+╭─────────────────────────┬───────────────────────────────────────────────────────┬──────────────────────────────────┬───────────────╮
+│ Model │ Content │ Total Token Usage (Input/Output) │ Duration (ms) │
+├─────────────────────────┼───────────────────────────────────────────────────────┼──────────────────────────────────┼───────────────┤
+│ gpt-4-0125-preview │ "Story of Your Life" by Ted Chiang explores language │ 323 (255/68) │ 2527 │
+│ │ and determinism, winning the 2000 Nebula and 1999 │ │ │
+│ │ Theodore Sturgeon Awards. Adapted into the film │ │ │
+│ │ "Arrival" by Denis Villeneuve in 2016, it received │ │ │
+│ │ critical acclaim and multiple awards, including an │ │ │
+│ │ Academy Award for Best Sound Editing. │ │ │
+├─────────────────────────┼───────────────────────────────────────────────────────┼──────────────────────────────────┼───────────────┤
+│ gpt-3.5-turbo │ "Story of Your Life" is a science fiction novella by │ 295 (255/40) │ 2345 │
+│ │ Ted Chiang, exploring themes of language and │ │ │
+│ │ determinism. It won awards and was adapted into the │ │ │
+│ │ film Arrival in 2016. │ │ │
+├─────────────────────────┼───────────────────────────────────────────────────────┼──────────────────────────────────┼───────────────┤
+│ mistral-small-latest │ "Story of Your Life" is a Ted Chiang novella │ 342 (282/60) │ 2087 │
+│ │ exploring language and determinism, winning Nebula │ │ │
+│ │ and Sturgeon Awards. It was adapted into the 2016 │ │ │
+│ │ film "Arrival," which received multiple Academy Award │ │ │
+│ │ nominations and won for Best Sound Editing. │ │ │
+├─────────────────────────┼───────────────────────────────────────────────────────┼──────────────────────────────────┼───────────────┤
+│ claude-3-haiku-20240307 │ "Story of Your Life" is a science fiction novella by │ 320 (274/46) │ 1944 │
+│ │ Ted Chiang, exploring themes of language and │ │ │
+│ │ determinism, winning multiple awards and inspiring a │ │ │
+│ │ film adaptation, Arrival, which was critically │ │ │
+│ │ acclaimed. │ │ │
+╰─────────────────────────┴───────────────────────────────────────────────────────┴──────────────────────────────────┴───────────────╯
+```
+
+You can also filter completions by tags, generate a report as a markdown file using `--file` or `-f`, and run our prompt analyzer (auto-prompt) using `--analyze_prompt`.
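+For example (the tag name and report filename below are placeholders):
+```bash
+log10 completions benchmark_models --tags summary --limit 2 --models=gpt-3.5-turbo,claude-3-haiku-20240307 --analyze_prompt --file benchmark_report.md
+```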
+
+
### Feedback Tasks and Feedback
To start adding feedback, first you need to define a feedback task with [`log10 feedback-task create`](#log10-feedback-task-create). Then you can add feedback to a logged completions with [`log10 feedback create`](#log10-feedback-create). For more details, you can read more in [log10's user documentation](https://log10.io/docs/feedback/feedback#add-feedback).
@@ -132,10 +190,39 @@ Usage: log10 completions [OPTIONS] COMMAND [ARGS]...
Manage logs from completions i.e. logs from users
+Options:
+ --help Show this message and exit.
+
Commands:
- download Download completions to a jsonl file
- get Get a completion by id
- list List completions
+ benchmark_models Compare completions using different models and...
+ download Download completions to a jsonl file
+ get Get a completion by id
+ list List completions
+```
+
+#### log10 completions benchmark_models
+```bash
+log10 completions benchmark_models --help
+Usage: log10 completions benchmark_models [OPTIONS]
+
+ Compare completions using different models and generate report
+
+Options:
+ --ids TEXT Completion IDs. Separate multiple ids with commas.
+ --tags TEXT Filter completions by specific tags. Separate multiple
+ tags with commas.
+ --limit TEXT Specify the maximum number of completions to retrieve
+ filtered by tags.
+ --offset TEXT Set the starting point (offset) from where to begin
+ fetching completions filtered by tags.
+ --models TEXT Comma separated list of models to compare
+ --temperature FLOAT Temperature
+ --max_tokens INTEGER Max tokens
+ --top_p FLOAT Top p
+ --analyze_prompt Run prompt analyzer on the messages.
+ -f, --file TEXT Specify the filename for the report in markdown
+ format.
+ --help Show this message and exit.
```
#### log10 completions download
diff --git a/log10/__main__.py b/log10/__main__.py
index 5d53659d..20a67310 100644
--- a/log10/__main__.py
+++ b/log10/__main__.py
@@ -1,6 +1,6 @@
import click
-from log10.completions.completions import download_completions, get_completion, list_completions
+from log10.completions.completions import benchmark_models, download_completions, get_completion, list_completions
from log10.feedback.autofeedback import auto_feedback_icl
from log10.feedback.feedback import create_feedback, download_feedback, get_feedback, list_feedback
from log10.feedback.feedback_task import create_feedback_task, get_feedback_task, list_feedback_task
@@ -39,6 +39,7 @@ def feedback_task():
completions.add_command(list_completions, "list")
completions.add_command(get_completion, "get")
completions.add_command(download_completions, "download")
+completions.add_command(benchmark_models, "benchmark_models")
cli.add_command(feedback)
feedback.add_command(create_feedback, "create")
diff --git a/log10/completions/completions.py b/log10/completions/completions.py
index aedc340a..3b28b557 100644
--- a/log10/completions/completions.py
+++ b/log10/completions/completions.py
@@ -1,7 +1,9 @@
import json
+import time
import click
import httpx
+import pandas as pd
import rich
import tqdm
from rich.console import Console
@@ -9,6 +11,8 @@
from log10._httpx_utils import _get_time_diff, _try_get
from log10.llm import Log10Config
+from log10.prompt_analyzer import PromptAnalyzer, convert_suggestion_to_markdown, display_prompt_analyzer_suggestions
+from log10.utils import generate_markdown_report, generate_results_table
_log10_config = Log10Config()
@@ -37,10 +41,13 @@ def _get_tag_id(tag: str) -> str:
def _get_tag_ids(tags):
tag_ids = []
- for tag in tags.split(","):
+ for tag in [t for t in tags.split(",") if t]:
tag_id = _get_tag_id(tag)
if tag_id:
tag_ids.append(tag_id)
+ else:
+            raise SystemExit(f"Cannot find tag: {tag}.")
+
tag_ids_str = ",".join(tag_ids)
return tag_ids_str
@@ -248,3 +255,334 @@ def download_completions(limit, offset, timeout, tags, from_date, to_date, compa
res = _try_get(download_url, timeout)
_write_completions(res, file, compact)
pbar.update(current_batch_size)
+
+
+def _get_llm_response(
+ model: str,
+ messages: list[dict],
+ temperature: float = 0.2,
+ max_tokens: int = 512,
+ top_p: float = 1.0,
+):
+ ret = {"content": "", "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, "duration": 0.0}
+
+ start_time = time.perf_counter()
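+    # Dispatch on the model name: OpenAI chat models (gpt-4*/gpt-3.5*), Anthropic claude-3,
+    # or Mistral. Each branch fills in the response content and token usage; duration is set below.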
+ if "gpt-4" in model or "gpt-3.5" in model:
+ from log10.load import OpenAI
+
+ client = OpenAI()
+ response = client.chat.completions.create(
+ model=model, messages=messages, temperature=temperature, max_tokens=max_tokens, top_p=top_p
+ )
+ ret["content"] = response.choices[0].message.content
+ ret["usage"] = response.usage.dict()
+ elif "claude-3" in model:
+ from log10.load import Anthropic
+
+ system_messages = [m["content"] for m in messages if m["role"] == "system"]
+ other_messages = [m for m in messages if m["role"] != "system"]
+ system_prompt = ("\n").join(system_messages)
+
+ client = Anthropic()
+ response = client.messages.create(
+ model=model,
+ system=system_prompt,
+ messages=other_messages,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ top_p=top_p,
+ )
+ ret["content"] = response.content[0].text
+ ret["usage"]["prompt_tokens"] = response.usage.input_tokens
+ ret["usage"]["completion_tokens"] = response.usage.output_tokens
+ ret["usage"]["total_tokens"] = response.usage.input_tokens + response.usage.output_tokens
+ elif "mistral" in model:
+ import mistralai
+ from mistralai.client import MistralClient
+
+ from log10.load import log10
+
+ log10(mistralai)
+
+ client = MistralClient()
+ response = client.chat(
+ model=model, messages=messages, temperature=temperature, max_tokens=max_tokens, top_p=top_p
+ )
+ ret["content"] = response.choices[0].message.content
+ ret["usage"] = response.usage.model_dump()
+ else:
+ raise ValueError(f"Model {model} not supported.")
+ ret["duration"] = int((time.perf_counter() - start_time) * 1000)
+
+ return ret
+
+
+def _render_comparison_table(model_response_raw_data):
+ rich.print(f"completion_id: {model_response_raw_data['completion_id']}")
+ rich.print("original_request:")
+ rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4))
+
+ table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True)
+ table.add_column("Model")
+ table.add_column("Content")
+ table.add_column("Total Token Usage (Input/Output)")
+ table.add_column("Duration (ms)")
+
+ for model, data in model_response_raw_data.items():
+ # only display model data
+ if model not in ["completion_id", "original_request"]:
+ usage = data["usage"]
+ formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})"
+ table.add_row(model, data["content"], formatted_usage, str(data["duration"]))
+ rich.print(table)
+
+
+def _create_dataframe_from_comparison_data(model_response_raw_data):
+ completion_id = model_response_raw_data["completion_id"]
+ original_request = model_response_raw_data["original_request"]
+ rows = []
+ for model, model_data in model_response_raw_data.items():
+ # only display model data
+ if model not in ["completion_id", "original_request"]:
+ content = model_data["content"]
+ usage = model_data["usage"]
+ prompt_tokens = usage["prompt_tokens"]
+ completion_tokens = usage["completion_tokens"]
+ total_tokens = usage["total_tokens"]
+ duration = model_data["duration"]
+ prompt_messages = json.dumps(original_request["messages"])
+ rows.append(
+ [
+ completion_id,
+ prompt_messages,
+ model,
+ content,
+ prompt_tokens,
+ completion_tokens,
+ total_tokens,
+ duration,
+ ]
+ )
+
+ df = pd.DataFrame(
+ rows,
+ columns=[
+ "Completion ID",
+ "Prompt Messages",
+ "Model",
+ "Content",
+ "Prompt Tokens",
+ "Completion Tokens",
+ "Total Tokens",
+ "Duration (ms)",
+ ],
+ )
+
+ return df
+
+
+def _compare(models: list[str], messages: list[dict], temperature: float = 0.2, max_tokens: int = 256, top_p: float = 1.0):
+ ret = {}
+ if models:
+ for model in models:
+ rich.print(f"Running {model}")
+            response = _get_llm_response(
+ model,
+ messages,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ top_p=top_p,
+ )
+
+ ret[model] = response
+ return ret
+
+
+_SUPPORTED_MODELS = [
+ # openai chat models
+ "gpt-4-turbo",
+ "gpt-4-turbo-2024-04-09",
+ "gpt-4-0125-preview",
+ "gpt-4-turbo-preview",
+ "gpt-4-1106-preview",
+ "gpt-4-vision-preview",
+ "gpt-4",
+ "gpt-4-0314",
+ "gpt-4-0613",
+ "gpt-4-32k",
+ "gpt-4-32k-0314",
+ "gpt-4-32k-0613",
+ "gpt-3.5-turbo",
+ "gpt-3.5-turbo-16k",
+ "gpt-3.5-turbo-0301",
+ "gpt-3.5-turbo-0613",
+ "gpt-3.5-turbo-1106",
+ "gpt-3.5-turbo-0125",
+ "gpt-3.5-turbo-16k-0613",
+ # anthropic claude
+ "claude-3-opus-20240229",
+ "claude-3-sonnet-20240229",
+ "claude-3-haiku-20240307",
+ # mistral
+ "mistral-small-latest",
+ "mistral-medium-latest",
+ "mistral-large-latest",
+]
+
+
+def _check_model_support(model: str) -> bool:
+ return model in _SUPPORTED_MODELS
+
+
+@click.command()
+@click.option("--ids", default="", help="Completion IDs. Separate multiple ids with commas.")
+@click.option("--tags", default="", help="Filter completions by specific tags. Separate multiple tags with commas.")
+@click.option("--limit", help="Specify the maximum number of completions to retrieve filtered by tags.")
+@click.option(
+ "--offset", help="Set the starting point (offset) from where to begin fetching completions filtered by tags."
+)
+@click.option("--models", default="", help="Comma separated list of models to compare")
+@click.option("--temperature", default=0.2, help="Temperature")
+@click.option("--max_tokens", default=512, help="Max tokens")
+@click.option("--top_p", default=1.0, help="Top p")
+@click.option("--analyze_prompt", is_flag=True, help="Run prompt analyzer on the messages.")
+@click.option("--file", "-f", help="Specify the filename for the report in markdown format.")
+def benchmark_models(ids, tags, limit, offset, models, temperature, max_tokens, top_p, file, analyze_prompt):
+ """
+ Compare completions using different models and generate report
+ """
+ if ids and tags:
+ raise click.UsageError("--ids and --tags cannot be set together.")
+ if (limit or offset) and not tags:
+ raise click.UsageError("--limit and --offset can only be used with --tags.")
+ if tags:
+ if not limit:
+ limit = 5
+ if not offset:
+ offset = 0
+
+ if not models:
+ raise click.UsageError("--models must be set to compare.")
+ else:
+ for model in [m for m in models.split(",") if m]:
+ if not _check_model_support(model):
+ raise click.UsageError(f"Model {model} is not supported.")
+
+ # get completions ids
+ completion_ids = []
+ if ids:
+ completion_ids = [id for id in ids.split(",") if id]
+ elif tags:
+ base_url = _log10_config.url
+ org_id = _log10_config.org_id
+ url = _get_completions_url(limit, offset, tags, None, None, base_url, org_id)
+ res = _try_get(url)
+ completions = res.json()["data"]
+ completion_ids = [completion["id"] for completion in completions]
+ if not completion_ids:
+            raise SystemExit(f"No completions found for tags: {tags}")
+
+ compare_models = [m for m in models.split(",") if m]
+
+ data = []
+ skipped_completion_ids = []
+ for id in completion_ids:
+ # get message from id
+ completion_data = _get_completion(id).json()["data"]
+
+ # skip completion if status is not finished or kind is not chat
+ if completion_data["status"] != "finished" or completion_data["kind"] != "chat":
+ rich.print(f"Skip completion {id}. Status is not finished or kind is not chat.")
+ skipped_completion_ids.append(id)
+ continue
+
+ original_model_request = completion_data["request"]
+ original_model_response = completion_data["response"]
+ original_model = original_model_response["model"]
+ benchmark_data = {
+ "completion_id": id,
+ "original_request": original_model_request,
+ f"{original_model} (original model)": {
+ "content": original_model_response["choices"][0]["message"]["content"],
+ "usage": original_model_response["usage"],
+ "duration": completion_data["duration"],
+ },
+ }
+ messages = original_model_request["messages"]
+ compare_models_data = _compare(compare_models, messages, temperature, max_tokens, top_p)
+ benchmark_data.update(compare_models_data)
+ data.append(benchmark_data)
+
+ prompt_analysis_data = {}
+ if analyze_prompt:
+ rich.print("Analyzing prompts")
+ for item in data:
+ completion_id = item["completion_id"]
+ prompt_messages = item["original_request"]["messages"]
+ all_messages = "\n\n".join([m["content"] for m in prompt_messages])
+ analyzer = PromptAnalyzer()
+ suggestions = analyzer.analyze(all_messages)
+ prompt_analysis_data[completion_id] = suggestions
+
+ # create an empty dataframe
+ all_df = pd.DataFrame(
+ columns=[
+ "Completion ID",
+ "Prompt Messages",
+ "Model",
+ "Content",
+ "Prompt Tokens",
+ "Completion Tokens",
+ "Total Tokens",
+ "Duration (ms)",
+ ]
+ )
+
+ #
+ # Display or save the results
+ #
+ if not file:
+ # display in terminal using rich
+ for ret in data:
+ _render_comparison_table(ret)
+ if analyze_prompt:
+ completion_id = ret["completion_id"]
+ suggestions = prompt_analysis_data[completion_id]
+ rich.print(f"Prompt Analysis for completion_id: {completion_id}")
+ display_prompt_analyzer_suggestions(suggestions)
+ else:
+ # generate markdown report and save to file
+ for ret in data:
+ df = _create_dataframe_from_comparison_data(ret)
+ all_df = pd.concat([all_df, df])
+ pivot_df = all_df.pivot(index="Completion ID", columns="Model", values="Content")
+ pivot_df["Prompt Messages"] = all_df.groupby("Completion ID")["Prompt Messages"].first()
+ # Reorder the columns
+ cols = pivot_df.columns.tolist()
+ cols = [cols[-1]] + cols[:-1]
+ pivot_df = pivot_df[cols]
+
+ pivot_table = generate_results_table(pivot_df, section_name="model comparison")
+ all_results_table = generate_results_table(all_df, section_name="All Results")
+
+ prompt_analysis_markdown = ""
+ if analyze_prompt:
+ prompt_analysis_markdown = "## Prompt Analysis\n\n"
+ for completion_id, suggestions in prompt_analysis_data.items():
+ prompt_messages = all_df[all_df["Completion ID"] == completion_id]["Prompt Messages"].values[0]
+ prompt_analysis_markdown += (
+ f"### Prompt Analysis for completion_id: {completion_id}\n\n{prompt_messages}\n\n"
+ )
+ prompt_analysis_markdown += convert_suggestion_to_markdown(suggestions)
+
+ # generate the list of skipped completions ids
+ skipped_completion_markdown = ""
+ if skipped_completion_ids:
+ skipped_completion_ids_str = ", ".join(skipped_completion_ids)
+ skipped_completion_markdown += "## Skipped Completion IDs\n\n"
+ skipped_completion_markdown += f"Skipped completions: {skipped_completion_ids_str}\n\n"
+
+ generate_markdown_report(
+ file, [pivot_table, prompt_analysis_markdown, all_results_table, skipped_completion_markdown]
+ )
+ rich.print(f"Report saved to {file}")
diff --git a/log10/feedback/_summary_feedback_utils.py b/log10/feedback/_summary_feedback_utils.py
index 909c2ec9..c9df2356 100644
--- a/log10/feedback/_summary_feedback_utils.py
+++ b/log10/feedback/_summary_feedback_utils.py
@@ -1,5 +1,11 @@
import sys
+import openai
+
+from log10.load import log10
+
+
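+# Patch the openai module so the summary feedback LLM calls in this module are logged to log10.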
+log10(openai)
if sys.version_info < (3, 10):
raise RuntimeError("Python 3.10 or higher is required to run summary feedback llm call.")
diff --git a/log10/feedback/autofeedback.py b/log10/feedback/autofeedback.py
index c6d98a8b..20e4cdbb 100644
--- a/log10/feedback/autofeedback.py
+++ b/log10/feedback/autofeedback.py
@@ -4,12 +4,11 @@
from types import FunctionType
import click
-import openai
from rich.console import Console
from log10.completions.completions import _get_completion
from log10.feedback.feedback import _get_feedback_list
-from log10.load import log10, log10_session
+from log10.load import log10_session
try:
@@ -19,7 +18,6 @@
except ImportError:
Magentic_imported = False
-log10(openai)
logger = logging.getLogger("LOG10")
logger.setLevel(logging.INFO)
@@ -33,7 +31,12 @@ class AutoFeedbackICL:
_examples: list[dict] = []
_predict_func: FunctionType = None
- def __init__(self, task_id: str, num_samples: int = 5, predict_func: FunctionType = summary_feedback_llm_call):
+ def __init__(
+ self,
+ task_id: str,
+ num_samples: int = 5,
+ predict_func: FunctionType = summary_feedback_llm_call if Magentic_imported else None,
+ ):
if not Magentic_imported:
raise ImportError(
"Log10 feedback predict requires magentic package. Please install using 'pip install log10-io[autofeedback_icl]'"
diff --git a/log10/load.py b/log10/load.py
index 1f3e649b..f091e9e7 100644
--- a/log10/load.py
+++ b/log10/load.py
@@ -789,6 +789,10 @@ def log10(module, DEBUG_=False, USE_ASYNC_=True):
# elif inspect.isclass(method): # Handle nested classes
# intercept_class_methods(method)
+ if getattr(module, "_log10_patched", False):
+ logger.warning(f"{module.__name__} already patched. Skipping.")
+ return
+
if module.__name__ == "anthropic":
attr = module.resources.completions.Completions
method = getattr(attr, "create")
@@ -798,7 +802,7 @@ def log10(module, DEBUG_=False, USE_ASYNC_=True):
attr = module.resources.messages.Messages
method = getattr(attr, "create")
setattr(attr, "create", intercepting_decorator(method))
- if module.__name__ == "mistralai":
+ elif module.__name__ == "mistralai" and getattr(module, "_log10_patched", False) is False:
attr = module.client.MistralClient
method = getattr(attr, "chat")
setattr(attr, "chat", intercepting_decorator(method))
@@ -869,6 +873,8 @@ def new_init(self, *args, **kwargs):
# # else: # uncomment if we want to include nested function support
# # intercept_nested_functions(attr)
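+    # Mark the module as patched so later log10(module) calls only warn and return (see the check above).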
+ module._log10_patched = True
+
if is_openai_v1():
import openai
diff --git a/log10/prompt_analyzer.py b/log10/prompt_analyzer.py
index 20d5f563..5eebde1b 100644
--- a/log10/prompt_analyzer.py
+++ b/log10/prompt_analyzer.py
@@ -3,6 +3,9 @@
import httpx
from dotenv import load_dotenv
+from rich.console import Console
+from rich.panel import Panel
+from rich.text import Text
from log10.llm import Log10Config
@@ -143,3 +146,40 @@ def analyze(self, prompt: str) -> dict:
return suggestions_dict
except Exception as e:
logger.error(e)
+
+
+def convert_suggestion_to_markdown(suggestion):
+ markdown_text = ""
+
+ markdown_text += "| Category | Recommendation |\n"
+ markdown_text += "| --- | --- |\n"
+ for category, contents in suggestion.items():
+ for subcategory, details in contents.items():
+ color = details["color"] # Assuming 'color' is always provided
+ recommendation = details["recommendation"]
+
+ # Using HTML to colorize the subcategory title if 'color' is specified
+ colored_subcategory = (
+                f'<span style="color: {color};">{subcategory}</span>'
+ )
+
+ # Add each subcategory and its recommendation to the table
+ markdown_text += f"| **{colored_subcategory}** | {recommendation} |\n"
+
+ # markdown_text += "\n" # Add a newline for spacing before the next section
+
+ return markdown_text
+
+
+def display_prompt_analyzer_suggestions(data, title=""):
+ console = Console()
+ console.print(f"[bold magenta]{title}[/bold magenta]")
+ for key, value in data.items():
+ for sub_key, sub_value in value.items():
+ text = Text(justify="left")
+ text.append(f"Recommendation: {sub_value['recommendation']}", style="white")
+ console.print(
+ Panel(
+ text, title=f"[bold {sub_value['color']}]{sub_key}[/]", border_style=f"bold {sub_value['color']}"
+ )
+ )
diff --git a/log10/utils.py b/log10/utils.py
index 5e7c3f8b..e2d2d7f6 100644
--- a/log10/utils.py
+++ b/log10/utils.py
@@ -3,6 +3,9 @@
import string
from copy import deepcopy
+from pandas import DataFrame
+from tabulate import tabulate
+
def merge_hparams(override, base):
merged = deepcopy(base)
@@ -45,3 +48,19 @@ def parse_field(value):
except json.JSONDecodeError:
# If it's not valid JSON, return the original string value as a list with singleton element
return [value]
+
+
+def generate_results_table(dataframe: DataFrame, column_list: list[str] = None, section_name: str = "") -> str:
+ selected_df = dataframe[column_list] if column_list else dataframe
+ section_name = f"## {section_name}" if section_name else "## Test Results"
+
+ table = tabulate(selected_df, headers="keys", tablefmt="pipe", showindex=True)
+ ret_str = f"{section_name}\n{table}"
+ return ret_str
+
+
+def generate_markdown_report(test_name: str, report_strings: list[str]):
+ with open(test_name, "w") as f:
+ f.write(f"Generated from {test_name}.\n\n")
+ for report_string in report_strings:
+ f.write(report_string + "\n\n")
diff --git a/pyproject.toml b/pyproject.toml
index 7d7ce1c5..7c2ed22f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ mosaicml-cli = "^0.5.30"
together = "^0.2.7"
google-cloud-aiplatform = ">=1.44.0"
mistralai = "^0.1.5"
+pandas = ">=2"
magentic = {version = ">=0.17.0", optional = true, markers = "python_version >= '3.10'"}
litellm = {version = "^1.34.18", optional = true}
diff --git a/tests/log10_load_twice.py b/tests/log10_load_twice.py
new file mode 100644
index 00000000..945447a7
--- /dev/null
+++ b/tests/log10_load_twice.py
@@ -0,0 +1,28 @@
+import openai
+from openai import OpenAI
+
+from log10.load import log10
+
+
+# Calling log10(openai) twice should not cause the completion to be logged twice
+log10(openai)
+log10(openai)
+
+
+client = OpenAI()
+
+completion = client.chat.completions.create(
+ model="gpt-3.5-turbo",
+ messages=[
+ {
+ "role": "system",
+            "content": "You are the most knowledgeable Star Wars guru on the planet",
+ },
+ {
+ "role": "user",
+ "content": "Write the time period of all the Star Wars movies and spinoffs?",
+ },
+ ],
+)
+
+print(completion.choices[0].message)