From ac2e1778f1651d68e11ede309b7e24bcb503c22d Mon Sep 17 00:00:00 2001
From: wenzhe <145375501+wenzhe-log10@users.noreply.github.com>
Date: Tue, 16 Apr 2024 21:46:07 -0700
Subject: [PATCH] [feature] add cli `log10 completions benchmark_models` to
compare a logged completion with other models (#141)
* create a log10 completions report to compare models from ids or tags
* generate a report and also run auto-prompt analysis with --analyze_prompt
* minor: enable mistral models
* add and check supported models for gpt, claude-3, and mistral, and log the completions
* updates:
- cli docs
- skip failed completions
- remove double log10 patch
- add pandas in deps
- fix autofeedback import issue
* format
* allow benchmarking the same model
* add an example to test that loading log10 multiple times only logs once
* fix splitting models and completion ids with a trailing comma or repeated commas
* apply the same fix for tags
* minor: skip an id if its kind is not chat
* fix tags: if a tag is not found, don't return any completions
* minor: add a warning if log10 load already patched the module
* minor: clean up the example
* move log10_load_twice.py to tests
---
cli_docs.md | 93 +++++-
log10/__main__.py | 3 +-
log10/completions/completions.py | 340 +++++++++++++++++++++-
log10/feedback/_summary_feedback_utils.py | 6 +
log10/feedback/autofeedback.py | 11 +-
log10/load.py | 8 +-
log10/prompt_analyzer.py | 40 +++
log10/utils.py | 19 ++
pyproject.toml | 1 +
tests/log10_load_twice.py | 28 ++
10 files changed, 539 insertions(+), 10 deletions(-)
create mode 100644 tests/log10_load_twice.py
diff --git a/cli_docs.md b/cli_docs.md
index 5c1eea27..9e5e2ecb 100644
--- a/cli_docs.md
+++ b/cli_docs.md
@@ -82,6 +82,64 @@ output (only showing part of the full raw output):
}
```
+You can load the prompt messages from logged completions and compare them with other LLM models by using [`log10 completions benchmark_models`](#log10-completions-benchmark_models).
+For instance,
+```bash
+log10 completions benchmark_models --ids 25572f3c-c2f1-45b0-9de8-d96be4c4e544 --models=gpt-3.5-turbo,mistral-small-latest,claude-3-haiku-20240307
+```
+output
+```
+Running gpt-3.5-turbo
+Running mistral-small-latest
+Running claude-3-haiku-20240307
+completion_id: 25572f3c-c2f1-45b0-9de8-d96be4c4e544
+original_request:
+{
+ "model": "gpt-4-0125-preview",
+ "messages": [
+ {
+ "role": "system",
+ "content": "Summarize the article in 30 words."
+ },
+ {
+ "role": "user",
+ "content": "\"Story of Your Life\" is a science fiction novella by American writer Ted Chiang, first published in Starlight 2 in 1998, and in 2002 in Chiang's collection of short stories, Stories of Your Life and Others. Its major themes are language and determinism. \"Story of Your Life\" won the 2000 Nebula Award for Best Novella, as well as the 1999 Theodore Sturgeon Award. It was nominated for the 1999 Hugo Award for Best Novella. The novella has been translated into Italian, Japanese, French and German.[1] A film adaptation of the story, Arrival, was conceived and adapted by Eric Heisserer. Titled and directed by Denis Villeneuve, it was released in 2016. It stars Amy Adams, Jeremy Renner, and Forest Whitaker and was nominated for eight Academy Awards, including Best Picture and Best Adapted Screenplay; it won the award for Best Sound Editing.[2][3][4] The film also won the 2017 Ray Bradbury Award for Outstanding Dramatic Presentation and the Hugo Award for Best Dramatic Presentation."
+ }
+ ],
+ "temperature": 0.2
+}
+╭─────────────────────────┬───────────────────────────────────────────────────────┬──────────────────────────────────┬───────────────╮
+│ Model │ Content │ Total Token Usage (Input/Output) │ Duration (ms) │
+├─────────────────────────┼───────────────────────────────────────────────────────┼──────────────────────────────────┼───────────────┤
+│ gpt-4-0125-preview │ "Story of Your Life" by Ted Chiang explores language │ 323 (255/68) │ 2527 │
+│ │ and determinism, winning the 2000 Nebula and 1999 │ │ │
+│ │ Theodore Sturgeon Awards. Adapted into the film │ │ │
+│ │ "Arrival" by Denis Villeneuve in 2016, it received │ │ │
+│ │ critical acclaim and multiple awards, including an │ │ │
+│ │ Academy Award for Best Sound Editing. │ │ │
+├─────────────────────────┼───────────────────────────────────────────────────────┼──────────────────────────────────┼───────────────┤
+│ gpt-3.5-turbo │ "Story of Your Life" is a science fiction novella by │ 295 (255/40) │ 2345 │
+│ │ Ted Chiang, exploring themes of language and │ │ │
+│ │ determinism. It won awards and was adapted into the │ │ │
+│ │ film Arrival in 2016. │ │ │
+├─────────────────────────┼───────────────────────────────────────────────────────┼──────────────────────────────────┼───────────────┤
+│ mistral-small-latest │ "Story of Your Life" is a Ted Chiang novella │ 342 (282/60) │ 2087 │
+│ │ exploring language and determinism, winning Nebula │ │ │
+│ │ and Sturgeon Awards. It was adapted into the 2016 │ │ │
+│ │ film "Arrival," which received multiple Academy Award │ │ │
+│ │ nominations and won for Best Sound Editing. │ │ │
+├─────────────────────────┼───────────────────────────────────────────────────────┼──────────────────────────────────┼───────────────┤
+│ claude-3-haiku-20240307 │ "Story of Your Life" is a science fiction novella by │ 320 (274/46) │ 1944 │
+│ │ Ted Chiang, exploring themes of language and │ │ │
+│ │ determinism, winning multiple awards and inspiring a │ │ │
+│ │ film adaptation, Arrival, which was critically │ │ │
+│ │ acclaimed. │ │ │
+╰─────────────────────────┴───────────────────────────────────────────────────────┴──────────────────────────────────┴───────────────╯
+```
+
+You can also filter completions by tags, generate a report as a markdown file using `--file` or `-f`, and run our prompt analyzer (auto-prompt) using `--analyze_prompt`.
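+For example (the tag name and report filename below are placeholders):
+```bash
+log10 completions benchmark_models --tags summary --limit 2 --models=gpt-3.5-turbo,claude-3-haiku-20240307 --analyze_prompt --file benchmark_report.md
+```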
+
+
### Feedback Tasks and Feedback
To start adding feedback, first you need to define a feedback task with [`log10 feedback-task create`](#log10-feedback-task-create). Then you can add feedback to a logged completions with [`log10 feedback create`](#log10-feedback-create). For more details, you can read more in [log10's user documentation](https://log10.io/docs/feedback/feedback#add-feedback).
@@ -132,10 +190,39 @@ Usage: log10 completions [OPTIONS] COMMAND [ARGS]...
Manage logs from completions i.e. logs from users
+Options:
+ --help Show this message and exit.
+
Commands:
- download Download completions to a jsonl file
- get Get a completion by id
- list List completions
+ benchmark_models Compare completions using different models and...
+ download Download completions to a jsonl file
+ get Get a completion by id
+ list List completions
+```
+
+#### log10 completions benchmark_models
+```bash
+log10 completions benchmark_models --help
+Usage: log10 completions benchmark_models [OPTIONS]
+
+ Compare completions using different models and generate report
+
+Options:
+ --ids TEXT Completion IDs. Separate multiple ids with commas.
+ --tags TEXT Filter completions by specific tags. Separate multiple
+ tags with commas.
+ --limit TEXT Specify the maximum number of completions to retrieve
+ filtered by tags.
+ --offset TEXT Set the starting point (offset) from where to begin
+ fetching completions filtered by tags.
+ --models TEXT Comma separated list of models to compare
+ --temperature FLOAT Temperature
+ --max_tokens INTEGER Max tokens
+ --top_p FLOAT Top p
+ --analyze_prompt Run prompt analyzer on the messages.
+ -f, --file TEXT Specify the filename for the report in markdown
+ format.
+ --help Show this message and exit.
```
#### log10 completions download
diff --git a/log10/__main__.py b/log10/__main__.py
index 5d53659d..20a67310 100644
--- a/log10/__main__.py
+++ b/log10/__main__.py
@@ -1,6 +1,6 @@
import click
-from log10.completions.completions import download_completions, get_completion, list_completions
+from log10.completions.completions import benchmark_models, download_completions, get_completion, list_completions
from log10.feedback.autofeedback import auto_feedback_icl
from log10.feedback.feedback import create_feedback, download_feedback, get_feedback, list_feedback
from log10.feedback.feedback_task import create_feedback_task, get_feedback_task, list_feedback_task
@@ -39,6 +39,7 @@ def feedback_task():
completions.add_command(list_completions, "list")
completions.add_command(get_completion, "get")
completions.add_command(download_completions, "download")
+completions.add_command(benchmark_models, "benchmark_models")
cli.add_command(feedback)
feedback.add_command(create_feedback, "create")
diff --git a/log10/completions/completions.py b/log10/completions/completions.py
index aedc340a..3b28b557 100644
--- a/log10/completions/completions.py
+++ b/log10/completions/completions.py
@@ -1,7 +1,9 @@
import json
+import time
import click
import httpx
+import pandas as pd
import rich
import tqdm
from rich.console import Console
@@ -9,6 +11,8 @@
from log10._httpx_utils import _get_time_diff, _try_get
from log10.llm import Log10Config
+from log10.prompt_analyzer import PromptAnalyzer, convert_suggestion_to_markdown, display_prompt_analyzer_suggestions
+from log10.utils import generate_markdown_report, generate_results_table
_log10_config = Log10Config()
@@ -37,10 +41,13 @@ def _get_tag_id(tag: str) -> str:
def _get_tag_ids(tags):
tag_ids = []
- for tag in tags.split(","):
+ for tag in [t for t in tags.split(",") if t]:
tag_id = _get_tag_id(tag)
if tag_id:
tag_ids.append(tag_id)
+ else:
+            raise SystemExit(f"Cannot find tag: {tag}.")
+
tag_ids_str = ",".join(tag_ids)
return tag_ids_str
@@ -248,3 +255,334 @@ def download_completions(limit, offset, timeout, tags, from_date, to_date, compa
res = _try_get(download_url, timeout)
_write_completions(res, file, compact)
pbar.update(current_batch_size)
+
+
+def _get_llm_response(
+ model: str,
+ messages: list[dict],
+ temperature: float = 0.2,
+ max_tokens: int = 512,
+ top_p: float = 1.0,
+):
+ ret = {"content": "", "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, "duration": 0.0}
+
+ start_time = time.perf_counter()
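+    # Dispatch on the model name: OpenAI chat models (gpt-4*/gpt-3.5*), Anthropic claude-3,
+    # or Mistral. Each branch fills in the response content and token usage; duration is set below.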
+ if "gpt-4" in model or "gpt-3.5" in model:
+ from log10.load import OpenAI
+
+ client = OpenAI()
+ response = client.chat.completions.create(
+ model=model, messages=messages, temperature=temperature, max_tokens=max_tokens, top_p=top_p
+ )
+ ret["content"] = response.choices[0].message.content
+ ret["usage"] = response.usage.dict()
+ elif "claude-3" in model:
+ from log10.load import Anthropic
+
+ system_messages = [m["content"] for m in messages if m["role"] == "system"]
+ other_messages = [m for m in messages if m["role"] != "system"]
+ system_prompt = ("\n").join(system_messages)
+
+ client = Anthropic()
+ response = client.messages.create(
+ model=model,
+ system=system_prompt,
+ messages=other_messages,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ top_p=top_p,
+ )
+ ret["content"] = response.content[0].text
+ ret["usage"]["prompt_tokens"] = response.usage.input_tokens
+ ret["usage"]["completion_tokens"] = response.usage.output_tokens
+ ret["usage"]["total_tokens"] = response.usage.input_tokens + response.usage.output_tokens
+ elif "mistral" in model:
+ import mistralai
+ from mistralai.client import MistralClient
+
+ from log10.load import log10
+
+ log10(mistralai)
+
+ client = MistralClient()
+ response = client.chat(
+ model=model, messages=messages, temperature=temperature, max_tokens=max_tokens, top_p=top_p
+ )
+ ret["content"] = response.choices[0].message.content
+ ret["usage"] = response.usage.model_dump()
+ else:
+ raise ValueError(f"Model {model} not supported.")
+ ret["duration"] = int((time.perf_counter() - start_time) * 1000)
+
+ return ret
+
+
+def _render_comparison_table(model_response_raw_data):
+ rich.print(f"completion_id: {model_response_raw_data['completion_id']}")
+ rich.print("original_request:")
+ rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4))
+
+ table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True)
+ table.add_column("Model")
+ table.add_column("Content")
+ table.add_column("Total Token Usage (Input/Output)")
+ table.add_column("Duration (ms)")
+
+ for model, data in model_response_raw_data.items():
+ # only display model data
+ if model not in ["completion_id", "original_request"]:
+ usage = data["usage"]
+ formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})"
+ table.add_row(model, data["content"], formatted_usage, str(data["duration"]))
+ rich.print(table)
+
+
+def _create_dataframe_from_comparison_data(model_response_raw_data):
+ completion_id = model_response_raw_data["completion_id"]
+ original_request = model_response_raw_data["original_request"]
+ rows = []
+ for model, model_data in model_response_raw_data.items():
+ # only display model data
+ if model not in ["completion_id", "original_request"]:
+ content = model_data["content"]
+ usage = model_data["usage"]
+ prompt_tokens = usage["prompt_tokens"]
+ completion_tokens = usage["completion_tokens"]
+ total_tokens = usage["total_tokens"]
+ duration = model_data["duration"]
+ prompt_messages = json.dumps(original_request["messages"])
+ rows.append(
+ [
+ completion_id,
+ prompt_messages,
+ model,
+ content,
+ prompt_tokens,
+ completion_tokens,
+ total_tokens,
+ duration,
+ ]
+ )
+
+ df = pd.DataFrame(
+ rows,
+ columns=[
+ "Completion ID",
+ "Prompt Messages",
+ "Model",
+ "Content",
+ "Prompt Tokens",
+ "Completion Tokens",
+ "Total Tokens",
+ "Duration (ms)",
+ ],
+ )
+
+ return df
+
+
+def _compare(models: list[str], messages: list[dict], temperature: float = 0.2, max_tokens: int = 256, top_p: float = 1.0):
+ ret = {}
+ if models:
+ for model in models:
+ rich.print(f"Running {model}")
+            response = _get_llm_response(
+ model,
+ messages,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ top_p=top_p,
+ )
+
+ ret[model] = response
+ return ret
+
+
+_SUPPORTED_MODELS = [
+ # openai chat models
+ "gpt-4-turbo",
+ "gpt-4-turbo-2024-04-09",
+ "gpt-4-0125-preview",
+ "gpt-4-turbo-preview",
+ "gpt-4-1106-preview",
+ "gpt-4-vision-preview",
+ "gpt-4",
+ "gpt-4-0314",
+ "gpt-4-0613",
+ "gpt-4-32k",
+ "gpt-4-32k-0314",
+ "gpt-4-32k-0613",
+ "gpt-3.5-turbo",
+ "gpt-3.5-turbo-16k",
+ "gpt-3.5-turbo-0301",
+ "gpt-3.5-turbo-0613",
+ "gpt-3.5-turbo-1106",
+ "gpt-3.5-turbo-0125",
+ "gpt-3.5-turbo-16k-0613",
+ # anthropic claude
+ "claude-3-opus-20240229",
+ "claude-3-sonnet-20240229",
+ "claude-3-haiku-20240307",
+ # mistral
+ "mistral-small-latest",
+ "mistral-medium-latest",
+ "mistral-large-latest",
+]
+
+
+def _check_model_support(model: str) -> bool:
+ return model in _SUPPORTED_MODELS
+
+
+@click.command()
+@click.option("--ids", default="", help="Completion IDs. Separate multiple ids with commas.")
+@click.option("--tags", default="", help="Filter completions by specific tags. Separate multiple tags with commas.")
+@click.option("--limit", help="Specify the maximum number of completions to retrieve filtered by tags.")
+@click.option(
+ "--offset", help="Set the starting point (offset) from where to begin fetching completions filtered by tags."
+)
+@click.option("--models", default="", help="Comma separated list of models to compare")
+@click.option("--temperature", default=0.2, help="Temperature")
+@click.option("--max_tokens", default=512, help="Max tokens")
+@click.option("--top_p", default=1.0, help="Top p")
+@click.option("--analyze_prompt", is_flag=True, help="Run prompt analyzer on the messages.")
+@click.option("--file", "-f", help="Specify the filename for the report in markdown format.")
+def benchmark_models(ids, tags, limit, offset, models, temperature, max_tokens, top_p, file, analyze_prompt):
+ """
+ Compare completions using different models and generate report
+ """
+ if ids and tags:
+ raise click.UsageError("--ids and --tags cannot be set together.")
+ if (limit or offset) and not tags:
+ raise click.UsageError("--limit and --offset can only be used with --tags.")
+ if tags:
+ if not limit:
+ limit = 5
+ if not offset:
+ offset = 0
+
+ if not models:
+ raise click.UsageError("--models must be set to compare.")
+ else:
+ for model in [m for m in models.split(",") if m]:
+ if not _check_model_support(model):
+ raise click.UsageError(f"Model {model} is not supported.")
+
+ # get completions ids
+ completion_ids = []
+ if ids:
+ completion_ids = [id for id in ids.split(",") if id]
+ elif tags:
+ base_url = _log10_config.url
+ org_id = _log10_config.org_id
+ url = _get_completions_url(limit, offset, tags, None, None, base_url, org_id)
+ res = _try_get(url)
+ completions = res.json()["data"]
+ completion_ids = [completion["id"] for completion in completions]
+ if not completion_ids:
+            raise SystemExit(f"No completions found for tags: {tags}")
+
+ compare_models = [m for m in models.split(",") if m]
+
+ data = []
+ skipped_completion_ids = []
+ for id in completion_ids:
+ # get message from id
+ completion_data = _get_completion(id).json()["data"]
+
+ # skip completion if status is not finished or kind is not chat
+ if completion_data["status"] != "finished" or completion_data["kind"] != "chat":
+ rich.print(f"Skip completion {id}. Status is not finished or kind is not chat.")
+ skipped_completion_ids.append(id)
+ continue
+
+ original_model_request = completion_data["request"]
+ original_model_response = completion_data["response"]
+ original_model = original_model_response["model"]
+ benchmark_data = {
+ "completion_id": id,
+ "original_request": original_model_request,
+ f"{original_model} (original model)": {
+ "content": original_model_response["choices"][0]["message"]["content"],
+ "usage": original_model_response["usage"],
+ "duration": completion_data["duration"],
+ },
+ }
+ messages = original_model_request["messages"]
+ compare_models_data = _compare(compare_models, messages, temperature, max_tokens, top_p)
+ benchmark_data.update(compare_models_data)
+ data.append(benchmark_data)
+
+ prompt_analysis_data = {}
+ if analyze_prompt:
+ rich.print("Analyzing prompts")
+ for item in data:
+ completion_id = item["completion_id"]
+ prompt_messages = item["original_request"]["messages"]
+ all_messages = "\n\n".join([m["content"] for m in prompt_messages])
+ analyzer = PromptAnalyzer()
+ suggestions = analyzer.analyze(all_messages)
+ prompt_analysis_data[completion_id] = suggestions
+
+ # create an empty dataframe
+ all_df = pd.DataFrame(
+ columns=[
+ "Completion ID",
+ "Prompt Messages",
+ "Model",
+ "Content",
+ "Prompt Tokens",
+ "Completion Tokens",
+ "Total Tokens",
+ "Duration (ms)",
+ ]
+ )
+
+ #
+ # Display or save the results
+ #
+ if not file:
+ # display in terminal using rich
+ for ret in data:
+ _render_comparison_table(ret)
+ if analyze_prompt:
+ completion_id = ret["completion_id"]
+ suggestions = prompt_analysis_data[completion_id]
+ rich.print(f"Prompt Analysis for completion_id: {completion_id}")
+ display_prompt_analyzer_suggestions(suggestions)
+ else:
+ # generate markdown report and save to file
+ for ret in data:
+ df = _create_dataframe_from_comparison_data(ret)
+ all_df = pd.concat([all_df, df])
+ pivot_df = all_df.pivot(index="Completion ID", columns="Model", values="Content")
+ pivot_df["Prompt Messages"] = all_df.groupby("Completion ID")["Prompt Messages"].first()
+ # Reorder the columns
+ cols = pivot_df.columns.tolist()
+ cols = [cols[-1]] + cols[:-1]
+ pivot_df = pivot_df[cols]
+
+ pivot_table = generate_results_table(pivot_df, section_name="model comparison")
+ all_results_table = generate_results_table(all_df, section_name="All Results")
+
+ prompt_analysis_markdown = ""
+ if analyze_prompt:
+ prompt_analysis_markdown = "## Prompt Analysis\n\n"
+ for completion_id, suggestions in prompt_analysis_data.items():
+ prompt_messages = all_df[all_df["Completion ID"] == completion_id]["Prompt Messages"].values[0]
+ prompt_analysis_markdown += (
+ f"### Prompt Analysis for completion_id: {completion_id}\n\n{prompt_messages}\n\n"
+ )
+ prompt_analysis_markdown += convert_suggestion_to_markdown(suggestions)
+
+ # generate the list of skipped completions ids
+ skipped_completion_markdown = ""
+ if skipped_completion_ids:
+ skipped_completion_ids_str = ", ".join(skipped_completion_ids)
+ skipped_completion_markdown += "## Skipped Completion IDs\n\n"
+ skipped_completion_markdown += f"Skipped completions: {skipped_completion_ids_str}\n\n"
+
+ generate_markdown_report(
+ file, [pivot_table, prompt_analysis_markdown, all_results_table, skipped_completion_markdown]
+ )
+ rich.print(f"Report saved to {file}")
diff --git a/log10/feedback/_summary_feedback_utils.py b/log10/feedback/_summary_feedback_utils.py
index 909c2ec9..c9df2356 100644
--- a/log10/feedback/_summary_feedback_utils.py
+++ b/log10/feedback/_summary_feedback_utils.py
@@ -1,5 +1,11 @@
import sys
+import openai
+
+from log10.load import log10
+
+
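+# Patch the openai module so the summary feedback LLM calls in this module are logged to log10.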
+log10(openai)
if sys.version_info < (3, 10):
raise RuntimeError("Python 3.10 or higher is required to run summary feedback llm call.")
diff --git a/log10/feedback/autofeedback.py b/log10/feedback/autofeedback.py
index c6d98a8b..20e4cdbb 100644
--- a/log10/feedback/autofeedback.py
+++ b/log10/feedback/autofeedback.py
@@ -4,12 +4,11 @@
from types import FunctionType
import click
-import openai
from rich.console import Console
from log10.completions.completions import _get_completion
from log10.feedback.feedback import _get_feedback_list
-from log10.load import log10, log10_session
+from log10.load import log10_session
try:
@@ -19,7 +18,6 @@
except ImportError:
Magentic_imported = False
-log10(openai)
logger = logging.getLogger("LOG10")
logger.setLevel(logging.INFO)
@@ -33,7 +31,12 @@ class AutoFeedbackICL:
_examples: list[dict] = []
_predict_func: FunctionType = None
- def __init__(self, task_id: str, num_samples: int = 5, predict_func: FunctionType = summary_feedback_llm_call):
+ def __init__(
+ self,
+ task_id: str,
+ num_samples: int = 5,
+ predict_func: FunctionType = summary_feedback_llm_call if Magentic_imported else None,
+ ):
if not Magentic_imported:
raise ImportError(
"Log10 feedback predict requires magentic package. Please install using 'pip install log10-io[autofeedback_icl]'"
diff --git a/log10/load.py b/log10/load.py
index 1f3e649b..f091e9e7 100644
--- a/log10/load.py
+++ b/log10/load.py
@@ -789,6 +789,10 @@ def log10(module, DEBUG_=False, USE_ASYNC_=True):
# elif inspect.isclass(method): # Handle nested classes
# intercept_class_methods(method)
+ if getattr(module, "_log10_patched", False):
+ logger.warning(f"{module.__name__} already patched. Skipping.")
+ return
+
if module.__name__ == "anthropic":
attr = module.resources.completions.Completions
method = getattr(attr, "create")
@@ -798,7 +802,7 @@ def log10(module, DEBUG_=False, USE_ASYNC_=True):
attr = module.resources.messages.Messages
method = getattr(attr, "create")
setattr(attr, "create", intercepting_decorator(method))
- if module.__name__ == "mistralai":
+ elif module.__name__ == "mistralai" and getattr(module, "_log10_patched", False) is False:
attr = module.client.MistralClient
method = getattr(attr, "chat")
setattr(attr, "chat", intercepting_decorator(method))
@@ -869,6 +873,8 @@ def new_init(self, *args, **kwargs):
# # else: # uncomment if we want to include nested function support
# # intercept_nested_functions(attr)
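+    # Mark the module as patched so later log10(module) calls only warn and return (see the check above).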
+ module._log10_patched = True
+
if is_openai_v1():
import openai
diff --git a/log10/prompt_analyzer.py b/log10/prompt_analyzer.py
index 20d5f563..5eebde1b 100644
--- a/log10/prompt_analyzer.py
+++ b/log10/prompt_analyzer.py
@@ -3,6 +3,9 @@
import httpx
from dotenv import load_dotenv
+from rich.console import Console
+from rich.panel import Panel
+from rich.text import Text
from log10.llm import Log10Config
@@ -143,3 +146,40 @@ def analyze(self, prompt: str) -> dict:
return suggestions_dict
except Exception as e:
logger.error(e)
+
+
+def convert_suggestion_to_markdown(suggestion):
+ markdown_text = ""
+
+ markdown_text += "| Category | Recommendation |\n"
+ markdown_text += "| --- | --- |\n"
+ for category, contents in suggestion.items():
+ for subcategory, details in contents.items():
+ color = details["color"] # Assuming 'color' is always provided
+ recommendation = details["recommendation"]
+
+ # Using HTML to colorize the subcategory title if 'color' is specified
+ colored_subcategory = (
+                f'<span style="color: {color};">{subcategory}</span>'
+ )
+
+ # Add each subcategory and its recommendation to the table
+ markdown_text += f"| **{colored_subcategory}** | {recommendation} |\n"
+
+ # markdown_text += "\n" # Add a newline for spacing before the next section
+
+ return markdown_text
+
+
+def display_prompt_analyzer_suggestions(data, title=""):
+ console = Console()
+ console.print(f"[bold magenta]{title}[/bold magenta]")
+ for key, value in data.items():
+ for sub_key, sub_value in value.items():
+ text = Text(justify="left")
+ text.append(f"Recommendation: {sub_value['recommendation']}", style="white")
+ console.print(
+ Panel(
+ text, title=f"[bold {sub_value['color']}]{sub_key}[/]", border_style=f"bold {sub_value['color']}"
+ )
+ )
diff --git a/log10/utils.py b/log10/utils.py
index 5e7c3f8b..e2d2d7f6 100644
--- a/log10/utils.py
+++ b/log10/utils.py
@@ -3,6 +3,9 @@
import string
from copy import deepcopy
+from pandas import DataFrame
+from tabulate import tabulate
+
def merge_hparams(override, base):
merged = deepcopy(base)
@@ -45,3 +48,19 @@ def parse_field(value):
except json.JSONDecodeError:
# If it's not valid JSON, return the original string value as a list with singleton element
return [value]
+
+
+def generate_results_table(dataframe: DataFrame, column_list: list[str] = None, section_name: str = "") -> str:
+ selected_df = dataframe[column_list] if column_list else dataframe
+ section_name = f"## {section_name}" if section_name else "## Test Results"
+
+ table = tabulate(selected_df, headers="keys", tablefmt="pipe", showindex=True)
+ ret_str = f"{section_name}\n{table}"
+ return ret_str
+
+
+def generate_markdown_report(test_name: str, report_strings: list[str]):
+ with open(test_name, "w") as f:
+ f.write(f"Generated from {test_name}.\n\n")
+ for report_string in report_strings:
+ f.write(report_string + "\n\n")
diff --git a/pyproject.toml b/pyproject.toml
index 7d7ce1c5..7c2ed22f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ mosaicml-cli = "^0.5.30"
together = "^0.2.7"
google-cloud-aiplatform = ">=1.44.0"
mistralai = "^0.1.5"
+pandas = ">=2"
magentic = {version = ">=0.17.0", optional = true, markers = "python_version >= '3.10'"}
litellm = {version = "^1.34.18", optional = true}
diff --git a/tests/log10_load_twice.py b/tests/log10_load_twice.py
new file mode 100644
index 00000000..945447a7
--- /dev/null
+++ b/tests/log10_load_twice.py
@@ -0,0 +1,28 @@
+import openai
+from openai import OpenAI
+
+from log10.load import log10
+
+
+# Calling log10(openai) twice should not cause the completion to be logged twice
+log10(openai)
+log10(openai)
+
+
+client = OpenAI()
+
+completion = client.chat.completions.create(
+ model="gpt-3.5-turbo",
+ messages=[
+ {
+ "role": "system",
+            "content": "You are the most knowledgeable Star Wars guru on the planet",
+ },
+ {
+ "role": "user",
+ "content": "Write the time period of all the Star Wars movies and spinoffs?",
+ },
+ ],
+)
+
+print(completion.choices[0].message)