diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7956e481..036f6062 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -63,7 +63,10 @@ jobs: - name: Install dependencies run: poetry install --all-extras - - name: Run dispatch tests + - name: Run cli tests + run: poetry run pytest -vv tests/test_cli.py + + - name: Run dispatch llm tests if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }} run: | echo "This is a dispatch event" @@ -113,13 +116,13 @@ jobs: if $empty_inputs; then echo "All variables are empty" - poetry run pytest -vv tests/ + poetry run pytest -vv tests/ --ignore=tests/test_cli.py poetry run pytest --llm_provider=anthropic -vv tests/test_magentic.py fi - - name: Run scheduled tests + - name: Run scheduled llm tests if: ${{ github.event_name == 'schedule' }} run: | echo "This is a schedule event" - poetry run pytest -vv tests/ + poetry run pytest -vv tests/ --ignore=tests/test_cli.py poetry run pytest --openai_model=gpt-4o -m chat -vv tests/test_openai.py diff --git a/log10/__main__.py b/log10/__main__.py index 6365121b..07c9387e 100644 --- a/log10/__main__.py +++ b/log10/__main__.py @@ -1,68 +1,5 @@ -import click +from log10.cli.cli_commands import cli -from log10.completions.completions import benchmark_models, download_completions, get_completion, list_completions -from log10.feedback.autofeedback import auto_feedback_icl, get_autofeedback_cli -from log10.feedback.feedback import create_feedback, download_feedback, get_feedback, list_feedback -from log10.feedback.feedback_task import create_feedback_task, get_feedback_task, list_feedback_task - - -@click.group() -def cli(): - pass - - -@click.group() -def completions(): - """ - Manage logs from completions i.e. logs from users - """ - pass - - -@click.group(name="feedback") -def feedback(): - """ - Manage feedback for completions i.e. capturing feedback from users - """ - pass - - -@click.group(name="auto_feedback") -def auto_feedback(): - """ - Manage auto feedback for completions i.e. capturing feedback from users - """ - pass - - -@click.group() -def feedback_task(): - """ - Manage tasks for feedback i.e. 
instructions and schema for feedback - """ - pass - - -cli.add_command(completions) -completions.add_command(list_completions, "list") -completions.add_command(get_completion, "get") -completions.add_command(download_completions, "download") -completions.add_command(benchmark_models, "benchmark_models") - -cli.add_command(feedback) -feedback.add_command(create_feedback, "create") -feedback.add_command(list_feedback, "list") -feedback.add_command(get_feedback, "get") -feedback.add_command(download_feedback, "download") -feedback.add_command(auto_feedback_icl, "predict") -feedback.add_command(auto_feedback, "autofeedback") -# Subcommands for auto_feedback under feedback command -auto_feedback.add_command(get_autofeedback_cli, "get") - -cli.add_command(feedback_task) -feedback_task.add_command(create_feedback_task, "create") -feedback_task.add_command(list_feedback_task, "list") -feedback_task.add_command(get_feedback_task, "get") if __name__ == "__main__": cli() diff --git a/log10/cli/autofeedback.py b/log10/cli/autofeedback.py new file mode 100644 index 00000000..a1ecf364 --- /dev/null +++ b/log10/cli/autofeedback.py @@ -0,0 +1,47 @@ +import json + +import click +import rich +from rich.console import Console + +from log10.feedback.autofeedback import AutoFeedbackICL, get_autofeedback + + +@click.command() +@click.option("--task_id", help="Feedback task ID") +@click.option("--content", help="Completion content") +@click.option("--file", "-f", help="File containing completion content") +@click.option("--completion_id", help="Completion ID") +@click.option("--num_samples", default=5, help="Number of samples to use for few-shot learning") +def auto_feedback_icl(task_id: str, content: str, file: str, completion_id: str, num_samples: int): + """ + Generate feedback with existing human feedback based on in context learning + """ + options_count = sum([1 for option in [content, file, completion_id] if option]) + if options_count > 1: + click.echo("Only one of --content, --file, or --completion_id should be provided.") + return + + console = Console() + auto_feedback_icl = AutoFeedbackICL(task_id, num_samples=num_samples) + if completion_id: + results = auto_feedback_icl.predict(completion_id=completion_id) + console.print_json(results) + return + + if file: + with open(file, "r") as f: + content = f.read() + results = auto_feedback_icl.predict(text=content) + console.print_json(results) + + +@click.command() +@click.option("--completion-id", required=True, help="Completion ID") +def get_autofeedback_cli(completion_id: str): + """ + Get an auto feedback by completion id + """ + res = get_autofeedback(completion_id) + if res: + rich.print_json(json.dumps(res["data"], indent=4)) diff --git a/log10/cli/cli_commands.py b/log10/cli/cli_commands.py new file mode 100644 index 00000000..0396841b --- /dev/null +++ b/log10/cli/cli_commands.py @@ -0,0 +1,74 @@ +try: + import click + import pandas # noqa: F401 + import rich # noqa: F401 + import tabulate # noqa: F401 +except ImportError: + print( + "To use log10 cli you must install optional modules. Please install them with `pip install 'log10-io[cli]'`." 
+ ) + exit(1) + +from log10.cli.autofeedback import auto_feedback_icl, get_autofeedback_cli +from log10.cli.completions import benchmark_models, download_completions, get_completion, list_completions +from log10.cli.feedback import create_feedback, download_feedback, get_feedback, list_feedback +from log10.cli.feedback_task import create_feedback_task, get_feedback_task, list_feedback_task + + +@click.group() +def cli(): + pass + + +@click.group() +def completions(): + """ + Manage logs from completions i.e. logs from users + """ + pass + + +@click.group(name="feedback") +def feedback(): + """ + Manage feedback for completions i.e. capturing feedback from users + """ + pass + + +@click.group(name="auto_feedback") +def auto_feedback(): + """ + Manage auto feedback for completions i.e. capturing feedback from users + """ + pass + + +@click.group() +def feedback_task(): + """ + Manage tasks for feedback i.e. instructions and schema for feedback + """ + pass + + +cli.add_command(completions) +completions.add_command(list_completions, "list") +completions.add_command(get_completion, "get") +completions.add_command(download_completions, "download") +completions.add_command(benchmark_models, "benchmark_models") + +cli.add_command(feedback) +feedback.add_command(create_feedback, "create") +feedback.add_command(list_feedback, "list") +feedback.add_command(get_feedback, "get") +feedback.add_command(download_feedback, "download") +feedback.add_command(auto_feedback_icl, "predict") +feedback.add_command(auto_feedback, "autofeedback") +# Subcommands for auto_feedback under feedback command +auto_feedback.add_command(get_autofeedback_cli, "get") + +cli.add_command(feedback_task) +feedback_task.add_command(create_feedback_task, "create") +feedback_task.add_command(list_feedback_task, "list") +feedback_task.add_command(get_feedback_task, "get") diff --git a/log10/cli/completions.py b/log10/cli/completions.py new file mode 100644 index 00000000..885447ba --- /dev/null +++ b/log10/cli/completions.py @@ -0,0 +1,408 @@ +import json + +import click +import pandas as pd +import rich +import tqdm +from rich.console import Console +from rich.table import Table + +from log10._httpx_utils import _get_time_diff, _try_get +from log10.cli_utils import generate_markdown_report, generate_results_table +from log10.completions.completions import ( + _check_model_support, + _compare, + _get_completion, + _get_completions_url, + _write_completions, +) +from log10.llm import Log10Config +from log10.prompt_analyzer import PromptAnalyzer, convert_suggestion_to_markdown, display_prompt_analyzer_suggestions + + +_log10_config = Log10Config() + + +def _render_completions_table(completions_data, total_completions): + data_for_table = [] + for completion in completions_data: + prompt, response = "", "" + if completion.get("kind") == "completion": + prompt = completion.get("request", {}).get("prompt", "") + response_choices = completion.get("response", {}).get("choices", []) + if response_choices: + response = response_choices[0].get("text", "") + elif completion.get("kind") == "chat": + request_messages = completion.get("request", {}).get("messages", []) + prompt = request_messages[0].get("content", "") if request_messages else "" + + response_choices = completion.get("response", {}).get("choices", []) + if response_choices: + # Handle 'message' and 'function_call' within the first choice safely + first_choice = response_choices[0] + if "message" in first_choice: + message = first_choice["message"] + response = ( + 
+                        # prefer message content; fall back to the last tool call's arguments
+                        message.get("content")
+                        or (
+                            message.get("tool_calls", [])[-1].get("function", {}).get("arguments", "")
+                            if message.get("tool_calls")
+                            else ""
+                        )
+                    )
+                elif "function_call" in first_choice:
+                    response = json.dumps(first_choice.get("function_call", {}))
+        else:
+            rich.print(f"Unknown completion kind: {completion['kind']} for id: {completion['id']}")
+
+        data_for_table.append(
+            {
+                "id": completion["id"],
+                "status": "success" if completion["status"] == "finished" else completion["status"],
+                "created_at": _get_time_diff(completion["created_at"]),
+                "prompt": prompt,
+                "completion": response,
+                "tags": [t["name"] for t in completion["tagResolved"]],
+            }
+        )
+    # render data_for_table with rich table
+    table = Table(show_header=True, header_style="bold magenta")
+
+    table.add_column("ID", style="dim")
+    table.add_column("Status")
+    table.add_column("Created At")
+    table.add_column("Prompt", overflow="fold")
+    table.add_column("Completion", overflow="fold")
+    table.add_column("Tags", justify="right")
+
+    max_len = 40
+    for item in data_for_table:
+        tags = ", ".join(item["tags"]) if item["tags"] else ""
+        if isinstance(item["prompt"], list):
+            item["prompt"] = " ".join(item["prompt"])
+        short_prompt = item["prompt"][:max_len] + "..." if len(item["prompt"]) > max_len else item["prompt"]
+        completion = item.get("completion", "")
+        short_completion = completion[:max_len] + "..." if len(completion) > max_len else completion
+        table.add_row(item["id"], item["status"], item["created_at"], short_prompt, short_completion, tags)
+
+    console = Console()
+    console.print(table)
+    console.print(f"{total_completions=}")
+
+
+def _render_comparison_table(model_response_raw_data):
+    rich.print(f"completion_id: {model_response_raw_data['completion_id']}")
+    rich.print("original_request:")
+    rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4))
+
+    table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True)
+    table.add_column("Model")
+    table.add_column("Content")
+    table.add_column("Total Token Usage (Input/Output)")
+    table.add_column("Duration (ms)")
+
+    for model, data in model_response_raw_data.items():
+        # only display model data
+        if model not in ["completion_id", "original_request"]:
+            usage = data["usage"]
+            formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})"
+            table.add_row(model, data["content"], formatted_usage, str(data["duration"]))
+    rich.print(table)
+
+
+def _create_dataframe_from_comparison_data(model_response_raw_data):
+    completion_id = model_response_raw_data["completion_id"]
+    original_request = model_response_raw_data["original_request"]
+    rows = []
+    for model, model_data in model_response_raw_data.items():
+        # only display model data
+        if model not in ["completion_id", "original_request"]:
+            content = model_data["content"]
+            usage = model_data["usage"]
+            prompt_tokens = usage["prompt_tokens"]
+            completion_tokens = usage["completion_tokens"]
+            total_tokens = usage["total_tokens"]
+            duration = model_data["duration"]
+            prompt_messages = json.dumps(original_request["messages"])
+            rows.append(
+                [
+                    completion_id,
+                    prompt_messages,
+                    model,
+                    content,
+                    prompt_tokens,
+                    completion_tokens,
+                    total_tokens,
+                    duration,
+                ]
+            )
+
+    df = pd.DataFrame(
+        rows,
+        columns=[
+            "Completion ID",
+            "Prompt Messages",
+            "Model",
+            "Content",
+            "Prompt Tokens",
+            "Completion Tokens",
+            "Total Tokens",
+            "Duration (ms)",
+        ],
+    )
+
+    return df
+
+
+@click.command()
+@click.option("--limit", default=25, help="Specify the maximum number of completions to retrieve.") +@click.option("--offset", default=0, help="Set the starting point (offset) from where to begin fetching completions.") +@click.option( + "--timeout", default=10, help="Set the maximum time (in seconds) allowed for the HTTP request to complete." +) +@click.option("--tags", default="", help="Filter completions by specific tags. Separate multiple tags with commas.") +@click.option( + "--from", + "from_date", + type=click.DateTime(), + help="Define the start date for fetching completions (inclusive). Use the format: YYYY-MM-DD.", +) +@click.option( + "--to", + "to_date", + type=click.DateTime(), + help="Set the end date for fetching completions (inclusive). Use the format: YYYY-MM-DD.", +) +def list_completions(limit, offset, timeout, tags, from_date, to_date): + """ + List completions + """ + base_url = _log10_config.url + org_id = _log10_config.org_id + + url = _get_completions_url(limit, offset, tags, from_date, to_date, base_url, org_id) + # Fetch completions + res = _try_get(url, timeout) + + completions = res.json() + total_completions = completions["total"] + completions = completions["data"] + + _render_completions_table(completions, total_completions) + + +@click.command() +@click.option("--id", prompt="Enter completion id", help="Completion ID") +def get_completion(id): + """ + Get a completion by id + """ + res = _get_completion(id) + rich.print_json(json.dumps(res.json()["data"], indent=4)) + + +@click.command() +@click.option("--limit", default="", help="Specify the maximum number of completions to retrieve.") +@click.option("--offset", default="", help="Set the starting point (offset) from where to begin fetching completions.") +@click.option( + "--timeout", default=10, help="Set the maximum time (in seconds) allowed for the HTTP request to complete." +) +@click.option("--tags", default="", help="Filter completions by specific tags. Separate multiple tags with commas.") +@click.option( + "--from", + "from_date", + type=click.DateTime(), + help="Define the start date for fetching completions (inclusive). Use the format: YYYY-MM-DD.", +) +@click.option( + "--to", + "to_date", + type=click.DateTime(), + help="Set the end date for fetching completions (inclusive). 
Use the format: YYYY-MM-DD.",
+)
+@click.option("--compact", is_flag=True, help="Enable to download only the compact version of the output.")
+@click.option("--file", "-f", default="completions.jsonl", help="Specify the filename and path for the output file.")
+def download_completions(limit, offset, timeout, tags, from_date, to_date, compact, file):
+    """
+    Download completions to a jsonl file
+    """
+    base_url = _log10_config.url
+    org_id = _log10_config.org_id
+
+    init_url = _get_completions_url(1, 0, tags, from_date, to_date, base_url, org_id)
+    res = _try_get(init_url)
+    if res.status_code != 200:
+        rich.print(f"Error: {res.json()}")
+        return
+
+    total_completions = res.json()["total"]
+    offset = int(offset) if offset else 0
+    limit = int(limit) if limit else total_completions
+    rich.print(f"Download total completions: {limit}/{total_completions}")
+    if not click.confirm("Do you want to continue?"):
+        return
+
+    # download completions
+    pbar = tqdm.tqdm(total=limit)
+    batch_size = 10
+    end = offset + limit if offset + limit < total_completions else total_completions
+    for batch in range(offset, end, batch_size):
+        current_batch_size = batch_size if batch + batch_size < end else end - batch
+        download_url = _get_completions_url(
+            current_batch_size, batch, tags, from_date, to_date, base_url, org_id, printout=False
+        )
+        res = _try_get(download_url, timeout)
+        _write_completions(res, file, compact)
+        pbar.update(current_batch_size)
+
+
+@click.command()
+@click.option("--ids", default="", help="Completion IDs. Separate multiple ids with commas.")
+@click.option("--tags", default="", help="Filter completions by specific tags. Separate multiple tags with commas.")
+@click.option("--limit", help="Specify the maximum number of completions to retrieve filtered by tags.")
+@click.option(
+    "--offset", help="Set the starting point (offset) from where to begin fetching completions filtered by tags."
+)
+@click.option("--models", default="", help="Comma separated list of models to compare")
+@click.option("--temperature", default=0.2, help="Temperature")
+@click.option("--max_tokens", default=512, help="Max tokens")
+@click.option("--top_p", default=1.0, help="Top p")
+@click.option("--analyze_prompt", is_flag=True, help="Run prompt analyzer on the messages.")
+@click.option("--file", "-f", help="Specify the filename for the report in markdown format.")
+def benchmark_models(ids, tags, limit, offset, models, temperature, max_tokens, top_p, file, analyze_prompt):
+    """
+    Compare completions using different models and generate report
+    """
+    if ids and tags:
+        raise click.UsageError("--ids and --tags cannot be set together.")
+    if (limit or offset) and not tags:
+        raise click.UsageError("--limit and --offset can only be used with --tags.")
+    if tags:
+        if not limit:
+            limit = 5
+        if not offset:
+            offset = 0
+
+    if not models:
+        raise click.UsageError("--models must be set to compare.")
+    else:
+        for model in [m for m in models.split(",") if m]:
+            if not _check_model_support(model):
+                raise click.UsageError(f"Model {model} is not supported.")
+
+    # get completions ids
+    completion_ids = []
+    if ids:
+        completion_ids = [id for id in ids.split(",") if id]
+    elif tags:
+        base_url = _log10_config.url
+        org_id = _log10_config.org_id
+        url = _get_completions_url(limit, offset, tags, None, None, base_url, org_id)
+        res = _try_get(url)
+        completions = res.json()["data"]
+        completion_ids = [completion["id"] for completion in completions]
+        if not completion_ids:
+            raise SystemExit(f"No completions found for tags: {tags}")
+
+    compare_models = [m for m in models.split(",") if m]
+
+    data = []
+    skipped_completion_ids = []
+    for id in completion_ids:
+        # get message from id
+        completion_data = _get_completion(id).json()["data"]
+
+        # skip completion if status is not finished or kind is not chat
+        if completion_data["status"] != "finished" or completion_data["kind"] != "chat":
+            rich.print(f"Skip completion {id}.
Status is not finished or kind is not chat.") + skipped_completion_ids.append(id) + continue + + original_model_request = completion_data["request"] + original_model_response = completion_data["response"] + original_model = original_model_response["model"] + benchmark_data = { + "completion_id": id, + "original_request": original_model_request, + f"{original_model} (original model)": { + "content": original_model_response["choices"][0]["message"]["content"], + "usage": original_model_response["usage"], + "duration": completion_data["duration"], + }, + } + messages = original_model_request["messages"] + compare_models_data = _compare(compare_models, messages, temperature, max_tokens, top_p) + benchmark_data.update(compare_models_data) + data.append(benchmark_data) + + prompt_analysis_data = {} + if analyze_prompt: + rich.print("Analyzing prompts") + for item in data: + completion_id = item["completion_id"] + prompt_messages = item["original_request"]["messages"] + all_messages = "\n\n".join([m["content"] for m in prompt_messages]) + analyzer = PromptAnalyzer() + suggestions = analyzer.analyze(all_messages) + prompt_analysis_data[completion_id] = suggestions + + # create an empty dataframe + all_df = pd.DataFrame( + columns=[ + "Completion ID", + "Prompt Messages", + "Model", + "Content", + "Prompt Tokens", + "Completion Tokens", + "Total Tokens", + "Duration (ms)", + ] + ) + + # + # Display or save the results + # + if not file: + # display in terminal using rich + for ret in data: + _render_comparison_table(ret) + if analyze_prompt: + completion_id = ret["completion_id"] + suggestions = prompt_analysis_data[completion_id] + rich.print(f"Prompt Analysis for completion_id: {completion_id}") + display_prompt_analyzer_suggestions(suggestions) + else: + # generate markdown report and save to file + for ret in data: + df = _create_dataframe_from_comparison_data(ret) + all_df = pd.concat([all_df, df]) + pivot_df = all_df.pivot(index="Completion ID", columns="Model", values="Content") + pivot_df["Prompt Messages"] = all_df.groupby("Completion ID")["Prompt Messages"].first() + # Reorder the columns + cols = pivot_df.columns.tolist() + cols = [cols[-1]] + cols[:-1] + pivot_df = pivot_df[cols] + + pivot_table = generate_results_table(pivot_df, section_name="model comparison") + all_results_table = generate_results_table(all_df, section_name="All Results") + + prompt_analysis_markdown = "" + if analyze_prompt: + prompt_analysis_markdown = "## Prompt Analysis\n\n" + for completion_id, suggestions in prompt_analysis_data.items(): + prompt_messages = all_df[all_df["Completion ID"] == completion_id]["Prompt Messages"].values[0] + prompt_analysis_markdown += ( + f"### Prompt Analysis for completion_id: {completion_id}\n\n{prompt_messages}\n\n" + ) + prompt_analysis_markdown += convert_suggestion_to_markdown(suggestions) + + # generate the list of skipped completions ids + skipped_completion_markdown = "" + if skipped_completion_ids: + skipped_completion_ids_str = ", ".join(skipped_completion_ids) + skipped_completion_markdown += "## Skipped Completion IDs\n\n" + skipped_completion_markdown += f"Skipped completions: {skipped_completion_ids_str}\n\n" + + generate_markdown_report( + file, [pivot_table, prompt_analysis_markdown, all_results_table, skipped_completion_markdown] + ) + rich.print(f"Report saved to {file}") diff --git a/log10/cli/feedback.py b/log10/cli/feedback.py new file mode 100644 index 00000000..f8398b77 --- /dev/null +++ b/log10/cli/feedback.py @@ -0,0 +1,129 @@ +import json + +import click 
+from rich.console import Console +from rich.table import Table +from tqdm import tqdm + +from log10.feedback.feedback import Feedback, _get_feedback_list + + +@click.command() +@click.option("--task_id", prompt="Enter task id", help="Task ID") +@click.option("--values", prompt="Enter task values", help="Feedback in JSON format") +@click.option( + "--completion_tags_selector", + prompt="Enter completion tags selector", + help="Completion tags selector", +) +@click.option("--comment", help="Comment", default="") +def create_feedback(task_id, values, completion_tags_selector, comment): + """ + Add feedback to a group of completions associated with a task + """ + click.echo("Creating feedback") + tags = completion_tags_selector.split(",") + values = json.loads(values) + feedback = Feedback().create(task_id=task_id, values=values, completion_tags_selector=tags, comment=comment) + click.echo(feedback.json()) + + +@click.command() +@click.option( + "--offset", default=0, type=int, help="The starting index from which to begin the feedback fetch. Defaults to 0." +) +@click.option( + "--limit", default=25, type=int, help="The maximum number of feedback items to retrieve. Defaults to 25." +) +@click.option( + "--task_id", + default="", + type=str, + help="The specific Task ID to filter feedback. If not provided, feedback for all tasks will be fetched.", +) +def list_feedback(offset, limit, task_id): + """ + List feedback based on the provided criteria. This command allows fetching feedback for a specific task or across all tasks, + with control over the starting point and the number of items to retrieve. + """ + feedback_data = _get_feedback_list(offset, limit, task_id) + data_for_table = [] + for feedback in feedback_data: + data_for_table.append( + { + "id": feedback["id"], + "task_name": feedback["task_name"], + "feedback": json.dumps(feedback["json_values"], ensure_ascii=False), + "matched_completion_ids": ",".join(feedback["matched_completion_ids"]), + } + ) + table = Table(title="Feedback") + table.add_column("ID") + table.add_column("Task Name") + table.add_column("Feedback") + table.add_column("Completion ID") + + for item in data_for_table: + table.add_row(item["id"], item["task_name"], item["feedback"], item["matched_completion_ids"]) + console = Console() + console.print(table) + console.print(f"Total feedback: {len(feedback_data)}") + + +@click.command() +@click.option("--id", required=True, help="Get feedback by ID") +def get_feedback(id): + """ + Get feedback based on provided ID. + """ + try: + res = Feedback().get(id) + except Exception as e: + click.echo(f"Error fetching feedback {e}") + if hasattr(e, "response") and hasattr(e.response, "json") and "error" in e.response.json(): + click.echo(e.response.json()["error"]) + return + console = Console() + feedback = json.dumps(res.json(), indent=4) + console.print_json(feedback) + + +@click.command() +@click.option( + "--offset", + default=0, + help="The starting index from which to begin the feedback fetch. Leave empty to start from the beginning.", +) +@click.option( + "--limit", default="", help="The maximum number of feedback items to retrieve. Leave empty to retrieve all." +) +@click.option( + "--task_id", + default="", + type=str, + help="The specific Task ID to filter feedback. If not provided, feedback for all tasks will be fetched.", +) +@click.option( + "--file", + "-f", + type=str, + required=False, + help="Path to the file where the feedback will be saved. The feedback data is saved in JSON Lines (jsonl) format. 
If not specified, feedback will be printed to stdout.", +) +def download_feedback(offset, limit, task_id, file): + """ + Download feedback based on the provided criteria. This command allows fetching feedback for a specific task or across all tasks, + with control over the starting point and the number of items to retrieve. + """ + feedback_data = _get_feedback_list(offset, limit, task_id) + + console = Console() + if not file: + for feedback in feedback_data: + console.print_json(json.dumps(feedback, indent=4)) + return + + with open(file, "w") as f: + console.print(f"Saving feedback to {file}") + for feedback in tqdm(feedback_data): + f.write(json.dumps(feedback) + "\n") diff --git a/log10/cli/feedback_task.py b/log10/cli/feedback_task.py new file mode 100644 index 00000000..056730d3 --- /dev/null +++ b/log10/cli/feedback_task.py @@ -0,0 +1,80 @@ +import json + +import click +from rich.console import Console +from rich.table import Table + +from log10._httpx_utils import _get_time_diff +from log10.feedback.feedback_task import FeedbackTask + + +# create a cli interface for FeebackTask.create function +@click.command() +@click.option("--name", prompt="Enter feedback task name", help="Name of the task") +@click.option("--task_schema", prompt="Enter feedback task schema", help="Task schema") +@click.option("--instruction", help="Task instruction", default="") +@click.option( + "--completion_tags_selector", + help="Completion tags selector", +) +def create_feedback_task(name, task_schema, instruction, completion_tags_selector=None): + click.echo("Creating feedback task") + tags = [] + + if completion_tags_selector: + tags = completion_tags_selector.split(",") + + task_schema = json.loads(task_schema) + task = FeedbackTask().create( + name=name, task_schema=task_schema, completion_tags_selector=tags, instruction=instruction + ) + click.echo(f"Use this task_id to add feedback: {task.json()['id']}") + + +@click.command() +@click.option("--limit", default=25, help="Number of feedback tasks to fetch") +@click.option("--offset", default=0, help="Offset for the feedback tasks") +def list_feedback_task(limit, offset): + res = FeedbackTask().list(limit=limit, offset=offset) + feedback_tasks = res.json() + + data_for_table = [] + + for task in feedback_tasks["data"]: + data_for_table.append( + { + "id": task["id"], + "created_at": _get_time_diff(task["created_at"]), + "name": task["name"], + "required": task["json_schema"]["required"], + "instruction": task["instruction"], + } + ) + + table = Table(title="Feedback Tasks") + table.add_column("ID", style="dim") + table.add_column("Created At") + table.add_column("Name") + table.add_column("Required") + table.add_column("Instruction") + for item in data_for_table: + required = ", ".join(item["required"]) if item["required"] else "" + table.add_row(item["id"], item["created_at"], item["name"], required, item["instruction"]) + + console = Console() + console.print(table) + + +@click.command() +@click.option("--id", help="Get feedback task by ID") +def get_feedback_task(id): + try: + res = FeedbackTask().get(id) + except Exception as e: + click.echo(f"Error fetching feedback task {e}") + if hasattr(e, "response") and hasattr(e.response, "json") and "error" in e.response.json(): + click.echo(e.response.json()["error"]) + return + task = json.dumps(res.json()) + console = Console() + console.print_json(task) diff --git a/log10/completions/completions.py b/log10/completions/completions.py index c0d46492..ed479dd4 100644 --- a/log10/completions/completions.py 
+++ b/log10/completions/completions.py @@ -3,16 +3,9 @@ import click import httpx -import pandas as pd -import rich -import tqdm -from rich.console import Console -from rich.table import Table - -from log10._httpx_utils import _get_time_diff, _try_get -from log10.cli_utils import generate_markdown_report, generate_results_table + +from log10._httpx_utils import _try_get from log10.llm import Log10Config -from log10.prompt_analyzer import PromptAnalyzer, convert_suggestion_to_markdown, display_prompt_analyzer_suggestions _log10_config = Log10Config() @@ -55,11 +48,11 @@ def _get_tag_ids(tags): def _get_completions_url(limit, offset, tags, from_date, to_date, base_url, org_id, printout=True): tag_ids_str = _get_tag_ids(tags) if tags else "" if tag_ids_str and printout: - rich.print(f"Filter with tags: {tags}") + print(f"Filter with tags: {tags}") date_range = _get_valid_date_range(from_date, to_date) if date_range and printout: - rich.print(f"Filter with created date: {date_range['from'][:10]} to {date_range['to'][:10]}") + print(f"Filter with created date: {date_range['from'][:10]} to {date_range['to'][:10]}") url = f"{base_url}/api/completions?organization_id={org_id}&offset={offset}&limit={limit}&tagFilter={tag_ids_str}&createdFilter={json.dumps(date_range)}&sort=created_at&desc=true&ids=" return url @@ -82,113 +75,6 @@ def _get_valid_date_range(from_date, to_date): return date_range -def _render_completions_table(completions_data, total_completions): - data_for_table = [] - for completion in completions_data: - prompt, response = "", "" - if completion.get("kind") == "completion": - prompt = completion.get("request", {}).get("prompt", "") - response_choices = completion.get("response", {}).get("choices", []) - if response_choices: - response = response_choices[0].get("text", "") - elif completion.get("kind") == "chat": - request_messages = completion.get("request", {}).get("messages", []) - prompt = request_messages[0].get("content", "") if request_messages else "" - - response_choices = completion.get("response", {}).get("choices", []) - if response_choices: - # Handle 'message' and 'function_call' within the first choice safely - first_choice = response_choices[0] - if "message" in first_choice: - response = first_choice["message"].get("content", "") - elif "function_call" in first_choice: - response = json.dumps(first_choice.get("function_call", {})) - else: - rich.print(f"Unknown completion kind: {completion['kind']} for id: {completion['id']}") - - data_for_table.append( - { - "id": completion["id"], - "status": "success" if completion["status"] == "finished" else completion["status"], - "created_at": _get_time_diff(completion["created_at"]), - "prompt": prompt, - "completion": response, - "tags": [t["name"] for t in completion["tagResolved"]], - } - ) - # render data_for_table with rich table - table = Table(show_header=True, header_style="bold magenta") - - table.add_column("ID", style="dim") - table.add_column("Status") - table.add_column("Created At") - table.add_column("Prompt", overflow="fold") - table.add_column("Completion", overflow="fold") - table.add_column("Tags", justify="right") - - max_len = 40 - for item in data_for_table: - tags = ", ".join(item["tags"]) if item["tags"] else "" - if isinstance(item["prompt"], list): - item["prompt"] = " ".join(item["prompt"]) - short_prompt = item["prompt"][:max_len] + "..." if len(item["prompt"]) > max_len else item["prompt"] - short_completion = ( - item["completion"][:max_len] + "..." 
if len(item["completion"]) > max_len else item["completion"] - ) - table.add_row(item["id"], item["status"], item["created_at"], short_prompt, short_completion, tags) - - console = Console() - console.print(table) - console.print(f"{total_completions=}") - - -@click.command() -@click.option("--limit", default=25, help="Specify the maximum number of completions to retrieve.") -@click.option("--offset", default=0, help="Set the starting point (offset) from where to begin fetching completions.") -@click.option( - "--timeout", default=10, help="Set the maximum time (in seconds) allowed for the HTTP request to complete." -) -@click.option("--tags", default="", help="Filter completions by specific tags. Separate multiple tags with commas.") -@click.option( - "--from", - "from_date", - type=click.DateTime(), - help="Define the start date for fetching completions (inclusive). Use the format: YYYY-MM-DD.", -) -@click.option( - "--to", - "to_date", - type=click.DateTime(), - help="Set the end date for fetching completions (inclusive). Use the format: YYYY-MM-DD.", -) -def list_completions(limit, offset, timeout, tags, from_date, to_date): - """ - List completions - """ - base_url = _log10_config.url - org_id = _log10_config.org_id - - url = _get_completions_url(limit, offset, tags, from_date, to_date, base_url, org_id) - # Fetch completions - res = _try_get(url, timeout) - - completions = res.json() - total_completions = completions["total"] - completions = completions["data"] - - _render_completions_table(completions, total_completions) - - -@click.command() -@click.option("--id", prompt="Enter completion id", help="Completion ID") -def get_completion(id): - """ - Get a completion by id - """ - res = _get_completion(id) - rich.print_json(json.dumps(res.json()["data"], indent=4)) - - def _write_completions(res, output_file, compact_mode): """Processes completions and appends them to the output file.""" with open(output_file, "a") as file: @@ -202,61 +88,6 @@ def _write_completions(res, output_file, compact_mode): file.write(json.dumps(completion) + "\n") -@click.command() -@click.option("--limit", default="", help="Specify the maximum number of completions to retrieve.") -@click.option("--offset", default="", help="Set the starting point (offset) from where to begin fetching completions.") -@click.option( - "--timeout", default=10, help="Set the maximum time (in seconds) allowed for the HTTP request to complete." -) -@click.option("--tags", default="", help="Filter completions by specific tags. Separate multiple tags with commas.") -@click.option( - "--from", - "from_date", - type=click.DateTime(), - help="Define the start date for fetching completions (inclusive). Use the format: YYYY-MM-DD.", -) -@click.option( - "--to", - "to_date", - type=click.DateTime(), - help="Set the end date for fetching completions (inclusive). 
Use the format: YYYY-MM-DD.", -) -@click.option("--compact", is_flag=True, help="Enable to download only the compact version of the output.") -@click.option("--file", "-f", default="completions.jsonl", help="Specify the filename and path for the output file.") -def download_completions(limit, offset, timeout, tags, from_date, to_date, compact, file): - """ - Download completions to a jsonl file - """ - base_url = _log10_config.url - org_id = _log10_config.org_id - - init_url = _get_completions_url(1, 0, tags, from_date, to_date, base_url, org_id) - res = _try_get(init_url) - if res.status_code != 200: - rich.print(f"Error: {res.json()}") - return - - total_completions = res.json()["total"] - offset = int(offset) if offset else 0 - limit = int(limit) if limit else total_completions - rich.print(f"Download total completions: {limit}/{total_completions}") - if not click.confirm("Do you want to continue?"): - return - - # dowlnoad completions - pbar = tqdm.tqdm(total=limit) - batch_size = 10 - end = offset + limit if offset + limit < total_completions else total_completions - for batch in range(offset, end, batch_size): - current_batch_size = batch_size if batch + batch_size < end else end - batch - download_url = _get_completions_url( - current_batch_size, batch, tags, from_date, to_date, base_url, org_id, printout=False - ) - res = _try_get(download_url, timeout) - _write_completions(res, file, compact) - pbar.update(current_batch_size) - - def _get_llm_repsone( model: str, messages: list[dict], @@ -317,75 +148,11 @@ def _get_llm_repsone( return ret -def _render_comparison_table(model_response_raw_data): - rich.print(f"completion_id: {model_response_raw_data['completion_id']}") - rich.print("original_request:") - rich.print_json(json.dumps(model_response_raw_data["original_request"], indent=4)) - - table = rich.table.Table(show_header=True, header_style="bold magenta", box=rich.box.ROUNDED, show_lines=True) - table.add_column("Model") - table.add_column("Content") - table.add_column("Total Token Usage (Input/Output)") - table.add_column("Duration (ms)") - - for model, data in model_response_raw_data.items(): - # only display model data - if model not in ["completion_id", "original_request"]: - usage = data["usage"] - formatted_usage = f"{usage['total_tokens']} ({usage['prompt_tokens']}/{usage['completion_tokens']})" - table.add_row(model, data["content"], formatted_usage, str(data["duration"])) - rich.print(table) - - -def _create_dataframe_from_comparison_data(model_response_raw_data): - completion_id = model_response_raw_data["completion_id"] - original_request = model_response_raw_data["original_request"] - rows = [] - for model, model_data in model_response_raw_data.items(): - # only display model data - if model not in ["completion_id", "original_request"]: - content = model_data["content"] - usage = model_data["usage"] - prompt_tokens = usage["prompt_tokens"] - completion_tokens = usage["completion_tokens"] - total_tokens = usage["total_tokens"] - duration = model_data["duration"] - prompt_messages = json.dumps(original_request["messages"]) - rows.append( - [ - completion_id, - prompt_messages, - model, - content, - prompt_tokens, - completion_tokens, - total_tokens, - duration, - ] - ) - - df = pd.DataFrame( - rows, - columns=[ - "Completion ID", - "Prompt Messages", - "Model", - "Content", - "Prompt Tokens", - "Completion Tokens", - "Total Tokens", - "Duration (ms)", - ], - ) - - return df - - def _compare(models: list[str], messages: dict, temperature: float = 0.2, max_tokens: float 
= 256, top_p: float = 1.0): ret = {} if models: for model in models: - rich.print(f"Running {model}") + print(f"Running {model}") response = _get_llm_repsone( model, messages, @@ -434,157 +201,3 @@ def _compare(models: list[str], messages: dict, temperature: float = 0.2, max_to def _check_model_support(model: str) -> bool: return model in _SUPPORTED_MODELS - - -@click.command() -@click.option("--ids", default="", help="Completion IDs. Separate multiple ids with commas.") -@click.option("--tags", default="", help="Filter completions by specific tags. Separate multiple tags with commas.") -@click.option("--limit", help="Specify the maximum number of completions to retrieve filtered by tags.") -@click.option( - "--offset", help="Set the starting point (offset) from where to begin fetching completions filtered by tags." -) -@click.option("--models", default="", help="Comma separated list of models to compare") -@click.option("--temperature", default=0.2, help="Temperature") -@click.option("--max_tokens", default=512, help="Max tokens") -@click.option("--top_p", default=1.0, help="Top p") -@click.option("--analyze_prompt", is_flag=True, help="Run prompt analyzer on the messages.") -@click.option("--file", "-f", help="Specify the filename for the report in markdown format.") -def benchmark_models(ids, tags, limit, offset, models, temperature, max_tokens, top_p, file, analyze_prompt): - """ - Compare completions using different models and generate report - """ - if ids and tags: - raise click.UsageError("--ids and --tags cannot be set together.") - if (limit or offset) and not tags: - raise click.UsageError("--limit and --offset can only be used with --tags.") - if tags: - if not limit: - limit = 5 - if not offset: - offset = 0 - - if not models: - raise click.UsageError("--models must be set to compare.") - else: - for model in [m for m in models.split(",") if m]: - if not _check_model_support(model): - raise click.UsageError(f"Model {model} is not supported.") - - # get completions ids - completion_ids = [] - if ids: - completion_ids = [id for id in ids.split(",") if id] - elif tags: - base_url = _log10_config.url - org_id = _log10_config.org_id - url = _get_completions_url(limit, offset, tags, None, None, base_url, org_id) - res = _try_get(url) - completions = res.json()["data"] - completion_ids = [completion["id"] for completion in completions] - if not completion_ids: - SystemExit(f"No completions found for tags: {tags}") - - compare_models = [m for m in models.split(",") if m] - - data = [] - skipped_completion_ids = [] - for id in completion_ids: - # get message from id - completion_data = _get_completion(id).json()["data"] - - # skip completion if status is not finished or kind is not chat - if completion_data["status"] != "finished" or completion_data["kind"] != "chat": - rich.print(f"Skip completion {id}. 
Status is not finished or kind is not chat.") - skipped_completion_ids.append(id) - continue - - original_model_request = completion_data["request"] - original_model_response = completion_data["response"] - original_model = original_model_response["model"] - benchmark_data = { - "completion_id": id, - "original_request": original_model_request, - f"{original_model} (original model)": { - "content": original_model_response["choices"][0]["message"]["content"], - "usage": original_model_response["usage"], - "duration": completion_data["duration"], - }, - } - messages = original_model_request["messages"] - compare_models_data = _compare(compare_models, messages, temperature, max_tokens, top_p) - benchmark_data.update(compare_models_data) - data.append(benchmark_data) - - prompt_analysis_data = {} - if analyze_prompt: - rich.print("Analyzing prompts") - for item in data: - completion_id = item["completion_id"] - prompt_messages = item["original_request"]["messages"] - all_messages = "\n\n".join([m["content"] for m in prompt_messages]) - analyzer = PromptAnalyzer() - suggestions = analyzer.analyze(all_messages) - prompt_analysis_data[completion_id] = suggestions - - # create an empty dataframe - all_df = pd.DataFrame( - columns=[ - "Completion ID", - "Prompt Messages", - "Model", - "Content", - "Prompt Tokens", - "Completion Tokens", - "Total Tokens", - "Duration (ms)", - ] - ) - - # - # Display or save the results - # - if not file: - # display in terminal using rich - for ret in data: - _render_comparison_table(ret) - if analyze_prompt: - completion_id = ret["completion_id"] - suggestions = prompt_analysis_data[completion_id] - rich.print(f"Prompt Analysis for completion_id: {completion_id}") - display_prompt_analyzer_suggestions(suggestions) - else: - # generate markdown report and save to file - for ret in data: - df = _create_dataframe_from_comparison_data(ret) - all_df = pd.concat([all_df, df]) - pivot_df = all_df.pivot(index="Completion ID", columns="Model", values="Content") - pivot_df["Prompt Messages"] = all_df.groupby("Completion ID")["Prompt Messages"].first() - # Reorder the columns - cols = pivot_df.columns.tolist() - cols = [cols[-1]] + cols[:-1] - pivot_df = pivot_df[cols] - - pivot_table = generate_results_table(pivot_df, section_name="model comparison") - all_results_table = generate_results_table(all_df, section_name="All Results") - - prompt_analysis_markdown = "" - if analyze_prompt: - prompt_analysis_markdown = "## Prompt Analysis\n\n" - for completion_id, suggestions in prompt_analysis_data.items(): - prompt_messages = all_df[all_df["Completion ID"] == completion_id]["Prompt Messages"].values[0] - prompt_analysis_markdown += ( - f"### Prompt Analysis for completion_id: {completion_id}\n\n{prompt_messages}\n\n" - ) - prompt_analysis_markdown += convert_suggestion_to_markdown(suggestions) - - # generate the list of skipped completions ids - skipped_completion_markdown = "" - if skipped_completion_ids: - skipped_completion_ids_str = ", ".join(skipped_completion_ids) - skipped_completion_markdown += "## Skipped Completion IDs\n\n" - skipped_completion_markdown += f"Skipped completions: {skipped_completion_ids_str}\n\n" - - generate_markdown_report( - file, [pivot_table, prompt_analysis_markdown, all_results_table, skipped_completion_markdown] - ) - rich.print(f"Report saved to {file}") diff --git a/log10/feedback/autofeedback.py b/log10/feedback/autofeedback.py index e435bd13..d6e0e353 100644 --- a/log10/feedback/autofeedback.py +++ b/log10/feedback/autofeedback.py @@ -3,10 
+3,7 @@ import random from types import FunctionType -import click import httpx -import rich -from rich.console import Console from log10._httpx_utils import _try_post_graphql_request from log10.completions.completions import _get_completion @@ -130,43 +127,3 @@ def get_autofeedback(completion_id: str) -> httpx.Response: return response.json() else: response.raise_for_status() - - -@click.command() -@click.option("--task_id", help="Feedback task ID") -@click.option("--content", help="Completion content") -@click.option("--file", "-f", help="File containing completion content") -@click.option("--completion_id", help="Completion ID") -@click.option("--num_samples", default=5, help="Number of samples to use for few-shot learning") -def auto_feedback_icl(task_id: str, content: str, file: str, completion_id: str, num_samples: int): - """ - Generate feedback with existing human feedback based on in context learning - """ - options_count = sum([1 for option in [content, file, completion_id] if option]) - if options_count > 1: - click.echo("Only one of --content, --file, or --completion_id should be provided.") - return - - console = Console() - auto_feedback_icl = AutoFeedbackICL(task_id, num_samples=num_samples) - if completion_id: - results = auto_feedback_icl.predict(completion_id=completion_id) - console.print_json(results) - return - - if file: - with open(file, "r") as f: - content = f.read() - results = auto_feedback_icl.predict(text=content) - console.print_json(results) - - -@click.command() -@click.option("--completion-id", required=True, help="Completion ID") -def get_autofeedback_cli(completion_id: str): - """ - Get an auto feedback by completion id - """ - res = get_autofeedback(completion_id) - if res: - rich.print_json(json.dumps(res["data"], indent=4)) diff --git a/log10/feedback/feedback.py b/log10/feedback/feedback.py index 08c15c90..1f2b557e 100644 --- a/log10/feedback/feedback.py +++ b/log10/feedback/feedback.py @@ -1,11 +1,6 @@ -import json import logging -import click import httpx -from rich.console import Console -from rich.table import Table -from tqdm import tqdm from log10._httpx_utils import _try_get from log10.llm import Log10Config @@ -86,26 +81,6 @@ def get(self, id: str) -> httpx.Response: return res -@click.command() -@click.option("--task_id", prompt="Enter task id", help="Task ID") -@click.option("--values", prompt="Enter task values", help="Feedback in JSON format") -@click.option( - "--completion_tags_selector", - prompt="Enter completion tags selector", - help="Completion tags selector", -) -@click.option("--comment", help="Comment", default="") -def create_feedback(task_id, values, completion_tags_selector, comment): - """ - Add feedback to a group of completions associated with a task - """ - click.echo("Creating feedback") - tags = completion_tags_selector.split(",") - values = json.loads(values) - feedback = Feedback().create(task_id=task_id, values=values, completion_tags_selector=tags, comment=comment) - click.echo(feedback.json()) - - def _get_feedback_list(offset, limit, task_id): total_fetched = 0 feedback_data = [] @@ -130,110 +105,9 @@ def _get_feedback_list(offset, limit, task_id): if total_fetched >= limit or total_fetched >= total_feedback: break except Exception as e: - click.echo(f"Error fetching feedback {e}") + logger.error(f"Error fetching feedback {e}") if hasattr(e, "response") and hasattr(e.response, "json") and "error" in e.response.json(): - click.echo(e.response.json()["error"]) + logger.error(e.response.json()["error"]) return [] 
return feedback_data - - -@click.command() -@click.option( - "--offset", default=0, type=int, help="The starting index from which to begin the feedback fetch. Defaults to 0." -) -@click.option( - "--limit", default=25, type=int, help="The maximum number of feedback items to retrieve. Defaults to 25." -) -@click.option( - "--task_id", - default="", - type=str, - help="The specific Task ID to filter feedback. If not provided, feedback for all tasks will be fetched.", -) -def list_feedback(offset, limit, task_id): - """ - List feedback based on the provided criteria. This command allows fetching feedback for a specific task or across all tasks, - with control over the starting point and the number of items to retrieve. - """ - feedback_data = _get_feedback_list(offset, limit, task_id) - data_for_table = [] - for feedback in feedback_data: - data_for_table.append( - { - "id": feedback["id"], - "task_name": feedback["task_name"], - "feedback": json.dumps(feedback["json_values"], ensure_ascii=False), - "matched_completion_ids": ",".join(feedback["matched_completion_ids"]), - } - ) - table = Table(title="Feedback") - table.add_column("ID") - table.add_column("Task Name") - table.add_column("Feedback") - table.add_column("Completion ID") - - for item in data_for_table: - table.add_row(item["id"], item["task_name"], item["feedback"], item["matched_completion_ids"]) - console = Console() - console.print(table) - console.print(f"Total feedback: {len(feedback_data)}") - - -@click.command() -@click.option("--id", required=True, help="Get feedback by ID") -def get_feedback(id): - """ - Get feedback based on provided ID. - """ - try: - res = Feedback().get(id) - except Exception as e: - click.echo(f"Error fetching feedback {e}") - if hasattr(e, "response") and hasattr(e.response, "json") and "error" in e.response.json(): - click.echo(e.response.json()["error"]) - return - console = Console() - feedback = json.dumps(res.json(), indent=4) - console.print_json(feedback) - - -@click.command() -@click.option( - "--offset", - default=0, - help="The starting index from which to begin the feedback fetch. Leave empty to start from the beginning.", -) -@click.option( - "--limit", default="", help="The maximum number of feedback items to retrieve. Leave empty to retrieve all." -) -@click.option( - "--task_id", - default="", - type=str, - help="The specific Task ID to filter feedback. If not provided, feedback for all tasks will be fetched.", -) -@click.option( - "--file", - "-f", - type=str, - required=False, - help="Path to the file where the feedback will be saved. The feedback data is saved in JSON Lines (jsonl) format. If not specified, feedback will be printed to stdout.", -) -def download_feedback(offset, limit, task_id, file): - """ - Download feedback based on the provided criteria. This command allows fetching feedback for a specific task or across all tasks, - with control over the starting point and the number of items to retrieve. 
- """ - feedback_data = _get_feedback_list(offset, limit, task_id) - - console = Console() - if not file: - for feedback in feedback_data: - console.print_json(json.dumps(feedback, indent=4)) - return - - with open(file, "w") as f: - console.print(f"Saving feedback to {file}") - for feedback in tqdm(feedback_data): - f.write(json.dumps(feedback) + "\n") diff --git a/log10/feedback/feedback_task.py b/log10/feedback/feedback_task.py index 53c91049..9c75c3b8 100644 --- a/log10/feedback/feedback_task.py +++ b/log10/feedback/feedback_task.py @@ -1,13 +1,9 @@ -import json import logging -import click import httpx from dotenv import load_dotenv -from rich.console import Console -from rich.table import Table -from log10._httpx_utils import _get_time_diff, _try_get +from log10._httpx_utils import _try_get from log10.llm import Log10Config @@ -74,75 +70,3 @@ def get(self, id: str) -> httpx.Response: if res.status_code != 200: raise Exception(f"Error fetching feedback task {res.json()}") return res - - -# create a cli interface for FeebackTask.create function -@click.command() -@click.option("--name", prompt="Enter feedback task name", help="Name of the task") -@click.option("--task_schema", prompt="Enter feedback task schema", help="Task schema") -@click.option("--instruction", help="Task instruction", default="") -@click.option( - "--completion_tags_selector", - help="Completion tags selector", -) -def create_feedback_task(name, task_schema, instruction, completion_tags_selector=None): - click.echo("Creating feedback task") - tags = [] - - if completion_tags_selector: - tags = completion_tags_selector.split(",") - - task_schema = json.loads(task_schema) - task = FeedbackTask().create( - name=name, task_schema=task_schema, completion_tags_selector=tags, instruction=instruction - ) - click.echo(f"Use this task_id to add feedback: {task.json()['id']}") - - -@click.command() -@click.option("--limit", default=25, help="Number of feedback tasks to fetch") -@click.option("--offset", default=0, help="Offset for the feedback tasks") -def list_feedback_task(limit, offset): - res = FeedbackTask().list(limit=limit, offset=offset) - feedback_tasks = res.json() - - data_for_table = [] - - for task in feedback_tasks["data"]: - data_for_table.append( - { - "id": task["id"], - "created_at": _get_time_diff(task["created_at"]), - "name": task["name"], - "required": task["json_schema"]["required"], - "instruction": task["instruction"], - } - ) - - table = Table(title="Feedback Tasks") - table.add_column("ID", style="dim") - table.add_column("Created At") - table.add_column("Name") - table.add_column("Required") - table.add_column("Instruction") - for item in data_for_table: - required = ", ".join(item["required"]) if item["required"] else "" - table.add_row(item["id"], item["created_at"], item["name"], required, item["instruction"]) - - console = Console() - console.print(table) - - -@click.command() -@click.option("--id", help="Get feedback task by ID") -def get_feedback_task(id): - try: - res = FeedbackTask().get(id) - except Exception as e: - click.echo(f"Error fetching feedback task {e}") - if hasattr(e, "response") and hasattr(e.response, "json") and "error" in e.response.json(): - click.echo(e.response.json()["error"]) - return - task = json.dumps(res.json()) - console = Console() - console.print_json(task) diff --git a/poetry.lock b/poetry.lock index 474aaa2a..be74e05b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2178,7 +2178,7 @@ files = [ name = "numpy" version = "1.26.4" description = "Fundamental package 
for array computing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, @@ -2317,7 +2317,7 @@ files = [ name = "pandas" version = "2.2.2" description = "Powerful data structures for data analysis, time series, and statistics" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, @@ -2701,7 +2701,7 @@ testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] name = "python-dateutil" version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" -optional = false +optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, @@ -2729,7 +2729,7 @@ cli = ["click (>=5.0)"] name = "pytz" version = "2024.1" description = "World timezone definitions, modern and historical" -optional = false +optional = true python-versions = "*" files = [ {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, @@ -3222,7 +3222,7 @@ test = ["pytest", "pytest-cov"] name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -optional = false +optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, @@ -3659,7 +3659,7 @@ typing-extensions = ">=3.7.4" name = "tzdata" version = "2024.1" description = "Provider of IANA time zone data" -optional = false +optional = true python-versions = ">=2" files = [ {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, @@ -3915,6 +3915,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] autofeedback-icl = ["magentic"] +cli = ["click", "pandas", "rich", "tabulate"] gemini = ["google-cloud-aiplatform"] google-generativeai = ["google-generativeai"] lamini = ["lamini"] @@ -3927,4 +3928,4 @@ together = ["together"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "2190ee6487f544438498170965a7719667b908a07ec2e43aaa21e7b9a34e53ad" +content-hash = "5dc9a83d0bf79d0fd7c4ee631af6381db64820612e76907ea635b9ee3b69d6ac" diff --git a/pyproject.toml b/pyproject.toml index 435be36c..a024fc55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,6 @@ anthropic = "<1" requests = "^2.31.0" python-dotenv = "^1.0.0" backoff = "^2.2.1" -pandas = ">=2" langchain = {version = "<0.2.0", optional = true} magentic = {version = ">=0.17.0", optional = true, markers = "python_version >= '3.10'"} litellm = {version = "^1.34.18", optional = true} @@ -52,6 +51,10 @@ together = {version = "^0.2.7", optional = true} mosaicml-cli = {version = "^0.5.30", optional = true} google-cloud-bigquery = {version = "^3.11.4", optional = true} google-generativeai = {version = "^0.6.0", optional = true} +click = {version = "^8.1.7", optional = true} +rich = {version = "^13.7.1", optional = true} +tabulate = {version = "^0.9.0", optional = true} +pandas = {version = ">=2", optional = true} [tool.poetry.extras] autofeedback_icl = ["magentic"] @@ -63,6 
+66,7 @@ together = ["together"] mosaicml = ["mosaicml-cli"] google-generativeai = ["google-generativeai"] lamini = ["lamini"] +cli = ["click", "rich", "tabulate", "pandas"] [tool.ruff] # Never enforce `E501` (line length violations). diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 00000000..3758ead8 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,75 @@ +import pytest +from click.testing import CliRunner + +from log10.cli.cli_commands import cli + + +completion_id = "fe3c10f0-df31-4a42-b224-233adfe1eb7f" +feedback_id = "58b8d9b7-1d6a-4b7d-952e-bc97a649dc94" +feedback_task_id = "890bda39-2232-4cde-ba95-7c501afc4b95" + + +@pytest.fixture +def runner(): + return CliRunner() + + +def test_list_completions(runner): + result = runner.invoke(cli, ["completions", "list"]) + print(result.output) + assert result.exit_code == 0 + assert "total_completions=" in result.output + + +def test_get_completion(runner): + result = runner.invoke(cli, ["completions", "get", "--id", completion_id]) + assert result.exit_code == 0 + assert completion_id in result.output + + +def test_download_completions(runner): + result = runner.invoke(cli, ["completions", "download", "--limit", "1", "--tags", "log10/summary-grading"]) + assert result.exit_code == 0 + assert "Download total completions: 1/" in result.output + + +def test_benchmark_models(runner): + tag = "test_tag_c" + model = "gpt-3.5-turbo" + result = runner.invoke(cli, ["completions", "benchmark_models", "--models", model, "--limit", "1", "--tags", tag]) + assert result.exit_code == 0 + assert f"Filter with tags: {tag}" in result.output + assert f"Running {model}" in result.output + + +def test_list_feedback(runner): + result = runner.invoke(cli, ["feedback", "list"]) + assert result.exit_code == 0 + assert "Total feedback:" in result.output + + +def test_get_feedback(runner): + result = runner.invoke(cli, ["feedback", "get", "--id", feedback_id]) + assert result.exit_code == 0 + assert feedback_id in result.output + + +def test_download_feedback(runner): + result = runner.invoke(cli, ["feedback", "download", "--limit", "1"]) + assert result.exit_code == 0 + + +def test_get_autofeedback(runner): + result = runner.invoke(cli, ["feedback", "autofeedback", "get", "--completion-id", completion_id]) + assert result.exit_code == 0 + assert completion_id in result.output + + +def test_list_feedback_task(runner): + result = runner.invoke(cli, ["feedback-task", "list"]) + assert result.exit_code == 0 + + +def test_get_feedback_task(runner): + result = runner.invoke(cli, ["feedback-task", "get", "--id", feedback_task_id]) + assert result.exit_code == 0
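The relocated commands are still wired into a single `cli` group (see `log10/cli/cli_commands.py` above), which is what both `python -m log10` and the new tests drive. A minimal sketch of exercising that group directly with Click's `CliRunner`, assuming the optional extras are installed via `pip install 'log10-io[cli]'` as the ImportError message above suggests; the completion id is simply the placeholder value reused from `tests/test_cli.py`:

from click.testing import CliRunner

from log10.cli.cli_commands import cli

runner = CliRunner()

# List the 5 most recent completions (same as `python -m log10 completions list --limit 5`).
result = runner.invoke(cli, ["completions", "list", "--limit", "5"])
print(result.output)

# Fetch one completion by id; exit_code 0 means the command ran cleanly.
result = runner.invoke(cli, ["completions", "get", "--id", "fe3c10f0-df31-4a42-b224-233adfe1eb7f"])
assert result.exit_code == 0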