From b666f79a45a8f334430f8ff6b36c463bb8909f9d Mon Sep 17 00:00:00 2001 From: ilai Date: Mon, 23 Oct 2023 12:44:55 +0300 Subject: [PATCH 1/5] [cli] Added option not to stop on the first error during CLI upsert Now that we're upserting in batches anyway, it makes more sense to allow the user to ignore errors and continue to upsert other documents --- src/resin_cli/cli.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/resin_cli/cli.py b/src/resin_cli/cli.py index e5bda7f7..14f9aa6a 100644 --- a/src/resin_cli/cli.py +++ b/src/resin_cli/cli.py @@ -168,8 +168,11 @@ def new(index_name): help="The name of the index to upload the data to. " "Inferred from INDEX_NAME env var if not provided." ) -@click.option("--batch-size", default=10, help="Batch size for upsert") -def upsert(index_name, data_path, batch_size): +@click.option("--batch-size", default=10, + help="Number of documents to upload in each batch. Defaults to 10.") +@click.option("--stop-on-error/--no-stop-on-error", default=True, + help="Whether to stop when the first error arises. Defaults to True.") +def upsert(index_name: str, data_path: str, batch_size: int, stop_on_error: bool): if index_name is None: msg = ( "No index name provided. Please set --index-name or INDEX_NAME environment " @@ -224,18 +227,30 @@ def upsert(index_name, data_path, batch_size): abort=True) pbar = tqdm(total=len(data), desc="Upserting documents") + failed_docs = [] for i in range(0, len(data), batch_size): batch = data[i:i + batch_size] try: kb.upsert(data) except Exception as e: - msg = ( - f"Failed to upsert data to index {kb.index_name}. Underlying error: {e}" - ) - raise CLIError(msg) + if stop_on_error: + msg = ( + f"Failed to upsert data to index {kb.index_name}. " + f"Underlying error: {e}" + ) + raise CLIError(msg) + else: + failed_docs.extend([_.id for _ in batch]) pbar.update(len(batch)) + if failed_docs: + msg = ( + f"Failed to upsert the following documents to index {kb.index_name}: " + f"{failed_docs}" + ) + raise CLIError(msg) + click.echo(click.style("Success!", fg="green")) From 928373fd2e6d6663f4f0c19ecac19eec49dd7d44 Mon Sep 17 00:00:00 2001 From: ilai Date: Mon, 23 Oct 2023 12:49:19 +0300 Subject: [PATCH 2/5] [cli] Upsert 'stop-on-error' - added tolerance of 10% of the data If more than 10% of the data has failed - stop trying to upsert more documents. --- src/resin_cli/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/resin_cli/cli.py b/src/resin_cli/cli.py index 14f9aa6a..a29b48c0 100644 --- a/src/resin_cli/cli.py +++ b/src/resin_cli/cli.py @@ -233,7 +233,7 @@ def upsert(index_name: str, data_path: str, batch_size: int, stop_on_error: bool try: kb.upsert(data) except Exception as e: - if stop_on_error: + if stop_on_error or len(failed_docs) > len(data) // 10: msg = ( f"Failed to upsert data to index {kb.index_name}. " f"Underlying error: {e}" From 15f81b055fa07804b20f330d8fc16343453c9bb6 Mon Sep 17 00:00:00 2001 From: ilai Date: Tue, 24 Oct 2023 10:07:02 +0300 Subject: [PATCH 3/5] [cli] Improve stop-on-error mechanism - Changed the name to "allow_failures", default false. - Print out the first encountered error --- src/resin_cli/cli.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/resin_cli/cli.py b/src/resin_cli/cli.py index 0d64ab34..4386c825 100644 --- a/src/resin_cli/cli.py +++ b/src/resin_cli/cli.py @@ -170,9 +170,12 @@ def new(index_name): ) @click.option("--batch-size", default=10, help="Number of documents to upload in each batch. Defaults to 10.") -@click.option("--stop-on-error/--no-stop-on-error", default=True, - help="Whether to stop when the first error arises. Defaults to True.") -def upsert(index_name: str, data_path: str, batch_size: int, stop_on_error: bool): +@click.option("--allow-failures/--dont-allow-failures", default=False, + help="On default, the upsert process will stop if any document fails to " + "be uploaded. " + "When set to True, the upsert process will continue on failure, as " + "long as less than 10% of the documents have failed to be uploaded.") +def upsert(index_name: str, data_path: str, batch_size: int, allow_failures: bool): if index_name is None: msg = ( "No index name provided. Please set --index-name or INDEX_NAME environment " @@ -228,26 +231,29 @@ def upsert(index_name: str, data_path: str, batch_size: int, stop_on_error: bool pbar = tqdm(total=len(data), desc="Upserting documents") failed_docs = [] + first_error = None for i in range(0, len(data), batch_size): batch = data[i:i + batch_size] try: kb.upsert(data) except Exception as e: - if stop_on_error or len(failed_docs) > len(data) // 10: + if allow_failures and len(failed_docs) < len(data) // 10: + failed_docs.extend([_.id for _ in batch]) + if first_error is None: + first_error = str(e) + else: msg = ( f"Failed to upsert data to index {kb.index_name}. " f"Underlying error: {e}" ) raise CLIError(msg) - else: - failed_docs.extend([_.id for _ in batch]) pbar.update(len(batch)) if failed_docs: msg = ( f"Failed to upsert the following documents to index {kb.index_name}: " - f"{failed_docs}" + f"{failed_docs}. The first encountered error was: {first_error}" ) raise CLIError(msg) From 0e83e428ed9af0e81d3e4bdbf8a2541d204526a4 Mon Sep 17 00:00:00 2001 From: ilai Date: Tue, 24 Oct 2023 12:04:22 +0300 Subject: [PATCH 4/5] [cli] Improve error message Tell the user they can use --allow-failures --- src/resin_cli/cli.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/resin_cli/cli.py b/src/resin_cli/cli.py index 4386c825..5248ce71 100644 --- a/src/resin_cli/cli.py +++ b/src/resin_cli/cli.py @@ -1,4 +1,5 @@ import os +from typing import List, Optional import click import time @@ -230,8 +231,8 @@ def upsert(index_name: str, data_path: str, batch_size: int, allow_failures: boo abort=True) pbar = tqdm(total=len(data), desc="Upserting documents") - failed_docs = [] - first_error = None + failed_docs: List[str] = [] + first_error: Optional[str] = None for i in range(0, len(data), batch_size): batch = data[i:i + batch_size] try: @@ -244,7 +245,8 @@ def upsert(index_name: str, data_path: str, batch_size: int, allow_failures: boo else: msg = ( f"Failed to upsert data to index {kb.index_name}. " - f"Underlying error: {e}" + f"Underlying error: {e}\n" + f"You can allow partial failures by setting --allow-failures. " ) raise CLIError(msg) From f685c79e9dbebc2ff2e78e9905b0989ee16b1e92 Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Mon, 23 Oct 2023 16:54:59 +0300 Subject: [PATCH 5/5] resolve mypy --- src/resin_cli/cli.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/resin_cli/cli.py b/src/resin_cli/cli.py index 5248ce71..5a8b208b 100644 --- a/src/resin_cli/cli.py +++ b/src/resin_cli/cli.py @@ -52,7 +52,10 @@ def check_service_health(url: str): raise CLIError(msg) except requests.exceptions.HTTPError as e: - error = e.response.json().get("detail", None) or e.response.text + if e.response is not None: + error = e.response.json().get("detail", None) or e.response.text + else: + error = str(e) msg = ( f"Resin service on {url} is not healthy, failed with error: {error}" )