
Commit 652c477: small pr comments
acatav committed Oct 15, 2023
1 parent ef42e1c commit 652c477
Showing 3 changed files with 29 additions and 31 deletions.
7 changes: 0 additions & 7 deletions resin/knoweldge_base/knowledge_base.py
@@ -221,13 +221,6 @@ def _get_full_index_name(index_name: str) -> str:
         else:
             return INDEX_NAME_PREFIX + index_name

-    @staticmethod
-    def _df_to_documents(df: pd.DataFrame) -> List[Document]:
-        documents = [
-            Document(**row) for row in df.to_dict(orient="records")  # type: ignore
-        ]
-        return documents
-
     @property
     def index_name(self) -> str:
         return self._index_name
9 changes: 5 additions & 4 deletions resin_cli/cli.py
@@ -123,14 +123,15 @@ def new(index_name, tokenizer_model):
 @click.option("--tokenizer-model", default="gpt-3.5-turbo", help="Tokenizer model")
 def upsert(index_name, data_path, tokenizer_model):
     if index_name is None:
-        msg = "Index name is not provided, please provide it with" + \
-            ' --index-name or set it with env var `export INDEX_NAME="MY_INDEX_NAME`'
+        msg = ("Index name is not provided, please provide it with" +
+               ' --index-name or set it with env var '
+               '`export INDEX_NAME="MY_INDEX_NAME"`')
         click.echo(click.style(msg, fg="red"), err=True)
         sys.exit(1)
     Tokenizer.initialize(OpenAITokenizer, tokenizer_model)
     if data_path is None:
-        msg = "Data path is not provided," + \
-            " please provide it with --data-path or set it with env var"
+        msg = ("Data path is not provided," +
+               " please provide it with --data-path or set it with env var")
         click.echo(click.style(msg, fg="red"), err=True)
         sys.exit(1)
     click.echo("Resin is going to upsert data from ", nl=False)
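The replacement swaps backslash continuations for parenthesized literals, leaning on Python's implicit concatenation of adjacent string literals. A minimal standalone sketch of the idiom (the message text here is illustrative only):

# Adjacent string literals inside parentheses are joined at compile
# time, so long messages split cleanly with no trailing "\" or "+".
msg = (
    "Data path is not provided,"
    " please provide it with --data-path or set it with an env var"
)
assert "\n" not in msg  # still a single one-line string
print(msg)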
44 changes: 24 additions & 20 deletions resin_cli/data_loader/data_loader.py
@@ -20,6 +20,24 @@ def __init__(self, message):
         super().__init__(message)


+def _process_metadata(value):
+    if pd.isna(value):
+        return {}
+
+    if isinstance(value, str):
+        try:
+            value = json.loads(value)
+        except json.JSONDecodeError as e:
+            raise DocumentsValidationError(
+                f"Metadata must be a valid json string. Error: {e}"
+            ) from e
+
+    if not isinstance(value, dict):
+        raise DocumentsValidationError("Metadata must be a dict or json string")
+
+    return {k: v for k, v in value.items() if not pd.isna(v)}
+
+
 def _df_to_documents(df: pd.DataFrame) -> List[Document]:
     if not isinstance(df, pd.DataFrame):
         raise ValueError("Dataframe must be a pandas DataFrame")
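A quick sketch of the new helper's contract, assuming `_process_metadata` and `DocumentsValidationError` are importable from this module (the sample values are made up):

from resin_cli.data_loader.data_loader import (
    DocumentsValidationError,
    _process_metadata,
)

print(_process_metadata(float("nan")))         # {} - missing cell -> empty dict
print(_process_metadata('{"source": "web"}'))  # {'source': 'web'} - JSON parsed
print(_process_metadata({"a": 1, "b": float("nan")}))  # {'a': 1} - NaN values dropped

try:
    _process_metadata("not json")
except DocumentsValidationError as e:
    print(e)  # an invalid JSON string is rejected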
@@ -29,26 +47,12 @@ def _df_to_documents(df: pd.DataFrame) -> List[Document]:
         raise IDsNotUniqueError("IDs must be unique")

     try:
-        documents: List[Document] = []
-        for row in df.to_dict(orient="records"):
-            if "metadata" in row:
-                if pd.isna(row["metadata"]):
-                    row["metadata"] = {}
-                elif type(row["metadata"]) is str:
-                    try:
-                        row["metadata"] = json.loads(row["metadata"])
-                    except json.JSONDecodeError as e:
-                        raise DocumentsValidationError(
-                            f"Metadata must be a valid json string. Error: {e}"
-                        ) from e
-                elif type(row["metadata"]) is not dict:
-                    raise DocumentsValidationError(
-                        "Metadata must be a dict or json string"
-                    )
-                row["metadata"] = {k: v for k, v in row["metadata"].items()
-                                   if not pd.isna(v)}
-            row = {k: v for k, v in row.items() if not pd.isna(v)}
-            documents.append(Document(**row))  # type: ignore
+        if "metadata" in df.columns:
+            df.loc[:, "metadata"] = df["metadata"].apply(_process_metadata)
+        documents = [
+            Document(**{k: v for k, v in row._asdict().items() if not pd.isna(v)})
+            for row in df.itertuples(index=False)
+        ]
     except ValidationError as e:
         raise DocumentsValidationError("Documents failed validation") from e
     except ValueError as e:
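For illustration, a possible round trip through the refactored `_df_to_documents`. The column names assume the repo's Document model exposes `id`, `text`, and `metadata` fields; treat this as a sketch under that assumption, not the project's documented API:

import pandas as pd

from resin_cli.data_loader.data_loader import _df_to_documents

df = pd.DataFrame([
    {"id": "doc1", "text": "hello", "metadata": '{"source": "web"}'},
    {"id": "doc2", "text": "world", "metadata": None},  # NaN-like -> {}
])

docs = _df_to_documents(df)
print(docs[0].metadata)  # {'source': 'web'}
print(docs[1].metadata)  # {}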
