
Commit 652c477: small pr comments
acatav committed Oct 15, 2023
1 parent ef42e1c commit 652c477
Showing 3 changed files with 29 additions and 31 deletions.
7 changes: 0 additions & 7 deletions resin/knoweldge_base/knowledge_base.py
@@ -221,13 +221,6 @@ def _get_full_index_name(index_name: str) -> str:
         else:
             return INDEX_NAME_PREFIX + index_name

-    @staticmethod
-    def _df_to_documents(df: pd.DataFrame) -> List[Document]:
-        documents = [
-            Document(**row) for row in df.to_dict(orient="records")  # type: ignore
-        ]
-        return documents
-
     @property
     def index_name(self) -> str:
         return self._index_name
9 changes: 5 additions & 4 deletions resin_cli/cli.py
@@ -123,14 +123,15 @@ def new(index_name, tokenizer_model):
 @click.option("--tokenizer-model", default="gpt-3.5-turbo", help="Tokenizer model")
 def upsert(index_name, data_path, tokenizer_model):
     if index_name is None:
-        msg = "Index name is not provided, please provide it with" + \
-            ' --index-name or set it with env var `export INDEX_NAME="MY_INDEX_NAME`'
+        msg = ("Index name is not provided, please provide it with" +
+               ' --index-name or set it with env var '
+               '`export INDEX_NAME="MY_INDEX_NAME"`')
         click.echo(click.style(msg, fg="red"), err=True)
         sys.exit(1)
     Tokenizer.initialize(OpenAITokenizer, tokenizer_model)
     if data_path is None:
-        msg = "Data path is not provided," + \
-            " please provide it with --data-path or set it with env var"
+        msg = ("Data path is not provided," +
+               " please provide it with --data-path or set it with env var")
         click.echo(click.style(msg, fg="red"), err=True)
         sys.exit(1)
     click.echo("Resin is going to upsert data from ", nl=False)
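The replacement swaps backslash continuations for parenthesized literals, leaning on Python's implicit concatenation of adjacent string literals. A minimal standalone sketch of the idiom (the message text here is illustrative only):

# Adjacent string literals inside parentheses are joined at compile
# time, so long messages split cleanly with no trailing "\" or "+".
msg = (
    "Data path is not provided,"
    " please provide it with --data-path or set it with an env var"
)
assert "\n" not in msg  # still a single one-line string
print(msg)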
44 changes: 24 additions & 20 deletions resin_cli/data_loader/data_loader.py
@@ -20,6 +20,24 @@ def __init__(self, message):
         super().__init__(message)


+def _process_metadata(value):
+    if pd.isna(value):
+        return {}
+
+    if isinstance(value, str):
+        try:
+            value = json.loads(value)
+        except json.JSONDecodeError as e:
+            raise DocumentsValidationError(
+                f"Metadata must be a valid json string. Error: {e}"
+            ) from e
+
+    if not isinstance(value, dict):
+        raise DocumentsValidationError("Metadata must be a dict or json string")
+
+    return {k: v for k, v in value.items() if not pd.isna(v)}
+
+
 def _df_to_documents(df: pd.DataFrame) -> List[Document]:
     if not isinstance(df, pd.DataFrame):
         raise ValueError("Dataframe must be a pandas DataFrame")
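A quick sketch of the new helper's contract, assuming `_process_metadata` and `DocumentsValidationError` are importable from this module (the sample values are made up):

from resin_cli.data_loader.data_loader import (
    DocumentsValidationError,
    _process_metadata,
)

print(_process_metadata(float("nan")))         # {} - missing cell -> empty dict
print(_process_metadata('{"source": "web"}'))  # {'source': 'web'} - JSON parsed
print(_process_metadata({"a": 1, "b": float("nan")}))  # {'a': 1} - NaN values dropped

try:
    _process_metadata("not json")
except DocumentsValidationError as e:
    print(e)  # an invalid JSON string is rejected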
@@ -29,26 +47,12 @@ def _df_to_documents(df: pd.DataFrame) -> List[Document]:
         raise IDsNotUniqueError("IDs must be unique")

     try:
-        documents: List[Document] = []
-        for row in df.to_dict(orient="records"):
-            if "metadata" in row:
-                if pd.isna(row["metadata"]):
-                    row["metadata"] = {}
-                elif type(row["metadata"]) is str:
-                    try:
-                        row["metadata"] = json.loads(row["metadata"])
-                    except json.JSONDecodeError as e:
-                        raise DocumentsValidationError(
-                            f"Metadata must be a valid json string. Error: {e}"
-                        ) from e
-                elif type(row["metadata"]) is not dict:
-                    raise DocumentsValidationError(
-                        "Metadata must be a dict or json string"
-                    )
-                row["metadata"] = {k: v for k, v in row["metadata"].items()
-                                   if not pd.isna(v)}
-            row = {k: v for k, v in row.items() if not pd.isna(v)}
-            documents.append(Document(**row))  # type: ignore
+        if "metadata" in df.columns:
+            df.loc[:, "metadata"] = df["metadata"].apply(_process_metadata)
+        documents = [
+            Document(**{k: v for k, v in row._asdict().items() if not pd.isna(v)})
+            for row in df.itertuples(index=False)
+        ]
     except ValidationError as e:
         raise DocumentsValidationError("Documents failed validation") from e
     except ValueError as e:
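For illustration, a possible round trip through the refactored `_df_to_documents`. The column names assume the repo's Document model exposes `id`, `text`, and `metadata` fields; treat this as a sketch under that assumption, not the project's documented API:

import pandas as pd

from resin_cli.data_loader.data_loader import _df_to_documents

df = pd.DataFrame([
    {"id": "doc1", "text": "hello", "metadata": '{"source": "web"}'},
    {"id": "doc2", "text": "world", "metadata": None},  # NaN-like -> {}
])

docs = _df_to_documents(df)
print(docs[0].metadata)  # {'source': 'web'}
print(docs[1].metadata)  # {}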
