From 9262c45755aed5d214ef9d60a5c8726ae018170a Mon Sep 17 00:00:00 2001 From: Alex Co Date: Sun, 3 Nov 2024 14:10:21 +0800 Subject: [PATCH 1/3] Convert json to text before sending to index Signed-off-by: Alex Co --- .../danswer/connectors/airtable/connector.py | 39 +++++++++++++++---- backend/requirements/default.txt | 1 - backend/requirements/model_server.txt | 4 +- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/backend/danswer/connectors/airtable/connector.py b/backend/danswer/connectors/airtable/connector.py index 2cae2ca3a18..184edbf1491 100644 --- a/backend/danswer/connectors/airtable/connector.py +++ b/backend/danswer/connectors/airtable/connector.py @@ -3,12 +3,13 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.interfaces import GenerateDocumentsOutput -from danswer.connectors.interfaces import LoadConnector -from danswer.connectors.interfaces import PollConnector -from danswer.connectors.interfaces import SecondsSinceUnixEpoch -from danswer.connectors.models import Document -from danswer.connectors.models import Section +from danswer.connectors.interfaces import ( + GenerateDocumentsOutput, + LoadConnector, + PollConnector, + SecondsSinceUnixEpoch, +) +from danswer.connectors.models import Document, Section from pyairtable import Api as AirtableApi @@ -34,6 +35,23 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None return None + def json_to_text(self, obj: Any, indent: int = 0) -> str: + """ + Recursively converts JSON object to plain text. + """ + text = "" + if isinstance(obj, dict): + for key, value in obj.items(): + text += " " * indent + str(key) + ":\n" + text += self.json_to_text(value, indent + 1) + elif isinstance(obj, list): + for item in obj: + text += self.json_to_text(item, indent) + else: + text += " " * indent + str(obj) + "\n" + + return text + def poll_source( self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None ) -> GenerateDocumentsOutput: @@ -41,7 +59,10 @@ def poll_source( raise AirtableClientNotSetUpError() table = self.airtable_client.table(self.base_id, self.table_name_or_id) - all_records = table.all() + + table_name = table.schema().name + base_name = self.airtable_client.base(self.base_id, validate=True).name + all_records = table.all(cell_format="string", time_zone="UTC", user_locale="en") record_documents = [] for record in all_records: @@ -50,7 +71,7 @@ def poll_source( sections=[ Section( link=f"https://airtable.com/{self.base_id}/{self.table_name_or_id}/", - text=json.dumps(record.get("fields")), + text=self.json_to_text(record.get("fields")), ) ], source=DocumentSource.AIRTABLE, @@ -58,6 +79,8 @@ def poll_source( metadata={ "type": "airtable", "created_time": record.get("createdTime"), + "table_name": table_name, + "base_name": base_name, }, ) record_documents.append(record_document) diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index 853a483478f..eca169124be 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -35,7 +35,6 @@ msal==1.28.0 nltk==3.8.1 Office365-REST-Python-Client==2.5.9 oauthlib==3.2.2 -openai==1.41.1 openpyxl==3.1.2 playwright==1.41.2 psutil==5.9.5 diff --git a/backend/requirements/model_server.txt b/backend/requirements/model_server.txt index 18c2cefed28..410abc7edaf 100644 --- a/backend/requirements/model_server.txt +++ b/backend/requirements/model_server.txt @@ -3,12 +3,12 @@ einops==0.8.0 fastapi==0.109.2 google-cloud-aiplatform==1.58.0 numpy==1.26.4 -openai==1.41.1 +openai==1.53.0 pydantic==2.8.2 retry==0.9.2 safetensors==0.4.2 sentence-transformers==2.6.1 -torch==2.2.0 transformers==4.39.2 uvicorn==0.21.1 voyageai==0.2.3 +torch==2.5.1 \ No newline at end of file From 4a8ccbae3e39a35551683ca7dc17d66edb41e23e Mon Sep 17 00:00:00 2001 From: Alex Co Date: Sun, 3 Nov 2024 14:11:34 +0800 Subject: [PATCH 2/3] Disable build on PR Signed-off-by: Alex Co --- .../workflows/gar-build-push-model-server-container-on-tag.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/gar-build-push-model-server-container-on-tag.yml b/.github/workflows/gar-build-push-model-server-container-on-tag.yml index 52ef0e424d5..c6dd416df53 100644 --- a/.github/workflows/gar-build-push-model-server-container-on-tag.yml +++ b/.github/workflows/gar-build-push-model-server-container-on-tag.yml @@ -4,9 +4,6 @@ on: push: tags: - "*" - pull_request: - branches: - - "main" env: GarProjectID: mv-auxiliary From 8def9f3a1dc2fa915af713f460582a3d697ae357 Mon Sep 17 00:00:00 2001 From: Alex Co Date: Sun, 3 Nov 2024 14:15:35 +0800 Subject: [PATCH 3/3] Add openai dep Signed-off-by: Alex Co --- backend/requirements/default.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index eca169124be..dbad39347a0 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -75,4 +75,5 @@ boto3-stubs[s3]==1.34.133 ultimate_sitemap_parser==0.5 pyairtable==3.0.0a3 anthropic[vertex]==0.36.1 -google-cloud-aiplatform==1.70.0 \ No newline at end of file +google-cloud-aiplatform==1.70.0 +openai==1.53.0