Skip to content

Commit

Permalink
Merge pull request #71 from mindvalley/chore/improve-airtable-indexing
Browse files Browse the repository at this point in the history
Convert json to text before sending to index
  • Loading branch information
onimsha authored Nov 3, 2024
2 parents 9837bc7 + 8def9f3 commit bcdf0b7
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@ on:
push:
tags:
- "*"
pull_request:
branches:
- "main"

env:
GarProjectID: mv-auxiliary
Expand Down
39 changes: 31 additions & 8 deletions backend/danswer/connectors/airtable/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.connectors.interfaces import (
GenerateDocumentsOutput,
LoadConnector,
PollConnector,
SecondsSinceUnixEpoch,
)
from danswer.connectors.models import Document, Section
from pyairtable import Api as AirtableApi


Expand All @@ -34,14 +35,34 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None

return None

def json_to_text(self, obj: Any, indent: int = 0) -> str:
    """Flatten a decoded JSON value into indented plain text.

    Dict keys are emitted as ``key:`` on their own line, with the value
    rendered beneath them one space deeper. List items are rendered at the
    depth of the list itself. Any other value is rendered with ``str()``.
    Every emitted line ends with a newline.

    ``None`` (JSON ``null``) produces no output, so the literal word "None"
    never ends up in the text; a key whose value is ``None`` still emits its
    ``key:`` line.

    Args:
        obj: A decoded JSON value (dict, list, scalar, or ``None``).
        indent: Number of leading spaces for the outermost level.

    Returns:
        The plain-text rendering, or an empty string when there is nothing
        to render (``None``, an empty dict, or an empty list).
    """
    # Collect lines in one shared list and join once at the end, instead of
    # concatenating partial strings at every level of the recursion.
    lines: list[str] = []

    def _walk(node: Any, depth: int) -> None:
        # Nulls carry no content worth rendering.
        if node is None:
            return
        pad = " " * depth
        if isinstance(node, dict):
            for key, value in node.items():
                lines.append(pad + str(key) + ":")
                _walk(value, depth + 1)
        elif isinstance(node, list):
            # Items stay at the list's own depth; only dicts add nesting.
            for item in node:
                _walk(item, depth)
        else:
            lines.append(pad + str(node))

    _walk(obj, indent)
    return "".join(line + "\n" for line in lines)

def poll_source(
self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> GenerateDocumentsOutput:
if not self.airtable_client:
raise AirtableClientNotSetUpError()

table = self.airtable_client.table(self.base_id, self.table_name_or_id)
all_records = table.all()

table_name = table.schema().name
base_name = self.airtable_client.base(self.base_id, validate=True).name
all_records = table.all(cell_format="string", time_zone="UTC", user_locale="en")

record_documents = []
for record in all_records:
Expand All @@ -50,14 +71,16 @@ def poll_source(
sections=[
Section(
link=f"https://airtable.com/{self.base_id}/{self.table_name_or_id}/",
text=json.dumps(record.get("fields")),
text=self.json_to_text(record.get("fields")),
)
],
source=DocumentSource.AIRTABLE,
semantic_identifier=f"Airtable Base ID: {self.base_id}. Table Name or ID: {self.table_name_or_id}",
metadata={
"type": "airtable",
"created_time": record.get("createdTime"),
"table_name": table_name,
"base_name": base_name,
},
)
record_documents.append(record_document)
Expand Down
4 changes: 2 additions & 2 deletions backend/requirements/default.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ msal==1.28.0
nltk==3.8.1
Office365-REST-Python-Client==2.5.9
oauthlib==3.2.2
openai==1.41.1
openpyxl==3.1.2
playwright==1.41.2
psutil==5.9.5
Expand Down Expand Up @@ -76,4 +75,5 @@ boto3-stubs[s3]==1.34.133
ultimate_sitemap_parser==0.5
pyairtable==3.0.0a3
anthropic[vertex]==0.36.1
google-cloud-aiplatform==1.70.0
google-cloud-aiplatform==1.70.0
openai==1.53.0
4 changes: 2 additions & 2 deletions backend/requirements/model_server.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ einops==0.8.0
fastapi==0.109.2
google-cloud-aiplatform==1.58.0
numpy==1.26.4
openai==1.41.1
openai==1.53.0
pydantic==2.8.2
retry==0.9.2
safetensors==0.4.2
sentence-transformers==2.6.1
torch==2.2.0
transformers==4.39.2
uvicorn==0.21.1
voyageai==0.2.3
torch==2.5.1

0 comments on commit bcdf0b7

Please sign in to comment.