From 3eaf2a883a5fb52169af2ba2e0571189fb3712eb Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Wed, 20 Nov 2024 16:50:58 -0800 Subject: [PATCH 1/7] quick pass --- backend/danswer/connectors/google_drive/doc_conversion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py index ec0a7837f2e..1cde233c25c 100644 --- a/backend/danswer/connectors/google_drive/doc_conversion.py +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -42,6 +42,7 @@ def _extract_sections_basic( ) -> list[Section]: mime_type = file["mimeType"] link = file["webViewLink"] + print(file) if mime_type not in set(item.value for item in GDriveMimeType): # Unsupported file types can still have a title, finding this way is still useful From d7f2a3e112c00bda2813933d673fb18080d6de6d Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Wed, 20 Nov 2024 16:59:50 -0800 Subject: [PATCH 2/7] k --- .../connectors/google_drive/doc_conversion.py | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py index 1cde233c25c..e55c3e64c8d 100644 --- a/backend/danswer/connectors/google_drive/doc_conversion.py +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -2,6 +2,7 @@ from datetime import datetime from datetime import timezone +from googleapiclient.discovery import build from googleapiclient.errors import HttpError # type: ignore from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE @@ -52,7 +53,6 @@ def _extract_sections_basic( if mime_type in [ GDriveMimeType.DOC.value, GDriveMimeType.PPT.value, - GDriveMimeType.SPREADSHEET.value, ]: export_mime_type = ( "text/plain" @@ -66,6 +66,36 @@ def _extract_sections_basic( .decode("utf-8") ) return [Section(link=link, text=text)] + elif mime_type == GDriveMimeType.SPREADSHEET.value: + sheets_service = build("sheets", "v4", credentials=service._credentials) + spreadsheet = ( + sheets_service.spreadsheets().get(spreadsheetId=file["id"]).execute() + ) + + sections = [] + for sheet in spreadsheet["sheets"]: + sheet_name = sheet["properties"]["title"] + range_name = f"'{sheet_name}'!A1:Z1000" # Adjust range as needed + result = ( + sheets_service.spreadsheets() + .values() + .get(spreadsheetId=file["id"], range=range_name) + .execute() + ) + values = result.get("values", []) + + if values: + text = f"Sheet: {sheet_name}\n" + for row in values: + text += "\t".join(str(cell) for cell in row) + "\n" + sections.append( + Section( + link=f"{link}#gid={sheet['properties']['sheetId']}", + text=text, + ) + ) + + return sections elif mime_type in [ GDriveMimeType.PLAIN_TEXT.value, GDriveMimeType.MARKDOWN.value, From 5e326bcd08d019103f78da1c8a4a45ba4e401353 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Wed, 20 Nov 2024 19:35:31 -0800 Subject: [PATCH 3/7] update sheet --- backend/danswer/configs/app_configs.py | 4 ++ .../connectors/google_drive/doc_conversion.py | 48 ++++++++++++------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 1b092555541..522f51ef389 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -468,6 +468,10 @@ AZURE_DALLE_API_BASE = os.environ.get("AZURE_DALLE_API_BASE") AZURE_DALLE_DEPLOYMENT_NAME = os.environ.get("AZURE_DALLE_DEPLOYMENT_NAME") +# Drive Configs +GOOGLE_SHEET_API_ENABLED = ( + os.environ.get("GOOGLE_SHEET_API_ENABLED", "").lower() == "true" +) # Use managed Vespa (Vespa Cloud). If set, must also set VESPA_CLOUD_URL, VESPA_CLOUD_CERT_PATH and VESPA_CLOUD_KEY_PATH MANAGED_VESPA = os.environ.get("MANAGED_VESPA", "").lower() == "true" diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py index e55c3e64c8d..4074b5d95ab 100644 --- a/backend/danswer/connectors/google_drive/doc_conversion.py +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -6,6 +6,7 @@ from googleapiclient.errors import HttpError # type: ignore from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE +from danswer.configs.app_configs import GOOGLE_SHEET_API_ENABLED from danswer.configs.constants import DocumentSource from danswer.configs.constants import IGNORE_FOR_QA from danswer.connectors.google_drive.constants import DRIVE_FOLDER_TYPE @@ -45,32 +46,23 @@ def _extract_sections_basic( link = file["webViewLink"] print(file) + print(mime_type) if mime_type not in set(item.value for item in GDriveMimeType): + print("here2") # Unsupported file types can still have a title, finding this way is still useful return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] try: - if mime_type in [ - GDriveMimeType.DOC.value, - GDriveMimeType.PPT.value, - ]: - export_mime_type = ( - "text/plain" - if mime_type != GDriveMimeType.SPREADSHEET.value - else "text/csv" + if mime_type == GDriveMimeType.SPREADSHEET.value and GOOGLE_SHEET_API_ENABLED: + print(service) + # Access credentials from the service object's HTTP object + sheets_service = build( + "sheets", "v4", credentials=service._http.credentials ) - text = ( - service.files() - .export(fileId=file["id"], mimeType=export_mime_type) - .execute() - .decode("utf-8") - ) - return [Section(link=link, text=text)] - elif mime_type == GDriveMimeType.SPREADSHEET.value: - sheets_service = build("sheets", "v4", credentials=service._credentials) spreadsheet = ( sheets_service.spreadsheets().get(spreadsheetId=file["id"]).execute() ) + print(spreadsheet) sections = [] for sheet in spreadsheet["sheets"]: @@ -96,6 +88,25 @@ def _extract_sections_basic( ) return sections + if mime_type in [ + GDriveMimeType.DOC.value, + GDriveMimeType.PPT.value, + GDriveMimeType.SPREADSHEET.value, + ]: + print("here") + export_mime_type = ( + "text/plain" + if mime_type != GDriveMimeType.SPREADSHEET.value + else "text/csv" + ) + text = ( + service.files() + .export(fileId=file["id"], mimeType=export_mime_type) + .execute() + .decode("utf-8") + ) + return [Section(link=link, text=text)] + elif mime_type in [ GDriveMimeType.PLAIN_TEXT.value, GDriveMimeType.MARKDOWN.value, @@ -140,7 +151,8 @@ def _extract_sections_basic( return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] - except Exception: + except Exception as e: + print(e) return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] From 3a4804b4b7d54fd3db576b698b5187d8dc0aa5ca Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Wed, 20 Nov 2024 19:38:29 -0800 Subject: [PATCH 4/7] add multiple sheet stuff --- .../danswer/connectors/google_drive/doc_conversion.py | 11 ++--------- deployment/docker_compose/docker-compose.dev.yml | 1 + 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py index 4074b5d95ab..6c0d862182c 100644 --- a/backend/danswer/connectors/google_drive/doc_conversion.py +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -44,17 +44,13 @@ def _extract_sections_basic( ) -> list[Section]: mime_type = file["mimeType"] link = file["webViewLink"] - print(file) - print(mime_type) if mime_type not in set(item.value for item in GDriveMimeType): - print("here2") # Unsupported file types can still have a title, finding this way is still useful return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] try: if mime_type == GDriveMimeType.SPREADSHEET.value and GOOGLE_SHEET_API_ENABLED: - print(service) # Access credentials from the service object's HTTP object sheets_service = build( "sheets", "v4", credentials=service._http.credentials @@ -62,7 +58,6 @@ def _extract_sections_basic( spreadsheet = ( sheets_service.spreadsheets().get(spreadsheetId=file["id"]).execute() ) - print(spreadsheet) sections = [] for sheet in spreadsheet["sheets"]: @@ -88,12 +83,11 @@ def _extract_sections_basic( ) return sections - if mime_type in [ + elif mime_type in [ GDriveMimeType.DOC.value, GDriveMimeType.PPT.value, GDriveMimeType.SPREADSHEET.value, ]: - print("here") export_mime_type = ( "text/plain" if mime_type != GDriveMimeType.SPREADSHEET.value @@ -151,8 +145,7 @@ def _extract_sections_basic( return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] - except Exception as e: - print(e) + except Exception: return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index d59719a70a0..5fedf2bc9de 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -181,6 +181,7 @@ services: - GONG_CONNECTOR_START_TIME=${GONG_CONNECTOR_START_TIME:-} - NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP=${NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP:-} - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-} + - GOOGLE_SHEET_API_ENABLED=${GOOGLE_SHEET_API_ENABLED:-} # Celery Configs (defaults are set in the supervisord.conf file. # prefer doing that to have one source of defaults) - CELERY_WORKER_INDEXING_CONCURRENCY=${CELERY_WORKER_INDEXING_CONCURRENCY:-} From 36c1fc23d087f41db06e2680233a1ade7e65e594 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Thu, 21 Nov 2024 09:05:11 -0800 Subject: [PATCH 5/7] k --- backend/danswer/configs/app_configs.py | 1 + .../connectors/google_drive/doc_conversion.py | 54 +++++++++++++------ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 522f51ef389..5b2dd9da61e 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -469,6 +469,7 @@ AZURE_DALLE_DEPLOYMENT_NAME = os.environ.get("AZURE_DALLE_DEPLOYMENT_NAME") # Drive Configs +# Note: Google Sheet API must be enabled for this feature to work GOOGLE_SHEET_API_ENABLED = ( os.environ.get("GOOGLE_SHEET_API_ENABLED", "").lower() == "true" ) diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py index 6c0d862182c..fde52145d10 100644 --- a/backend/danswer/connectors/google_drive/doc_conversion.py +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -62,27 +62,51 @@ def _extract_sections_basic( sections = [] for sheet in spreadsheet["sheets"]: sheet_name = sheet["properties"]["title"] - range_name = f"'{sheet_name}'!A1:Z1000" # Adjust range as needed - result = ( - sheets_service.spreadsheets() - .values() - .get(spreadsheetId=file["id"], range=range_name) - .execute() - ) - values = result.get("values", []) + sheet_id = sheet["properties"]["sheetId"] + + # Get sheet dimensions + grid_properties = sheet["properties"].get("gridProperties", {}) + row_count = grid_properties.get("rowCount", 1000) + column_count = grid_properties.get("columnCount", 26) + + # Convert column count to letter (e.g., 26 -> Z, 27 -> AA) + end_column = "" + while column_count: + column_count, remainder = divmod(column_count - 1, 26) + end_column = chr(65 + remainder) + end_column + + range_name = f"'{sheet_name}'!A1:{end_column}{row_count}" - if values: - text = f"Sheet: {sheet_name}\n" - for row in values: - text += "\t".join(str(cell) for cell in row) + "\n" + try: + result = ( + sheets_service.spreadsheets() + .values() + .get(spreadsheetId=file["id"], range=range_name) + .execute() + ) + values = result.get("values", []) + + if values: + text = f"Sheet: {sheet_name}\n" + for row in values: + text += "\t".join(str(cell) for cell in row) + "\n" + sections.append( + Section( + link=f"{link}#gid={sheet_id}", + text=text, + ) + ) + except HttpError as e: + logger.warning(f"Error fetching data for sheet '{sheet_name}': {e}") + # Optionally, add a section indicating the error sections.append( Section( - link=f"{link}#gid={sheet['properties']['sheetId']}", - text=text, + link=f"{link}#gid={sheet_id}", + text=f"Error fetching data for sheet '{sheet_name}': {e}", ) ) - return sections + elif mime_type in [ GDriveMimeType.DOC.value, GDriveMimeType.PPT.value, From 026134805a1418f32b61973f55571756ba102c09 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Thu, 21 Nov 2024 09:09:31 -0800 Subject: [PATCH 6/7] finalized --- .../danswer/connectors/google_drive/doc_conversion.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py index fde52145d10..1f24cbbd4ab 100644 --- a/backend/danswer/connectors/google_drive/doc_conversion.py +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -51,7 +51,6 @@ def _extract_sections_basic( try: if mime_type == GDriveMimeType.SPREADSHEET.value and GOOGLE_SHEET_API_ENABLED: - # Access credentials from the service object's HTTP object sheets_service = build( "sheets", "v4", credentials=service._http.credentials ) @@ -98,13 +97,8 @@ def _extract_sections_basic( ) except HttpError as e: logger.warning(f"Error fetching data for sheet '{sheet_name}': {e}") - # Optionally, add a section indicating the error - sections.append( - Section( - link=f"{link}#gid={sheet_id}", - text=f"Error fetching data for sheet '{sheet_name}': {e}", - ) - ) + continue + return sections elif mime_type in [ From 98aa32055203d32a6d25eb1266deab6c58a176fb Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Thu, 21 Nov 2024 09:42:42 -0800 Subject: [PATCH 7/7] update configuration --- backend/danswer/configs/app_configs.py | 5 - .../connectors/google_drive/doc_conversion.py | 107 ++++++++++-------- .../docker_compose/docker-compose.dev.yml | 1 - 3 files changed, 58 insertions(+), 55 deletions(-) diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 5b2dd9da61e..1b092555541 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -468,11 +468,6 @@ AZURE_DALLE_API_BASE = os.environ.get("AZURE_DALLE_API_BASE") AZURE_DALLE_DEPLOYMENT_NAME = os.environ.get("AZURE_DALLE_DEPLOYMENT_NAME") -# Drive Configs -# Note: Google Sheet API must be enabled for this feature to work -GOOGLE_SHEET_API_ENABLED = ( - os.environ.get("GOOGLE_SHEET_API_ENABLED", "").lower() == "true" -) # Use managed Vespa (Vespa Cloud). If set, must also set VESPA_CLOUD_URL, VESPA_CLOUD_CERT_PATH and VESPA_CLOUD_KEY_PATH MANAGED_VESPA = os.environ.get("MANAGED_VESPA", "").lower() == "true" diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py index 1f24cbbd4ab..a3febd9d172 100644 --- a/backend/danswer/connectors/google_drive/doc_conversion.py +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -2,11 +2,10 @@ from datetime import datetime from datetime import timezone -from googleapiclient.discovery import build +from googleapiclient.discovery import build # type: ignore from googleapiclient.errors import HttpError # type: ignore from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE -from danswer.configs.app_configs import GOOGLE_SHEET_API_ENABLED from danswer.configs.constants import DocumentSource from danswer.configs.constants import IGNORE_FOR_QA from danswer.connectors.google_drive.constants import DRIVE_FOLDER_TYPE @@ -50,58 +49,68 @@ def _extract_sections_basic( return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] try: - if mime_type == GDriveMimeType.SPREADSHEET.value and GOOGLE_SHEET_API_ENABLED: - sheets_service = build( - "sheets", "v4", credentials=service._http.credentials - ) - spreadsheet = ( - sheets_service.spreadsheets().get(spreadsheetId=file["id"]).execute() - ) + if mime_type == GDriveMimeType.SPREADSHEET.value: + try: + sheets_service = build( + "sheets", "v4", credentials=service._http.credentials + ) + spreadsheet = ( + sheets_service.spreadsheets() + .get(spreadsheetId=file["id"]) + .execute() + ) - sections = [] - for sheet in spreadsheet["sheets"]: - sheet_name = sheet["properties"]["title"] - sheet_id = sheet["properties"]["sheetId"] - - # Get sheet dimensions - grid_properties = sheet["properties"].get("gridProperties", {}) - row_count = grid_properties.get("rowCount", 1000) - column_count = grid_properties.get("columnCount", 26) - - # Convert column count to letter (e.g., 26 -> Z, 27 -> AA) - end_column = "" - while column_count: - column_count, remainder = divmod(column_count - 1, 26) - end_column = chr(65 + remainder) + end_column - - range_name = f"'{sheet_name}'!A1:{end_column}{row_count}" - - try: - result = ( - sheets_service.spreadsheets() - .values() - .get(spreadsheetId=file["id"], range=range_name) - .execute() - ) - values = result.get("values", []) - - if values: - text = f"Sheet: {sheet_name}\n" - for row in values: - text += "\t".join(str(cell) for cell in row) + "\n" - sections.append( - Section( - link=f"{link}#gid={sheet_id}", - text=text, + sections = [] + for sheet in spreadsheet["sheets"]: + sheet_name = sheet["properties"]["title"] + sheet_id = sheet["properties"]["sheetId"] + + # Get sheet dimensions + grid_properties = sheet["properties"].get("gridProperties", {}) + row_count = grid_properties.get("rowCount", 1000) + column_count = grid_properties.get("columnCount", 26) + + # Convert column count to letter (e.g., 26 -> Z, 27 -> AA) + end_column = "" + while column_count: + column_count, remainder = divmod(column_count - 1, 26) + end_column = chr(65 + remainder) + end_column + + range_name = f"'{sheet_name}'!A1:{end_column}{row_count}" + + try: + result = ( + sheets_service.spreadsheets() + .values() + .get(spreadsheetId=file["id"], range=range_name) + .execute() + ) + values = result.get("values", []) + + if values: + text = f"Sheet: {sheet_name}\n" + for row in values: + text += "\t".join(str(cell) for cell in row) + "\n" + sections.append( + Section( + link=f"{link}#gid={sheet_id}", + text=text, + ) ) + except HttpError as e: + logger.warning( + f"Error fetching data for sheet '{sheet_name}': {e}" ) - except HttpError as e: - logger.warning(f"Error fetching data for sheet '{sheet_name}': {e}") - continue + continue + return sections - return sections + except Exception as e: + logger.warning( + f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'." + " Falling back to basic extraction." + ) - elif mime_type in [ + if mime_type in [ GDriveMimeType.DOC.value, GDriveMimeType.PPT.value, GDriveMimeType.SPREADSHEET.value, diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index 5fedf2bc9de..d59719a70a0 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -181,7 +181,6 @@ services: - GONG_CONNECTOR_START_TIME=${GONG_CONNECTOR_START_TIME:-} - NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP=${NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP:-} - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-} - - GOOGLE_SHEET_API_ENABLED=${GOOGLE_SHEET_API_ENABLED:-} # Celery Configs (defaults are set in the supervisord.conf file. # prefer doing that to have one source of defaults) - CELERY_WORKER_INDEXING_CONCURRENCY=${CELERY_WORKER_INDEXING_CONCURRENCY:-}