From 2758ffd9d59130fa40bded5d5a338f418afe4d30 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Wed, 6 Nov 2024 18:07:35 -0800 Subject: [PATCH] Google Drive Improvements (#3057) * Google Drive Improvements * mypy * should work! * variable cleanup * final fixes --- backend/danswer/connectors/gmail/connector.py | 20 +- .../connectors/google_drive/connector.py | 289 +++++++++++------- .../connectors/google_drive/doc_conversion.py | 26 +- .../connectors/google_drive/file_retrieval.py | 109 ++++--- .../connectors/google_utils/google_kv.py | 66 ++-- .../connectors/google_utils/google_utils.py | 33 +- .../connectors/google_utils/resources.py | 4 +- .../google_utils/shared_constants.py | 2 +- backend/danswer/db/credentials.py | 21 +- backend/danswer/server/documents/connector.py | 27 +- backend/danswer/server/documents/models.py | 14 +- .../{helpers.py => consts_and_utils.py} | 153 ++++------ .../google_drive/test_google_drive_oauth.py | 154 +++++++--- .../test_google_drive_sections.py | 6 +- .../test_google_drive_service_acct.py | 201 +++++++++--- .../test_google_drive_slim_docs.py | 89 +++--- .../[connector]/pages/gdrive/Credential.tsx | 20 +- .../pages/gdrive/GoogleDrivePage.tsx | 6 +- .../[connector]/pages/gmail/Credential.tsx | 28 +- .../[connector]/pages/gmail/GmailPage.tsx | 8 +- .../[connector]/pages/utils/hooks.ts | 14 +- web/src/lib/connectors/credentials.ts | 37 +-- 22 files changed, 802 insertions(+), 525 deletions(-) rename backend/tests/daily/connectors/google_drive/{helpers.py => consts_and_utils.py} (51%) diff --git a/backend/danswer/connectors/gmail/connector.py b/backend/danswer/connectors/gmail/connector.py index 268ad871f7c..f9ef995d701 100644 --- a/backend/danswer/connectors/gmail/connector.py +++ b/backend/danswer/connectors/gmail/connector.py @@ -277,16 +277,16 @@ def _fetch_threads( fields=THREAD_LIST_FIELDS, q=query, ): - full_thread = add_retries( - lambda: gmail_service.users() - .threads() - .get( - userId=user_email, - id=thread["id"], - fields=THREAD_FIELDS, - ) - .execute() - )() + full_threads = execute_paginated_retrieval( + retrieval_function=gmail_service.users().threads().get, + list_key=None, + userId=user_email, + fields=THREAD_FIELDS, + id=thread["id"], + ) + # full_threads is an iterator containing a single thread + # so we need to convert it to a list and grab the first element + full_thread = list(full_threads)[0] doc = thread_to_document(full_thread) if doc is None: continue diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index 6d23606260d..6bf2ec476a6 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -1,4 +1,8 @@ +from collections.abc import Callable from collections.abc import Iterator +from concurrent.futures import as_completed +from concurrent.futures import ThreadPoolExecutor +from functools import partial from typing import Any from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore @@ -6,11 +10,12 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource +from danswer.connectors.google_drive.doc_conversion import build_slim_document from danswer.connectors.google_drive.doc_conversion import ( convert_drive_item_to_document, ) from danswer.connectors.google_drive.file_retrieval import crawl_folders_for_files -from danswer.connectors.google_drive.file_retrieval import get_files_in_my_drive +from danswer.connectors.google_drive.file_retrieval import get_all_files_in_my_drive from danswer.connectors.google_drive.file_retrieval import get_files_in_shared_drive from danswer.connectors.google_drive.models import GoogleDriveFileType from danswer.connectors.google_utils.google_auth import get_google_creds @@ -32,10 +37,11 @@ from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.interfaces import SlimConnector -from danswer.connectors.models import SlimDocument from danswer.utils.logger import setup_logger logger = setup_logger() +# TODO: Improve this by using the batch utility: https://googleapis.github.io/google-api-python-client/docs/batch.html +# All file retrievals could be batched and made at once def _extract_str_list_from_comma_str(string: str | None) -> list[str]: @@ -48,6 +54,34 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]: return [url.split("/")[-1] for url in urls] +def _convert_single_file( + creds: Any, primary_admin_email: str, file: dict[str, Any] +) -> Any: + user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email + user_drive_service = get_drive_service(creds, user_email=user_email) + docs_service = get_google_docs_service(creds, user_email=user_email) + return convert_drive_item_to_document( + file=file, + drive_service=user_drive_service, + docs_service=docs_service, + ) + + +def _process_files_batch( + files: list[GoogleDriveFileType], convert_func: Callable, batch_size: int +) -> GenerateDocumentsOutput: + doc_batch = [] + with ThreadPoolExecutor(max_workers=min(16, len(files))) as executor: + for doc in executor.map(convert_func, files): + if doc: + doc_batch.append(doc) + if len(doc_batch) >= batch_size: + yield doc_batch + doc_batch = [] + if doc_batch: + yield doc_batch + + class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector): def __init__( self, @@ -97,19 +131,23 @@ def __init__( self.include_shared_drives = include_shared_drives shared_drive_url_list = _extract_str_list_from_comma_str(shared_drive_urls) - self.shared_drive_ids = _extract_ids_from_urls(shared_drive_url_list) + self._requested_shared_drive_ids = set( + _extract_ids_from_urls(shared_drive_url_list) + ) self.include_my_drives = include_my_drives - self.my_drive_emails = _extract_str_list_from_comma_str(my_drive_emails) + self._requested_my_drive_emails = set( + _extract_str_list_from_comma_str(my_drive_emails) + ) shared_folder_url_list = _extract_str_list_from_comma_str(shared_folder_urls) - self.shared_folder_ids = _extract_ids_from_urls(shared_folder_url_list) + self._requested_folder_ids = set(_extract_ids_from_urls(shared_folder_url_list)) self._primary_admin_email: str | None = None self._creds: OAuthCredentials | ServiceAccountCredentials | None = None - self._TRAVERSED_PARENT_IDS: set[str] = set() + self._retrieved_ids: set[str] = set() @property def primary_admin_email(self) -> str: @@ -141,9 +179,6 @@ def creds(self) -> OAuthCredentials | ServiceAccountCredentials: ) return self._creds - def _update_traversed_parent_ids(self, folder_id: str) -> None: - self._TRAVERSED_PARENT_IDS.add(folder_id) - def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None: primary_admin_email = credentials[DB_CREDENTIALS_PRIMARY_ADMIN_KEY] self._primary_admin_email = primary_admin_email @@ -154,125 +189,167 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None ) return new_creds_dict - def _get_all_user_emails(self) -> list[str]: + def _update_traversed_parent_ids(self, folder_id: str) -> None: + self._retrieved_ids.add(folder_id) + + def _get_all_user_emails(self, admins_only: bool) -> list[str]: admin_service = get_admin_service( creds=self.creds, user_email=self.primary_admin_email, ) + query = "isAdmin=true" if admins_only else "isAdmin=false" emails = [] for user in execute_paginated_retrieval( retrieval_function=admin_service.users().list, list_key="users", fields=USER_FIELDS, domain=self.google_domain, + query=query, ): if email := user.get("primaryEmail"): emails.append(email) return emails - def _fetch_drive_items( + def _get_all_drive_ids(self) -> set[str]: + primary_drive_service = get_drive_service( + creds=self.creds, + user_email=self.primary_admin_email, + ) + all_drive_ids = set() + for drive in execute_paginated_retrieval( + retrieval_function=primary_drive_service.drives().list, + list_key="drives", + useDomainAdminAccess=True, + fields="drives(id)", + ): + all_drive_ids.add(drive["id"]) + return all_drive_ids + + def _initialize_all_class_variables(self) -> None: + # Get all user emails + # Get admins first becuase they are more likely to have access to the most files + user_emails = [self.primary_admin_email] + for admins_only in [True, False]: + for email in self._get_all_user_emails(admins_only=admins_only): + if email not in user_emails: + user_emails.append(email) + self._all_org_emails = user_emails + + self._all_drive_ids: set[str] = self._get_all_drive_ids() + + # remove drive ids from the folder ids because they are queried differently + self._requested_folder_ids -= self._all_drive_ids + + # Remove drive_ids that are not in the all_drive_ids and check them as folders instead + invalid_drive_ids = self._requested_shared_drive_ids - self._all_drive_ids + if invalid_drive_ids: + logger.warning( + f"Some shared drive IDs were not found. IDs: {invalid_drive_ids}" + ) + logger.warning("Checking for folder access instead...") + self._requested_folder_ids.update(invalid_drive_ids) + + if not self.include_shared_drives: + self._requested_shared_drive_ids = set() + elif not self._requested_shared_drive_ids: + self._requested_shared_drive_ids = self._all_drive_ids + + def _impersonate_user_for_retrieval( self, + user_email: str, is_slim: bool, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, ) -> Iterator[GoogleDriveFileType]: - primary_drive_service = get_drive_service( - creds=self.creds, - user_email=self.primary_admin_email, - ) + drive_service = get_drive_service(self.creds, user_email) + if self.include_my_drives and ( + not self._requested_my_drive_emails + or user_email in self._requested_my_drive_emails + ): + yield from get_all_files_in_my_drive( + service=drive_service, + update_traversed_ids_func=self._update_traversed_parent_ids, + is_slim=is_slim, + start=start, + end=end, + ) - if self.include_shared_drives: - shared_drive_urls = self.shared_drive_ids - if not shared_drive_urls: - # if no parent ids are specified, get all shared drives using the admin account - for drive in execute_paginated_retrieval( - retrieval_function=primary_drive_service.drives().list, - list_key="drives", - useDomainAdminAccess=True, - fields="drives(id)", - ): - shared_drive_urls.append(drive["id"]) - - # For each shared drive, retrieve all files - for shared_drive_id in shared_drive_urls: - for file in get_files_in_shared_drive( - service=primary_drive_service, - drive_id=shared_drive_id, - is_slim=is_slim, - cache_folders=bool(self.shared_folder_ids), - update_traversed_ids_func=self._update_traversed_parent_ids, - start=start, - end=end, - ): - yield file - - if self.shared_folder_ids: - # Crawl all the shared parent ids for files - for folder_id in self.shared_folder_ids: - yield from crawl_folders_for_files( - service=primary_drive_service, - parent_id=folder_id, - personal_drive=False, - traversed_parent_ids=self._TRAVERSED_PARENT_IDS, - update_traversed_ids_func=self._update_traversed_parent_ids, - start=start, - end=end, - ) + remaining_drive_ids = self._requested_shared_drive_ids - self._retrieved_ids + for drive_id in remaining_drive_ids: + yield from get_files_in_shared_drive( + service=drive_service, + drive_id=drive_id, + is_slim=is_slim, + update_traversed_ids_func=self._update_traversed_parent_ids, + start=start, + end=end, + ) - all_user_emails = [] - # get all personal docs from each users' personal drive - if self.include_my_drives: - if isinstance(self.creds, ServiceAccountCredentials): - all_user_emails = self.my_drive_emails or [] - - # If using service account and no emails specified, fetch all users - if not all_user_emails: - all_user_emails = self._get_all_user_emails() - - elif self.primary_admin_email: - # If using OAuth, only fetch the primary admin email - all_user_emails = [self.primary_admin_email] - - for email in all_user_emails: - logger.info(f"Fetching personal files for user: {email}") - user_drive_service = get_drive_service(self.creds, user_email=email) - - yield from get_files_in_my_drive( - service=user_drive_service, - email=email, - is_slim=is_slim, - start=start, - end=end, - ) + remaining_folders = self._requested_folder_ids - self._retrieved_ids + for folder_id in remaining_folders: + yield from crawl_folders_for_files( + service=drive_service, + parent_id=folder_id, + traversed_parent_ids=self._retrieved_ids, + update_traversed_ids_func=self._update_traversed_parent_ids, + start=start, + end=end, + ) + + def _fetch_drive_items( + self, + is_slim: bool, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> Iterator[GoogleDriveFileType]: + self._initialize_all_class_variables() + + # Process users in parallel using ThreadPoolExecutor + with ThreadPoolExecutor(max_workers=10) as executor: + future_to_email = { + executor.submit( + self._impersonate_user_for_retrieval, email, is_slim, start, end + ): email + for email in self._all_org_emails + } + + # Yield results as they complete + for future in as_completed(future_to_email): + yield from future.result() + + remaining_folders = self._requested_folder_ids - self._retrieved_ids + if remaining_folders: + logger.warning( + f"Some folders/drives were not retrieved. IDs: {remaining_folders}" + ) def _extract_docs_from_google_drive( self, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, ) -> GenerateDocumentsOutput: - doc_batch = [] - for file in self._fetch_drive_items( - is_slim=False, - start=start, - end=end, - ): - user_email = ( - file.get("owners", [{}])[0].get("emailAddress") - or self.primary_admin_email - ) - user_drive_service = get_drive_service(self.creds, user_email=user_email) - docs_service = get_google_docs_service(self.creds, user_email=user_email) - if doc := convert_drive_item_to_document( - file=file, - drive_service=user_drive_service, - docs_service=docs_service, - ): - doc_batch.append(doc) - if len(doc_batch) >= self.batch_size: - yield doc_batch - doc_batch = [] + # Create a larger process pool for file conversion + convert_func = partial( + _convert_single_file, self.creds, self.primary_admin_email + ) - yield doc_batch + # Process files in larger batches + LARGE_BATCH_SIZE = self.batch_size * 4 + files_to_process = [] + # Gather the files into batches to be processed in parallel + for file in self._fetch_drive_items(is_slim=False, start=start, end=end): + files_to_process.append(file) + if len(files_to_process) >= LARGE_BATCH_SIZE: + yield from _process_files_batch( + files_to_process, convert_func, self.batch_size + ) + files_to_process = [] + + # Process any remaining files + if files_to_process: + yield from _process_files_batch( + files_to_process, convert_func, self.batch_size + ) def load_from_state(self) -> GenerateDocumentsOutput: try: @@ -303,18 +380,8 @@ def _extract_slim_docs_from_google_drive( start=start, end=end, ): - slim_batch.append( - SlimDocument( - id=file["webViewLink"], - perm_sync_data={ - "doc_id": file.get("id"), - "permissions": file.get("permissions", []), - "permission_ids": file.get("permissionIds", []), - "name": file.get("name"), - "owner_email": file.get("owners", [{}])[0].get("emailAddress"), - }, - ) - ) + if doc := build_slim_document(file): + slim_batch.append(doc) if len(slim_batch) >= SLIM_BATCH_SIZE: yield slim_batch slim_batch = [] diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py index 44a03818088..ec0a7837f2e 100644 --- a/backend/danswer/connectors/google_drive/doc_conversion.py +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -7,6 +7,7 @@ from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE from danswer.configs.constants import DocumentSource from danswer.configs.constants import IGNORE_FOR_QA +from danswer.connectors.google_drive.constants import DRIVE_FOLDER_TYPE from danswer.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE from danswer.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT from danswer.connectors.google_drive.models import GDriveMimeType @@ -16,6 +17,7 @@ from danswer.connectors.google_utils.resources import GoogleDriveService from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.connectors.models import SlimDocument from danswer.file_processing.extract_file_text import docx_to_text from danswer.file_processing.extract_file_text import pptx_to_text from danswer.file_processing.extract_file_text import read_pdf_file @@ -25,6 +27,7 @@ logger = setup_logger() + # these errors don't represent a failure in the connector, but simply files # that can't / shouldn't be indexed ERRORS_TO_CONTINUE_ON = [ @@ -120,6 +123,10 @@ def convert_drive_item_to_document( if file.get("mimeType") == DRIVE_SHORTCUT_TYPE: logger.info("Ignoring Drive Shortcut Filetype") return None + # Skip files that are folders + if file.get("mimeType") == DRIVE_FOLDER_TYPE: + logger.info("Ignoring Drive Folder Filetype") + return None sections: list[Section] = [] @@ -133,7 +140,6 @@ def convert_drive_item_to_document( f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'." " Falling back to basic extraction." ) - # NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc if not sections: try: @@ -150,7 +156,6 @@ def convert_drive_item_to_document( return None raise - if not sections: return None @@ -173,3 +178,20 @@ def convert_drive_item_to_document( logger.exception("Ran into exception when pulling a file from Google Drive") return None + + +def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None: + # Skip files that are folders or shortcuts + if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]: + return None + + return SlimDocument( + id=file["webViewLink"], + perm_sync_data={ + "doc_id": file.get("id"), + "permissions": file.get("permissions", []), + "permission_ids": file.get("permissionIds", []), + "name": file.get("name"), + "owner_email": file.get("owners", [{}])[0].get("emailAddress"), + }, + ) diff --git a/backend/danswer/connectors/google_drive/file_retrieval.py b/backend/danswer/connectors/google_drive/file_retrieval.py index 4f8cddab6df..c0ed998561d 100644 --- a/backend/danswer/connectors/google_drive/file_retrieval.py +++ b/backend/danswer/connectors/google_drive/file_retrieval.py @@ -1,6 +1,7 @@ from collections.abc import Callable from collections.abc import Iterator from datetime import datetime +from typing import Any from googleapiclient.discovery import Resource # type: ignore @@ -41,7 +42,6 @@ def _generate_time_range_filter( def _get_folders_in_parent( service: Resource, parent_id: str | None = None, - personal_drive: bool = False, ) -> Iterator[GoogleDriveFileType]: # Follow shortcuts to folders query = f"(mimeType = '{DRIVE_FOLDER_TYPE}' or mimeType = '{DRIVE_SHORTCUT_TYPE}')" @@ -53,9 +53,10 @@ def _get_folders_in_parent( for file in execute_paginated_retrieval( retrieval_function=service.files().list, list_key="files", - corpora="user" if personal_drive else "allDrives", - supportsAllDrives=not personal_drive, - includeItemsFromAllDrives=not personal_drive, + continue_on_404_or_403=True, + corpora="allDrives", + supportsAllDrives=True, + includeItemsFromAllDrives=True, fields=FOLDER_FIELDS, q=query, ): @@ -65,7 +66,6 @@ def _get_folders_in_parent( def _get_files_in_parent( service: Resource, parent_id: str, - personal_drive: bool, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, is_slim: bool = False, @@ -77,9 +77,10 @@ def _get_files_in_parent( for file in execute_paginated_retrieval( retrieval_function=service.files().list, list_key="files", - corpora="user" if personal_drive else "allDrives", - supportsAllDrives=not personal_drive, - includeItemsFromAllDrives=not personal_drive, + continue_on_404_or_403=True, + corpora="allDrives", + supportsAllDrives=True, + includeItemsFromAllDrives=True, fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, q=query, ): @@ -89,7 +90,6 @@ def _get_files_in_parent( def crawl_folders_for_files( service: Resource, parent_id: str, - personal_drive: bool, traversed_parent_ids: set[str], update_traversed_ids_func: Callable[[str], None], start: SecondsSinceUnixEpoch | None = None, @@ -99,29 +99,30 @@ def crawl_folders_for_files( This function starts crawling from any folder. It is slower though. """ if parent_id in traversed_parent_ids: - print(f"Skipping subfolder since already traversed: {parent_id}") + logger.info(f"Skipping subfolder since already traversed: {parent_id}") return - update_traversed_ids_func(parent_id) - - yield from _get_files_in_parent( + found_files = False + for file in _get_files_in_parent( service=service, - personal_drive=personal_drive, start=start, end=end, parent_id=parent_id, - ) + ): + found_files = True + yield file + + if found_files: + update_traversed_ids_func(parent_id) for subfolder in _get_folders_in_parent( service=service, parent_id=parent_id, - personal_drive=personal_drive, ): logger.info("Fetching all files in subfolder: " + subfolder["name"]) yield from crawl_folders_for_files( service=service, parent_id=subfolder["id"], - personal_drive=personal_drive, traversed_parent_ids=traversed_parent_ids, update_traversed_ids_func=update_traversed_ids_func, start=start, @@ -133,55 +134,59 @@ def get_files_in_shared_drive( service: Resource, drive_id: str, is_slim: bool = False, - cache_folders: bool = True, update_traversed_ids_func: Callable[[str], None] = lambda _: None, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, ) -> Iterator[GoogleDriveFileType]: # If we know we are going to folder crawl later, we can cache the folders here - if cache_folders: - # Get all folders being queried and add them to the traversed set - query = f"mimeType = '{DRIVE_FOLDER_TYPE}'" - query += " and trashed = false" - for file in execute_paginated_retrieval( - retrieval_function=service.files().list, - list_key="files", - corpora="drive", - driveId=drive_id, - supportsAllDrives=True, - includeItemsFromAllDrives=True, - fields="nextPageToken, files(id)", - q=query, - ): - update_traversed_ids_func(file["id"]) + # Get all folders being queried and add them to the traversed set + query = f"mimeType = '{DRIVE_FOLDER_TYPE}'" + query += " and trashed = false" + found_folders = False + for file in execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + continue_on_404_or_403=True, + corpora="drive", + driveId=drive_id, + supportsAllDrives=True, + includeItemsFromAllDrives=True, + fields="nextPageToken, files(id)", + q=query, + ): + update_traversed_ids_func(file["id"]) + found_folders = True + if found_folders: + update_traversed_ids_func(drive_id) # Get all files in the shared drive query = f"mimeType != '{DRIVE_FOLDER_TYPE}'" query += " and trashed = false" query += _generate_time_range_filter(start, end) - for file in execute_paginated_retrieval( + yield from execute_paginated_retrieval( retrieval_function=service.files().list, list_key="files", + continue_on_404_or_403=True, corpora="drive", driveId=drive_id, supportsAllDrives=True, includeItemsFromAllDrives=True, fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, q=query, - ): - yield file + ) -def get_files_in_my_drive( - service: Resource, - email: str, +def get_all_files_in_my_drive( + service: Any, + update_traversed_ids_func: Callable, is_slim: bool = False, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, ) -> Iterator[GoogleDriveFileType]: - query = f"mimeType != '{DRIVE_FOLDER_TYPE}' and '{email}' in owners" - query += " and trashed = false" - query += _generate_time_range_filter(start, end) + # If we know we are going to folder crawl later, we can cache the folders here + # Get all folders being queried and add them to the traversed set + query = "trashed = false and 'me' in owners" + found_folders = False for file in execute_paginated_retrieval( retrieval_function=service.files().list, list_key="files", @@ -189,7 +194,25 @@ def get_files_in_my_drive( fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, q=query, ): - yield file + update_traversed_ids_func(file["id"]) + found_folders = True + if found_folders: + update_traversed_ids_func(get_root_folder_id(service)) + + # Then get the files + query = "trashed = false and 'me' in owners" + query += _generate_time_range_filter(start, end) + fields = "files(id, name, mimeType, webViewLink, modifiedTime, createdTime)" + if not is_slim: + fields += ", files(permissions, permissionIds, owners)" + + yield from execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + corpora="user", + fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, + q=query, + ) # Just in case we need to get the root folder id diff --git a/backend/danswer/connectors/google_utils/google_kv.py b/backend/danswer/connectors/google_utils/google_kv.py index 1231293be69..7984681ed81 100644 --- a/backend/danswer/connectors/google_utils/google_kv.py +++ b/backend/danswer/connectors/google_utils/google_kv.py @@ -6,7 +6,6 @@ from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore from google_auth_oauthlib.flow import InstalledAppFlow # type: ignore -from googleapiclient.discovery import build # type: ignore from sqlalchemy.orm import Session from danswer.configs.app_configs import WEB_DOMAIN @@ -16,6 +15,8 @@ from danswer.configs.constants import KV_GMAIL_SERVICE_ACCOUNT_KEY from danswer.configs.constants import KV_GOOGLE_DRIVE_CRED_KEY from danswer.configs.constants import KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY +from danswer.connectors.google_utils.resources import get_drive_service +from danswer.connectors.google_utils.resources import get_gmail_service from danswer.connectors.google_utils.shared_constants import ( DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, ) @@ -45,8 +46,40 @@ logger = setup_logger() -def _build_frontend_google_drive_redirect() -> str: - return f"{WEB_DOMAIN}/admin/connectors/google-drive/auth/callback" +def _build_frontend_google_drive_redirect(source: DocumentSource) -> str: + if source == DocumentSource.GOOGLE_DRIVE: + return f"{WEB_DOMAIN}/admin/connectors/google-drive/auth/callback" + elif source == DocumentSource.GMAIL: + return f"{WEB_DOMAIN}/admin/connectors/gmail/auth/callback" + else: + raise ValueError(f"Unsupported source: {source}") + + +def _get_current_oauth_user(creds: OAuthCredentials, source: DocumentSource) -> str: + if source == DocumentSource.GOOGLE_DRIVE: + drive_service = get_drive_service(creds) + user_info = ( + drive_service.about() + .get( + fields="user(emailAddress)", + ) + .execute() + ) + email = user_info.get("user", {}).get("emailAddress") + elif source == DocumentSource.GMAIL: + gmail_service = get_gmail_service(creds) + user_info = ( + gmail_service.users() + .getProfile( + userId="me", + fields="emailAddress", + ) + .execute() + ) + email = user_info.get("emailAddress") + else: + raise ValueError(f"Unsupported source: {source}") + return email def verify_csrf(credential_id: int, state: str) -> None: @@ -67,8 +100,8 @@ def update_credential_access_tokens( app_credentials = get_google_app_cred(source) flow = InstalledAppFlow.from_client_config( app_credentials.model_dump(), - scopes=GOOGLE_SCOPES, - redirect_uri=_build_frontend_google_drive_redirect(), + scopes=GOOGLE_SCOPES[source], + redirect_uri=_build_frontend_google_drive_redirect(source), ) flow.fetch_token(code=auth_code) creds = flow.credentials @@ -77,15 +110,7 @@ def update_credential_access_tokens( # Get user email from Google API so we know who # the primary admin is for this connector try: - admin_service = build("drive", "v3", credentials=creds) - user_info = ( - admin_service.about() - .get( - fields="user(emailAddress)", - ) - .execute() - ) - email = user_info.get("user", {}).get("emailAddress") + email = _get_current_oauth_user(creds, source) except Exception as e: if MISSING_SCOPES_ERROR_STR in str(e): raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e @@ -120,13 +145,18 @@ def build_service_account_creds( ) -def get_auth_url(credential_id: int) -> str: - creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_CRED_KEY)) +def get_auth_url(credential_id: int, source: DocumentSource) -> str: + if source == DocumentSource.GOOGLE_DRIVE: + creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_CRED_KEY)) + elif source == DocumentSource.GMAIL: + creds_str = str(get_kv_store().load(KV_GMAIL_CRED_KEY)) + else: + raise ValueError(f"Unsupported source: {source}") credential_json = json.loads(creds_str) flow = InstalledAppFlow.from_client_config( credential_json, - scopes=GOOGLE_SCOPES, - redirect_uri=_build_frontend_google_drive_redirect(), + scopes=GOOGLE_SCOPES[source], + redirect_uri=_build_frontend_google_drive_redirect(source), ) auth_url, _ = flow.authorization_url(prompt="consent") diff --git a/backend/danswer/connectors/google_utils/google_utils.py b/backend/danswer/connectors/google_utils/google_utils.py index 037ec3731ac..0ebdd42950d 100644 --- a/backend/danswer/connectors/google_utils/google_utils.py +++ b/backend/danswer/connectors/google_utils/google_utils.py @@ -23,7 +23,7 @@ def _execute_with_retry(request: Any) -> Any: max_attempts = 10 - attempt = 0 + attempt = 1 while attempt < max_attempts: # Note for reasons unknown, the Google API will sometimes return a 429 @@ -81,7 +81,8 @@ def _execute_with_retry(request: Any) -> Any: def execute_paginated_retrieval( retrieval_function: Callable, - list_key: str, + list_key: str | None = None, + continue_on_404_or_403: bool = False, **kwargs: Any, ) -> Iterator[GoogleDriveFileType]: """Execute a paginated retrieval from Google Drive API @@ -95,8 +96,30 @@ def execute_paginated_retrieval( if next_page_token: request_kwargs["pageToken"] = next_page_token - results = add_retries(lambda: retrieval_function(**request_kwargs).execute())() + try: + results = retrieval_function(**request_kwargs).execute() + except HttpError as e: + if e.resp.status >= 500: + results = add_retries( + lambda: retrieval_function(**request_kwargs).execute() + )() + elif e.resp.status == 404 or e.resp.status == 403: + if continue_on_404_or_403: + logger.warning(f"Error executing request: {e}") + results = {} + else: + raise e + elif e.resp.status == 429: + results = _execute_with_retry( + lambda: retrieval_function(**request_kwargs).execute() + ) + else: + logger.exception("Error executing request:") + raise e next_page_token = results.get("nextPageToken") - for item in results.get(list_key, []): - yield item + if list_key: + for item in results.get(list_key, []): + yield item + else: + yield results diff --git a/backend/danswer/connectors/google_utils/resources.py b/backend/danswer/connectors/google_utils/resources.py index 341c99df155..c93c5809755 100644 --- a/backend/danswer/connectors/google_utils/resources.py +++ b/backend/danswer/connectors/google_utils/resources.py @@ -51,13 +51,13 @@ def get_drive_service( def get_admin_service( creds: ServiceAccountCredentials | OAuthCredentials, - user_email: str, + user_email: str | None = None, ) -> AdminService: return _get_google_service("admin", "directory_v1", creds, user_email) def get_gmail_service( creds: ServiceAccountCredentials | OAuthCredentials, - user_email: str, + user_email: str | None = None, ) -> GmailService: return _get_google_service("gmail", "v1", creds, user_email) diff --git a/backend/danswer/connectors/google_utils/shared_constants.py b/backend/danswer/connectors/google_utils/shared_constants.py index 3aa3d1048b6..ef3c0bb0302 100644 --- a/backend/danswer/connectors/google_utils/shared_constants.py +++ b/backend/danswer/connectors/google_utils/shared_constants.py @@ -31,7 +31,7 @@ # Documentation and error messages SCOPE_DOC_URL = "https://docs.danswer.dev/connectors/google_drive/overview" ONYX_SCOPE_INSTRUCTIONS = ( - "You have upgraded Danswer without updating the Google Drive scopes. " + "You have upgraded Danswer without updating the Google Auth scopes. " f"Please refer to the documentation to learn how to update the scopes: {SCOPE_DOC_URL}" ) diff --git a/backend/danswer/db/credentials.py b/backend/danswer/db/credentials.py index 86532dfc542..4a146c5c5f4 100644 --- a/backend/danswer/db/credentials.py +++ b/backend/danswer/db/credentials.py @@ -10,7 +10,6 @@ from danswer.auth.schemas import UserRole from danswer.configs.constants import DocumentSource -from danswer.configs.constants import KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY from danswer.connectors.google_utils.shared_constants import ( DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, ) @@ -422,23 +421,15 @@ def cleanup_google_drive_credentials(db_session: Session) -> None: db_session.commit() -def delete_gmail_service_account_credentials( - user: User | None, db_session: Session +def delete_service_account_credentials( + user: User | None, db_session: Session, source: DocumentSource ) -> None: credentials = fetch_credentials(db_session=db_session, user=user) for credential in credentials: - if credential.credential_json.get(DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY): - db_session.delete(credential) - - db_session.commit() - - -def delete_google_drive_service_account_credentials( - user: User | None, db_session: Session -) -> None: - credentials = fetch_credentials(db_session=db_session, user=user) - for credential in credentials: - if credential.credential_json.get(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY): + if ( + credential.credential_json.get(DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY) + and credential.source == source + ): db_session.delete(credential) db_session.commit() diff --git a/backend/danswer/server/documents/connector.py b/backend/danswer/server/documents/connector.py index 0e2c00d67bc..0bddcef605d 100644 --- a/backend/danswer/server/documents/connector.py +++ b/backend/danswer/server/documents/connector.py @@ -67,8 +67,7 @@ from danswer.db.credentials import cleanup_gmail_credentials from danswer.db.credentials import cleanup_google_drive_credentials from danswer.db.credentials import create_credential -from danswer.db.credentials import delete_gmail_service_account_credentials -from danswer.db.credentials import delete_google_drive_service_account_credentials +from danswer.db.credentials import delete_service_account_credentials from danswer.db.credentials import fetch_credential_by_id from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed from danswer.db.document import get_document_counts_for_cc_pairs @@ -309,13 +308,13 @@ def upsert_service_account_credential( try: credential_base = build_service_account_creds( DocumentSource.GOOGLE_DRIVE, - primary_admin_email=service_account_credential_request.google_drive_primary_admin, + primary_admin_email=service_account_credential_request.google_primary_admin, ) except KvKeyNotFoundError as e: raise HTTPException(status_code=400, detail=str(e)) # first delete all existing service account credentials - delete_google_drive_service_account_credentials(user, db_session) + delete_service_account_credentials(user, db_session, DocumentSource.GOOGLE_DRIVE) # `user=None` since this credential is not a personal credential credential = create_credential( credential_data=credential_base, user=user, db_session=db_session @@ -335,13 +334,13 @@ def upsert_gmail_service_account_credential( try: credential_base = build_service_account_creds( DocumentSource.GMAIL, - primary_admin_email=service_account_credential_request.gmail_primary_admin, + primary_admin_email=service_account_credential_request.google_primary_admin, ) except KvKeyNotFoundError as e: raise HTTPException(status_code=400, detail=str(e)) # first delete all existing service account credentials - delete_gmail_service_account_credentials(user, db_session) + delete_service_account_credentials(user, db_session, DocumentSource.GMAIL) # `user=None` since this credential is not a personal credential credential = create_credential( credential_data=credential_base, user=user, db_session=db_session @@ -894,7 +893,7 @@ def gmail_auth( httponly=True, max_age=600, ) - return AuthUrl(auth_url=get_auth_url(int(credential_id))) + return AuthUrl(auth_url=get_auth_url(int(credential_id), DocumentSource.GMAIL)) @router.get("/connector/google-drive/authorize/{credential_id}") @@ -908,7 +907,9 @@ def google_drive_auth( httponly=True, max_age=600, ) - return AuthUrl(auth_url=get_auth_url(int(credential_id))) + return AuthUrl( + auth_url=get_auth_url(int(credential_id), DocumentSource.GOOGLE_DRIVE) + ) @router.get("/connector/gmail/callback") @@ -925,12 +926,10 @@ def gmail_callback( ) credential_id = int(credential_id_cookie) verify_csrf(credential_id, callback.state) - if ( - update_credential_access_tokens( - callback.code, credential_id, user, db_session, DocumentSource.GMAIL - ) - is None - ): + credentials: Credentials | None = update_credential_access_tokens( + callback.code, credential_id, user, db_session, DocumentSource.GMAIL + ) + if credentials is None: raise HTTPException( status_code=500, detail="Unable to fetch Gmail access tokens" ) diff --git a/backend/danswer/server/documents/models.py b/backend/danswer/server/documents/models.py index d885641e398..fbd7c1e59e4 100644 --- a/backend/danswer/server/documents/models.py +++ b/backend/danswer/server/documents/models.py @@ -4,7 +4,6 @@ from pydantic import BaseModel from pydantic import Field -from pydantic import model_validator from danswer.configs.app_configs import MASK_CREDENTIAL_PREFIX from danswer.configs.constants import DocumentSource @@ -377,18 +376,7 @@ class GoogleServiceAccountKey(BaseModel): class GoogleServiceAccountCredentialRequest(BaseModel): - google_drive_primary_admin: str | None = None # email of user to impersonate - gmail_primary_admin: str | None = None # email of user to impersonate - - @model_validator(mode="after") - def check_user_delegation(self) -> "GoogleServiceAccountCredentialRequest": - if (self.google_drive_primary_admin is None) == ( - self.gmail_primary_admin is None - ): - raise ValueError( - "Exactly one of google_drive_primary_admin or gmail_primary_admin must be set" - ) - return self + google_primary_admin: str | None = None # email of user to impersonate class FileUploadResponse(BaseModel): diff --git a/backend/tests/daily/connectors/google_drive/helpers.py b/backend/tests/daily/connectors/google_drive/consts_and_utils.py similarity index 51% rename from backend/tests/daily/connectors/google_drive/helpers.py rename to backend/tests/daily/connectors/google_drive/consts_and_utils.py index 7a120412e9d..f808f6812e7 100644 --- a/backend/tests/daily/connectors/google_drive/helpers.py +++ b/backend/tests/daily/connectors/google_drive/consts_and_utils.py @@ -6,128 +6,99 @@ SHARED_DRIVE_FILES = list(range(20, 25)) -_ADMIN_FILE_IDS = list(range(0, 5)) -_TEST_USER_1_FILE_IDS = list(range(5, 10)) -_TEST_USER_2_FILE_IDS = list(range(10, 15)) -_TEST_USER_3_FILE_IDS = list(range(15, 20)) -_SHARED_DRIVE_1_FILE_IDS = list(range(20, 25)) -_FOLDER_1_FILE_IDS = list(range(25, 30)) -_FOLDER_1_1_FILE_IDS = list(range(30, 35)) -_FOLDER_1_2_FILE_IDS = list(range(35, 40)) -_SHARED_DRIVE_2_FILE_IDS = list(range(40, 45)) -_FOLDER_2_FILE_IDS = list(range(45, 50)) -_FOLDER_2_1_FILE_IDS = list(range(50, 55)) -_FOLDER_2_2_FILE_IDS = list(range(55, 60)) -_SECTIONS_FILE_IDS = [61] - -_PUBLIC_FOLDER_RANGE = _FOLDER_1_2_FILE_IDS -_PUBLIC_FILE_IDS = list(range(55, 57)) -PUBLIC_RANGE = _PUBLIC_FOLDER_RANGE + _PUBLIC_FILE_IDS - -_SHARED_DRIVE_1_URL = "https://drive.google.com/drive/folders/0AC_OJ4BkMd4kUk9PVA" +ADMIN_FILE_IDS = list(range(0, 5)) +ADMIN_FOLDER_3_FILE_IDS = list(range(65, 70)) +TEST_USER_1_FILE_IDS = list(range(5, 10)) +TEST_USER_2_FILE_IDS = list(range(10, 15)) +TEST_USER_3_FILE_IDS = list(range(15, 20)) +SHARED_DRIVE_1_FILE_IDS = list(range(20, 25)) +FOLDER_1_FILE_IDS = list(range(25, 30)) +FOLDER_1_1_FILE_IDS = list(range(30, 35)) +FOLDER_1_2_FILE_IDS = list(range(35, 40)) +SHARED_DRIVE_2_FILE_IDS = list(range(40, 45)) +FOLDER_2_FILE_IDS = list(range(45, 50)) +FOLDER_2_1_FILE_IDS = list(range(50, 55)) +FOLDER_2_2_FILE_IDS = list(range(55, 60)) +SECTIONS_FILE_IDS = [61] + +PUBLIC_FOLDER_RANGE = FOLDER_1_2_FILE_IDS +PUBLIC_FILE_IDS = list(range(55, 57)) +PUBLIC_RANGE = PUBLIC_FOLDER_RANGE + PUBLIC_FILE_IDS + +SHARED_DRIVE_1_URL = "https://drive.google.com/drive/folders/0AC_OJ4BkMd4kUk9PVA" # Group 1 is given access to this folder -_FOLDER_1_URL = ( +FOLDER_1_URL = ( "https://drive.google.com/drive/folders/1d3I7U3vUZMDziF1OQqYRkB8Jp2s_GWUn" ) -_FOLDER_1_1_URL = ( +FOLDER_1_1_URL = ( "https://drive.google.com/drive/folders/1aR33-zwzl_mnRAwH55GgtWTE-4A4yWWI" ) -_FOLDER_1_2_URL = ( +FOLDER_1_2_URL = ( "https://drive.google.com/drive/folders/1IO0X55VhvLXf4mdxzHxuKf4wxrDBB6jq" ) -_SHARED_DRIVE_2_URL = "https://drive.google.com/drive/folders/0ABKspIh7P4f4Uk9PVA" -_FOLDER_2_URL = ( +SHARED_DRIVE_2_URL = "https://drive.google.com/drive/folders/0ABKspIh7P4f4Uk9PVA" +FOLDER_2_URL = ( "https://drive.google.com/drive/folders/1lNpCJ1teu8Se0louwL0oOHK9nEalskof" ) -_FOLDER_2_1_URL = ( +FOLDER_2_1_URL = ( "https://drive.google.com/drive/folders/1XeDOMWwxTDiVr9Ig2gKum3Zq_Wivv6zY" ) -_FOLDER_2_2_URL = ( +FOLDER_2_2_URL = ( "https://drive.google.com/drive/folders/1RKlsexA8h7NHvBAWRbU27MJotic7KXe3" ) +FOLDER_3_URL = ( + "https://drive.google.com/drive/folders/1LHibIEXfpUmqZ-XjBea44SocA91Nkveu" +) +SECTIONS_FOLDER_URL = ( + "https://drive.google.com/drive/u/5/folders/1loe6XJ-pJxu9YYPv7cF3Hmz296VNzA33" +) -_ADMIN_EMAIL = "admin@onyx-test.com" -_TEST_USER_1_EMAIL = "test_user_1@onyx-test.com" -_TEST_USER_2_EMAIL = "test_user_2@onyx-test.com" -_TEST_USER_3_EMAIL = "test_user_3@onyx-test.com" - -# Dictionary for ranges -DRIVE_ID_MAPPING: dict[str, list[int]] = { - "ADMIN": _ADMIN_FILE_IDS, - "TEST_USER_1": _TEST_USER_1_FILE_IDS, - "TEST_USER_2": _TEST_USER_2_FILE_IDS, - "TEST_USER_3": _TEST_USER_3_FILE_IDS, - "SHARED_DRIVE_1": _SHARED_DRIVE_1_FILE_IDS, - "FOLDER_1": _FOLDER_1_FILE_IDS, - "FOLDER_1_1": _FOLDER_1_1_FILE_IDS, - "FOLDER_1_2": _FOLDER_1_2_FILE_IDS, - "SHARED_DRIVE_2": _SHARED_DRIVE_2_FILE_IDS, - "FOLDER_2": _FOLDER_2_FILE_IDS, - "FOLDER_2_1": _FOLDER_2_1_FILE_IDS, - "FOLDER_2_2": _FOLDER_2_2_FILE_IDS, - "SECTIONS": _SECTIONS_FILE_IDS, -} - -# Dictionary for emails -EMAIL_MAPPING: dict[str, str] = { - "ADMIN": _ADMIN_EMAIL, - "TEST_USER_1": _TEST_USER_1_EMAIL, - "TEST_USER_2": _TEST_USER_2_EMAIL, - "TEST_USER_3": _TEST_USER_3_EMAIL, -} - -# Dictionary for URLs -URL_MAPPING: dict[str, str] = { - "SHARED_DRIVE_1": _SHARED_DRIVE_1_URL, - "FOLDER_1": _FOLDER_1_URL, - "FOLDER_1_1": _FOLDER_1_1_URL, - "FOLDER_1_2": _FOLDER_1_2_URL, - "SHARED_DRIVE_2": _SHARED_DRIVE_2_URL, - "FOLDER_2": _FOLDER_2_URL, - "FOLDER_2_1": _FOLDER_2_1_URL, - "FOLDER_2_2": _FOLDER_2_2_URL, -} +ADMIN_EMAIL = "admin@onyx-test.com" +TEST_USER_1_EMAIL = "test_user_1@onyx-test.com" +TEST_USER_2_EMAIL = "test_user_2@onyx-test.com" +TEST_USER_3_EMAIL = "test_user_3@onyx-test.com" # Dictionary for access permissions # All users have access to their own My Drive as well as public files ACCESS_MAPPING: dict[str, list[int]] = { # Admin has access to everything in shared - "ADMIN": ( - _ADMIN_FILE_IDS - + _SHARED_DRIVE_1_FILE_IDS - + _FOLDER_1_FILE_IDS - + _FOLDER_1_1_FILE_IDS - + _FOLDER_1_2_FILE_IDS - + _SHARED_DRIVE_2_FILE_IDS - + _FOLDER_2_FILE_IDS - + _FOLDER_2_1_FILE_IDS - + _FOLDER_2_2_FILE_IDS - + _SECTIONS_FILE_IDS + ADMIN_EMAIL: ( + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS ), # This user has access to drive 1 # This user has redundant access to folder 1 because of group access # This user has been given individual access to files in Admin's My Drive - "TEST_USER_1": ( - _TEST_USER_1_FILE_IDS - + _SHARED_DRIVE_1_FILE_IDS - + _FOLDER_1_FILE_IDS - + _FOLDER_1_1_FILE_IDS - + _FOLDER_1_2_FILE_IDS + TEST_USER_1_EMAIL: ( + TEST_USER_1_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + list(range(0, 2)) ), # Group 1 includes this user, giving access to folder 1 # This user has also been given access to folder 2-1 # This user has also been given individual access to files in folder 2 - "TEST_USER_2": ( - _TEST_USER_2_FILE_IDS - + _FOLDER_1_FILE_IDS - + _FOLDER_1_1_FILE_IDS - + _FOLDER_1_2_FILE_IDS - + _FOLDER_2_1_FILE_IDS + TEST_USER_2_EMAIL: ( + TEST_USER_2_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + list(range(45, 47)) ), # This user can only see his own files and public files - "TEST_USER_3": _TEST_USER_3_FILE_IDS, + TEST_USER_3_EMAIL: TEST_USER_3_FILE_IDS, } SPECIAL_FILE_ID_TO_CONTENT_MAP: dict[int, str] = { diff --git a/backend/tests/daily/connectors/google_drive/test_google_drive_oauth.py b/backend/tests/daily/connectors/google_drive/test_google_drive_oauth.py index 9e788a7ffa2..cf9776c1806 100644 --- a/backend/tests/daily/connectors/google_drive/test_google_drive_oauth.py +++ b/backend/tests/daily/connectors/google_drive/test_google_drive_oauth.py @@ -5,12 +5,29 @@ from danswer.connectors.google_drive.connector import GoogleDriveConnector from danswer.connectors.models import Document -from tests.daily.connectors.google_drive.helpers import ( +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ( assert_retrieved_docs_match_expected, ) -from tests.daily.connectors.google_drive.helpers import DRIVE_ID_MAPPING -from tests.daily.connectors.google_drive.helpers import EMAIL_MAPPING -from tests.daily.connectors.google_drive.helpers import URL_MAPPING +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_3_URL +from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_EMAIL @patch( @@ -32,16 +49,17 @@ def test_include_all( # Should get everything in shared and admin's My Drive with oauth expected_file_ids = ( - DRIVE_ID_MAPPING["ADMIN"] - + DRIVE_ID_MAPPING["SHARED_DRIVE_1"] - + DRIVE_ID_MAPPING["FOLDER_1"] - + DRIVE_ID_MAPPING["FOLDER_1_1"] - + DRIVE_ID_MAPPING["FOLDER_1_2"] - + DRIVE_ID_MAPPING["SHARED_DRIVE_2"] - + DRIVE_ID_MAPPING["FOLDER_2"] - + DRIVE_ID_MAPPING["FOLDER_2_1"] - + DRIVE_ID_MAPPING["FOLDER_2_2"] - + DRIVE_ID_MAPPING["SECTIONS"] + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, @@ -68,15 +86,15 @@ def test_include_shared_drives_only( # Should only get shared drives expected_file_ids = ( - DRIVE_ID_MAPPING["SHARED_DRIVE_1"] - + DRIVE_ID_MAPPING["FOLDER_1"] - + DRIVE_ID_MAPPING["FOLDER_1_1"] - + DRIVE_ID_MAPPING["FOLDER_1_2"] - + DRIVE_ID_MAPPING["SHARED_DRIVE_2"] - + DRIVE_ID_MAPPING["FOLDER_2"] - + DRIVE_ID_MAPPING["FOLDER_2_1"] - + DRIVE_ID_MAPPING["FOLDER_2_2"] - + DRIVE_ID_MAPPING["SECTIONS"] + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, @@ -101,8 +119,8 @@ def test_include_my_drives_only( for doc_batch in connector.poll_source(0, time.time()): retrieved_docs.extend(doc_batch) - # Should only get everyone's My Drives - expected_file_ids = list(range(0, 5)) # Admin's My Drive only + # Should only get primary_admins My Drive because we are impersonating them + expected_file_ids = ADMIN_FILE_IDS + ADMIN_FOLDER_3_FILE_IDS assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, expected_file_ids=expected_file_ids, @@ -118,9 +136,7 @@ def test_drive_one_only( google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], ) -> None: print("\n\nRunning test_drive_one_only") - drive_urls = [ - URL_MAPPING["SHARED_DRIVE_1"], - ] + drive_urls = [SHARED_DRIVE_1_URL] connector = google_drive_oauth_connector_factory( include_shared_drives=True, include_my_drives=False, @@ -130,8 +146,12 @@ def test_drive_one_only( for doc_batch in connector.poll_source(0, time.time()): retrieved_docs.extend(doc_batch) - # We ignore shared_drive_urls if include_shared_drives is False - expected_file_ids = list(range(20, 40)) # Shared Drive 1 and its folders + expected_file_ids = ( + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, expected_file_ids=expected_file_ids, @@ -147,8 +167,8 @@ def test_folder_and_shared_drive( google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], ) -> None: print("\n\nRunning test_folder_and_shared_drive") - drive_urls = [URL_MAPPING["SHARED_DRIVE_1"]] - folder_urls = [URL_MAPPING["FOLDER_2"]] + drive_urls = [SHARED_DRIVE_1_URL] + folder_urls = [FOLDER_2_URL] connector = google_drive_oauth_connector_factory( include_shared_drives=True, include_my_drives=True, @@ -159,11 +179,16 @@ def test_folder_and_shared_drive( for doc_batch in connector.poll_source(0, time.time()): retrieved_docs.extend(doc_batch) - # Should expected_file_ids = ( - list(range(0, 5)) # Admin's My Drive - + list(range(20, 40)) # Shared Drive 1 and its folders - + list(range(45, 60)) # Folder 2 and its subfolders + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, @@ -181,23 +206,32 @@ def test_folders_only( ) -> None: print("\n\nRunning test_folders_only") folder_urls = [ - URL_MAPPING["FOLDER_1_1"], - URL_MAPPING["FOLDER_1_2"], - URL_MAPPING["FOLDER_2_1"], - URL_MAPPING["FOLDER_2_2"], + FOLDER_1_2_URL, + FOLDER_2_1_URL, + FOLDER_2_2_URL, + FOLDER_3_URL, + ] + # This should get converted to a drive request and spit out a warning in the logs + shared_drive_urls = [ + FOLDER_1_1_URL, ] connector = google_drive_oauth_connector_factory( include_shared_drives=False, include_my_drives=False, + shared_drive_urls=",".join([str(url) for url in shared_drive_urls]), shared_folder_urls=",".join([str(url) for url in folder_urls]), ) retrieved_docs: list[Document] = [] for doc_batch in connector.poll_source(0, time.time()): retrieved_docs.extend(doc_batch) - expected_file_ids = list(range(30, 40)) + list( # Folders 1_1 and 1_2 - range(50, 60) - ) # Folders 2_1 and 2_2 + expected_file_ids = ( + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, expected_file_ids=expected_file_ids, @@ -214,8 +248,8 @@ def test_specific_emails( ) -> None: print("\n\nRunning test_specific_emails") my_drive_emails = [ - EMAIL_MAPPING["TEST_USER_1"], - EMAIL_MAPPING["TEST_USER_3"], + TEST_USER_1_EMAIL, + TEST_USER_3_EMAIL, ] connector = google_drive_oauth_connector_factory( include_shared_drives=False, @@ -228,7 +262,35 @@ def test_specific_emails( # No matter who is specified, when using oauth, if include_my_drives is True, # we will get all the files from the admin's My Drive - expected_file_ids = DRIVE_ID_MAPPING["ADMIN"] + expected_file_ids = ADMIN_FILE_IDS + ADMIN_FOLDER_3_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_personal_folders_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_personal_folders_only") + folder_urls = [ + FOLDER_3_URL, + ] + connector = google_drive_oauth_connector_factory( + include_shared_drives=False, + include_my_drives=False, + shared_folder_urls=",".join([str(url) for url in folder_urls]), + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ADMIN_FOLDER_3_FILE_IDS assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, expected_file_ids=expected_file_ids, diff --git a/backend/tests/daily/connectors/google_drive/test_google_drive_sections.py b/backend/tests/daily/connectors/google_drive/test_google_drive_sections.py index 4f4556a06d6..b6d2423a092 100644 --- a/backend/tests/daily/connectors/google_drive/test_google_drive_sections.py +++ b/backend/tests/daily/connectors/google_drive/test_google_drive_sections.py @@ -5,11 +5,7 @@ from danswer.connectors.google_drive.connector import GoogleDriveConnector from danswer.connectors.models import Document - - -SECTIONS_FOLDER_URL = ( - "https://drive.google.com/drive/u/5/folders/1loe6XJ-pJxu9YYPv7cF3Hmz296VNzA33" -) +from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FOLDER_URL @patch( diff --git a/backend/tests/daily/connectors/google_drive/test_google_drive_service_acct.py b/backend/tests/daily/connectors/google_drive/test_google_drive_service_acct.py index 3ea2c6ddf49..5dcbb7f5b64 100644 --- a/backend/tests/daily/connectors/google_drive/test_google_drive_service_acct.py +++ b/backend/tests/daily/connectors/google_drive/test_google_drive_service_acct.py @@ -5,12 +5,32 @@ from danswer.connectors.google_drive.connector import GoogleDriveConnector from danswer.connectors.models import Document -from tests.daily.connectors.google_drive.helpers import ( +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ( assert_retrieved_docs_match_expected, ) -from tests.daily.connectors.google_drive.helpers import DRIVE_ID_MAPPING -from tests.daily.connectors.google_drive.helpers import EMAIL_MAPPING -from tests.daily.connectors.google_drive.helpers import URL_MAPPING +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_3_URL +from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_FILE_IDS @patch( @@ -32,19 +52,20 @@ def test_include_all( # Should get everything expected_file_ids = ( - DRIVE_ID_MAPPING["ADMIN"] - + DRIVE_ID_MAPPING["TEST_USER_1"] - + DRIVE_ID_MAPPING["TEST_USER_2"] - + DRIVE_ID_MAPPING["TEST_USER_3"] - + DRIVE_ID_MAPPING["SHARED_DRIVE_1"] - + DRIVE_ID_MAPPING["FOLDER_1"] - + DRIVE_ID_MAPPING["FOLDER_1_1"] - + DRIVE_ID_MAPPING["FOLDER_1_2"] - + DRIVE_ID_MAPPING["SHARED_DRIVE_2"] - + DRIVE_ID_MAPPING["FOLDER_2"] - + DRIVE_ID_MAPPING["FOLDER_2_1"] - + DRIVE_ID_MAPPING["FOLDER_2_2"] - + DRIVE_ID_MAPPING["SECTIONS"] + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + TEST_USER_1_FILE_IDS + + TEST_USER_2_FILE_IDS + + TEST_USER_3_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, @@ -71,15 +92,15 @@ def test_include_shared_drives_only( # Should only get shared drives expected_file_ids = ( - DRIVE_ID_MAPPING["SHARED_DRIVE_1"] - + DRIVE_ID_MAPPING["FOLDER_1"] - + DRIVE_ID_MAPPING["FOLDER_1_1"] - + DRIVE_ID_MAPPING["FOLDER_1_2"] - + DRIVE_ID_MAPPING["SHARED_DRIVE_2"] - + DRIVE_ID_MAPPING["FOLDER_2"] - + DRIVE_ID_MAPPING["FOLDER_2_1"] - + DRIVE_ID_MAPPING["FOLDER_2_2"] - + DRIVE_ID_MAPPING["SECTIONS"] + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, @@ -105,7 +126,13 @@ def test_include_my_drives_only( retrieved_docs.extend(doc_batch) # Should only get everyone's My Drives - expected_file_ids = list(range(0, 20)) # All My Drives + expected_file_ids = ( + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + TEST_USER_1_FILE_IDS + + TEST_USER_2_FILE_IDS + + TEST_USER_3_FILE_IDS + ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, expected_file_ids=expected_file_ids, @@ -121,7 +148,7 @@ def test_drive_one_only( google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], ) -> None: print("\n\nRunning test_drive_one_only") - urls = [URL_MAPPING["SHARED_DRIVE_1"]] + urls = [SHARED_DRIVE_1_URL] connector = google_drive_service_acct_connector_factory( include_shared_drives=True, include_my_drives=False, @@ -132,7 +159,12 @@ def test_drive_one_only( retrieved_docs.extend(doc_batch) # We ignore shared_drive_urls if include_shared_drives is False - expected_file_ids = list(range(20, 40)) # Shared Drive 1 and its folders + expected_file_ids = ( + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, expected_file_ids=expected_file_ids, @@ -148,10 +180,8 @@ def test_folder_and_shared_drive( google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], ) -> None: print("\n\nRunning test_folder_and_shared_drive") - drive_urls = [ - URL_MAPPING["SHARED_DRIVE_1"], - ] - folder_urls = [URL_MAPPING["FOLDER_2"]] + drive_urls = [SHARED_DRIVE_1_URL] + folder_urls = [FOLDER_2_URL] connector = google_drive_service_acct_connector_factory( include_shared_drives=True, include_my_drives=True, @@ -162,11 +192,20 @@ def test_folder_and_shared_drive( for doc_batch in connector.poll_source(0, time.time()): retrieved_docs.extend(doc_batch) - # Should + # Should get everything except for the top level files in drive 2 expected_file_ids = ( - list(range(0, 20)) # All My Drives - + list(range(20, 40)) # Shared Drive 1 and its folders - + list(range(45, 60)) # Folder 2 and its subfolders + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + TEST_USER_1_FILE_IDS + + TEST_USER_2_FILE_IDS + + TEST_USER_3_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, @@ -184,23 +223,32 @@ def test_folders_only( ) -> None: print("\n\nRunning test_folders_only") folder_urls = [ - URL_MAPPING["FOLDER_1_1"], - URL_MAPPING["FOLDER_1_2"], - URL_MAPPING["FOLDER_2_1"], - URL_MAPPING["FOLDER_2_2"], + FOLDER_1_2_URL, + FOLDER_2_1_URL, + FOLDER_2_2_URL, + FOLDER_3_URL, + ] + # This should get converted to a drive request and spit out a warning in the logs + shared_drive_urls = [ + FOLDER_1_1_URL, ] connector = google_drive_service_acct_connector_factory( include_shared_drives=False, include_my_drives=False, + shared_drive_urls=",".join([str(url) for url in shared_drive_urls]), shared_folder_urls=",".join([str(url) for url in folder_urls]), ) retrieved_docs: list[Document] = [] for doc_batch in connector.poll_source(0, time.time()): retrieved_docs.extend(doc_batch) - expected_file_ids = list(range(30, 40)) + list( # Folders 1_1 and 1_2 - range(50, 60) - ) # Folders 2_1 and 2_2 + expected_file_ids = ( + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + ) assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, expected_file_ids=expected_file_ids, @@ -217,8 +265,37 @@ def test_specific_emails( ) -> None: print("\n\nRunning test_specific_emails") my_drive_emails = [ - EMAIL_MAPPING["TEST_USER_1"], - EMAIL_MAPPING["TEST_USER_3"], + TEST_USER_1_EMAIL, + TEST_USER_3_EMAIL, + ] + connector = google_drive_service_acct_connector_factory( + include_shared_drives=False, + include_my_drives=True, + my_drive_emails=",".join([str(email) for email in my_drive_emails]), + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = TEST_USER_1_FILE_IDS + TEST_USER_3_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def get_specific_folders_in_my_drive( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning get_specific_folders_in_my_drive") + my_drive_emails = [ + TEST_USER_1_EMAIL, + TEST_USER_3_EMAIL, ] connector = google_drive_service_acct_connector_factory( include_shared_drives=False, @@ -229,9 +306,35 @@ def test_specific_emails( for doc_batch in connector.poll_source(0, time.time()): retrieved_docs.extend(doc_batch) - expected_file_ids = list(range(5, 10)) + list( - range(15, 20) - ) # TEST_USER_1 and TEST_USER_3 My Drives + expected_file_ids = TEST_USER_1_FILE_IDS + TEST_USER_3_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_personal_folders_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_personal_folders_only") + folder_urls = [ + FOLDER_3_URL, + ] + connector = google_drive_oauth_connector_factory( + include_shared_drives=False, + include_my_drives=False, + shared_folder_urls=",".join([str(url) for url in folder_urls]), + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ADMIN_FOLDER_3_FILE_IDS assert_retrieved_docs_match_expected( retrieved_docs=retrieved_docs, expected_file_ids=expected_file_ids, diff --git a/backend/tests/daily/connectors/google_drive/test_google_drive_slim_docs.py b/backend/tests/daily/connectors/google_drive/test_google_drive_slim_docs.py index 1361c198849..c9d70d814a1 100644 --- a/backend/tests/daily/connectors/google_drive/test_google_drive_slim_docs.py +++ b/backend/tests/daily/connectors/google_drive/test_google_drive_slim_docs.py @@ -10,10 +10,28 @@ from ee.danswer.external_permissions.google_drive.doc_sync import ( _get_permissions_from_slim_doc, ) -from tests.daily.connectors.google_drive.helpers import EMAIL_MAPPING -from tests.daily.connectors.google_drive.helpers import file_name_template -from tests.daily.connectors.google_drive.helpers import print_discrepencies -from tests.daily.connectors.google_drive.helpers import PUBLIC_RANGE +from tests.daily.connectors.google_drive.consts_and_utils import ACCESS_MAPPING +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import file_name_template +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import print_discrepencies +from tests.daily.connectors.google_drive.consts_and_utils import PUBLIC_RANGE +from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_FILE_IDS def get_keys_available_to_user_from_access_map( @@ -113,72 +131,71 @@ def test_all_permissions( ) access_map: dict[str, ExternalAccess] = {} + found_file_names = set() for slim_doc_batch in google_drive_connector.retrieve_all_slim_documents( 0, time.time() ): for slim_doc in slim_doc_batch: - access_map[ - (slim_doc.perm_sync_data or {})["name"] - ] = _get_permissions_from_slim_doc( + name = (slim_doc.perm_sync_data or {})["name"] + access_map[name] = _get_permissions_from_slim_doc( google_drive_connector=google_drive_connector, slim_doc=slim_doc, ) + found_file_names.add(name) for file_name, external_access in access_map.items(): print(file_name, external_access) expected_file_range = ( - list(range(0, 5)) # Admin's My Drive - + list(range(5, 10)) # TEST_USER_1's My Drive - + list(range(10, 15)) # TEST_USER_2's My Drive - + list(range(15, 20)) # TEST_USER_3's My Drive - + list(range(20, 25)) # Shared Drive 1 - + list(range(25, 30)) # Folder 1 - + list(range(30, 35)) # Folder 1_1 - + list(range(35, 40)) # Folder 1_2 - + list(range(40, 45)) # Shared Drive 2 - + list(range(45, 50)) # Folder 2 - + list(range(50, 55)) # Folder 2_1 - + list(range(55, 60)) # Folder 2_2 - + [61] # Sections + ADMIN_FILE_IDS # Admin's My Drive + + ADMIN_FOLDER_3_FILE_IDS # Admin's Folder 3 + + TEST_USER_1_FILE_IDS # TEST_USER_1's My Drive + + TEST_USER_2_FILE_IDS # TEST_USER_2's My Drive + + TEST_USER_3_FILE_IDS # TEST_USER_3's My Drive + + SHARED_DRIVE_1_FILE_IDS # Shared Drive 1 + + FOLDER_1_FILE_IDS # Folder 1 + + FOLDER_1_1_FILE_IDS # Folder 1_1 + + FOLDER_1_2_FILE_IDS # Folder 1_2 + + SHARED_DRIVE_2_FILE_IDS # Shared Drive 2 + + FOLDER_2_FILE_IDS # Folder 2 + + FOLDER_2_1_FILE_IDS # Folder 2_1 + + FOLDER_2_2_FILE_IDS # Folder 2_2 + + SECTIONS_FILE_IDS # Sections ) + expected_file_names = { + file_name_template.format(file_id) for file_id in expected_file_range + } # Should get everything - assert len(access_map) == len(expected_file_range) + print_discrepencies(expected_file_names, found_file_names) + assert expected_file_names == found_file_names group_map = get_group_map(google_drive_connector) print("groups:\n", group_map) assert_correct_access_for_user( - user_email=EMAIL_MAPPING["ADMIN"], - expected_access_ids=list(range(0, 5)) # Admin's My Drive - + list(range(20, 60)) # All shared drive content - + [61], # Sections + user_email=ADMIN_EMAIL, + expected_access_ids=ACCESS_MAPPING[ADMIN_EMAIL], group_map=group_map, retrieved_access_map=access_map, ) assert_correct_access_for_user( - user_email=EMAIL_MAPPING["TEST_USER_1"], - expected_access_ids=list(range(5, 10)) # TEST_USER_1's My Drive - + list(range(20, 40)) # Shared Drive 1 and its folders - + list(range(0, 2)), # Access to some of Admin's files + user_email=TEST_USER_1_EMAIL, + expected_access_ids=ACCESS_MAPPING[TEST_USER_1_EMAIL], group_map=group_map, retrieved_access_map=access_map, ) assert_correct_access_for_user( - user_email=EMAIL_MAPPING["TEST_USER_2"], - expected_access_ids=list(range(10, 15)) # TEST_USER_2's My Drive - + list(range(25, 40)) # Folder 1 and its subfolders - + list(range(50, 55)) # Folder 2_1 - + list(range(45, 47)), # Some files in Folder 2 + user_email=TEST_USER_2_EMAIL, + expected_access_ids=ACCESS_MAPPING[TEST_USER_2_EMAIL], group_map=group_map, retrieved_access_map=access_map, ) assert_correct_access_for_user( - user_email=EMAIL_MAPPING["TEST_USER_3"], - expected_access_ids=list(range(15, 20)), # TEST_USER_3's My Drive only + user_email=TEST_USER_3_EMAIL, + expected_access_ids=ACCESS_MAPPING[TEST_USER_3_EMAIL], group_map=group_map, retrieved_access_map=access_map, ) diff --git a/web/src/app/admin/connectors/[connector]/pages/gdrive/Credential.tsx b/web/src/app/admin/connectors/[connector]/pages/gdrive/Credential.tsx index d0f5d58e858..a68296f6f6c 100644 --- a/web/src/app/admin/connectors/[connector]/pages/gdrive/Credential.tsx +++ b/web/src/app/admin/connectors/[connector]/pages/gdrive/Credential.tsx @@ -360,22 +360,12 @@ export const DriveAuthSection = ({ if (serviceAccountKeyData?.service_account_email) { return (
-

- When using a Google Drive Service Account, you must specify the email - of the primary admin that you would like the service account to - impersonate. -
-
- Ideally, this account should be an owner/admin of the Google - Organization that owns the Google Drive(s) you want to index. -

- (
diff --git a/web/src/app/admin/connectors/[connector]/pages/gdrive/GoogleDrivePage.tsx b/web/src/app/admin/connectors/[connector]/pages/gdrive/GoogleDrivePage.tsx index a4efe283683..9ff60ebc223 100644 --- a/web/src/app/admin/connectors/[connector]/pages/gdrive/GoogleDrivePage.tsx +++ b/web/src/app/admin/connectors/[connector]/pages/gdrive/GoogleDrivePage.tsx @@ -101,12 +101,14 @@ const GDriveMain = ({}: {}) => { | Credential | undefined = credentialsData.find( (credential) => - credential.credential_json?.google_drive_tokens && credential.admin_public + credential.credential_json?.google_tokens && + credential.admin_public && + credential.source === "google_drive" ); const googleDriveServiceAccountCredential: | Credential | undefined = credentialsData.find( - (credential) => credential.credential_json?.google_drive_service_account_key + (credential) => credential.credential_json?.google_service_account_key ); const googleDriveConnectorIndexingStatuses: ConnectorIndexingStatus< diff --git a/web/src/app/admin/connectors/[connector]/pages/gmail/Credential.tsx b/web/src/app/admin/connectors/[connector]/pages/gmail/Credential.tsx index 5b6787b301c..d413e4b0169 100644 --- a/web/src/app/admin/connectors/[connector]/pages/gmail/Credential.tsx +++ b/web/src/app/admin/connectors/[connector]/pages/gmail/Credential.tsx @@ -276,10 +276,12 @@ export const GmailJsonUploadSection = ({ > here {" "} - to setup a google OAuth App in your company workspace. + to either (1) setup a google OAuth App in your company workspace or (2) + create a Service Account.

- Download the credentials JSON and upload it here. + Download the credentials JSON if choosing option (1) or the Service + Account key JSON if chooosing option (2), and upload it here.

@@ -344,23 +346,13 @@ export const GmailAuthSection = ({ if (serviceAccountKeyData?.service_account_email) { return (
-

- When using a Gmail Service Account, you must specify the email of the - primary admin that you would like the service account to impersonate. -
-
- For this connector to index all users Gmail, the primary admin email - should be an owner/admin of the Google Organization that being - indexed. -

- { formikHelpers.setSubmitting(true); @@ -373,7 +365,7 @@ export const GmailAuthSection = ({ "Content-Type": "application/json", }, body: JSON.stringify({ - gmail_primary_admin: values.gmail_primary_admin, + google_primary_admin: values.google_primary_admin, }), } ); @@ -396,7 +388,7 @@ export const GmailAuthSection = ({ {({ isSubmitting }) => ( @@ -457,8 +449,8 @@ export const GmailAuthSection = ({ // case where no keys have been uploaded in step 1 return (

- Please upload a OAuth Client Credential JSON in Step 1 before moving onto - Step 2. + Please upload an OAuth or Service Account Credential JSON in Step 1 before + moving onto Step 2.

); }; diff --git a/web/src/app/admin/connectors/[connector]/pages/gmail/GmailPage.tsx b/web/src/app/admin/connectors/[connector]/pages/gmail/GmailPage.tsx index 0ee95e0037c..b3120fbefdf 100644 --- a/web/src/app/admin/connectors/[connector]/pages/gmail/GmailPage.tsx +++ b/web/src/app/admin/connectors/[connector]/pages/gmail/GmailPage.tsx @@ -109,12 +109,16 @@ export const GmailMain = () => { const gmailPublicCredential: Credential | undefined = credentialsData.find( (credential) => - credential.credential_json?.gmail_tokens && credential.admin_public + (credential.credential_json?.google_service_account_key || + credential.credential_json?.google_tokens) && + credential.admin_public ); const gmailServiceAccountCredential: | Credential | undefined = credentialsData.find( - (credential) => credential.credential_json?.gmail_service_account_key + (credential) => + credential.credential_json?.google_service_account_key && + credential.source === "gmail" ); const gmailConnectorIndexingStatuses: ConnectorIndexingStatus< GmailConfig, diff --git a/web/src/app/admin/connectors/[connector]/pages/utils/hooks.ts b/web/src/app/admin/connectors/[connector]/pages/utils/hooks.ts index 5d5f61dd820..a293116c82c 100644 --- a/web/src/app/admin/connectors/[connector]/pages/utils/hooks.ts +++ b/web/src/app/admin/connectors/[connector]/pages/utils/hooks.ts @@ -23,7 +23,7 @@ export const useGmailCredentials = (connector: string) => { const gmailPublicCredential: Credential | undefined = credentialsData?.find( (credential) => - credential.credential_json?.google_service_account_key && + credential.credential_json?.google_tokens && credential.admin_public && credential.source === connector ); @@ -31,7 +31,10 @@ export const useGmailCredentials = (connector: string) => { const gmailServiceAccountCredential: | Credential | undefined = credentialsData?.find( - (credential) => credential.credential_json?.gmail_service_account_key + (credential) => + credential.credential_json?.google_service_account_key && + credential.admin_public && + credential.source === connector ); const liveGmailCredential = @@ -49,7 +52,7 @@ export const useGoogleDriveCredentials = (connector: string) => { | Credential | undefined = credentialsData?.find( (credential) => - credential.credential_json?.google_service_account_key && + credential.credential_json?.google_tokens && credential.admin_public && credential.source === connector ); @@ -57,7 +60,10 @@ export const useGoogleDriveCredentials = (connector: string) => { const googleDriveServiceAccountCredential: | Credential | undefined = credentialsData?.find( - (credential) => credential.credential_json?.google_drive_service_account_key + (credential) => + credential.credential_json?.google_service_account_key && + credential.admin_public && + credential.source === connector ); const liveGDriveCredential = diff --git a/web/src/lib/connectors/credentials.ts b/web/src/lib/connectors/credentials.ts index 86a922c74f2..56fc18047ca 100644 --- a/web/src/lib/connectors/credentials.ts +++ b/web/src/lib/connectors/credentials.ts @@ -53,23 +53,23 @@ export interface SlackCredentialJson { } export interface GmailCredentialJson { - gmail_tokens: string; - gmail_primary_admin: string; + google_tokens: string; + google_primary_admin: string; } export interface GoogleDriveCredentialJson { - google_drive_tokens: string; - google_drive_primary_admin: string; + google_tokens: string; + google_primary_admin: string; } export interface GmailServiceAccountCredentialJson { - gmail_service_account_key: string; - gmail_primary_admin: string; + google_service_account_key: string; + google_primary_admin: string; } export interface GoogleDriveServiceAccountCredentialJson { - google_drive_service_account_key: string; - google_drive_primary_admin: string; + google_service_account_key: string; + google_primary_admin: string; } export interface SlabCredentialJson { @@ -301,8 +301,8 @@ export const credentialTemplates: Record = { ingestion_api: null, // NOTE: These are Special Cases - google_drive: { google_drive_tokens: "" } as GoogleDriveCredentialJson, - gmail: { gmail_tokens: "" } as GmailCredentialJson, + google_drive: { google_tokens: "" } as GoogleDriveCredentialJson, + gmail: { google_tokens: "" } as GmailCredentialJson, }; export const credentialDisplayNames: Record = { @@ -332,19 +332,10 @@ export const credentialDisplayNames: Record = { // Slack slack_bot_token: "Slack Bot Token", - // Gmail - gmail_tokens: "Gmail Tokens", - - // Google Drive - google_drive_tokens: "Google Drive Tokens", - - // Gmail Service Account - gmail_service_account_key: "Gmail Service Account Key", - gmail_primary_admin: "Gmail Primary Admin", - - // Google Drive Service Account - google_drive_service_account_key: "Google Drive Service Account Key", - google_drive_primary_admin: "Google Drive Delegated User", + // Gmail and Google Drive + google_tokens: "Google Oauth Tokens", + google_service_account_key: "Google Service Account Key", + google_primary_admin: "Primary Admin Email", // Slab slab_bot_token: "Slab Bot Token",