Google drive shared files fix + shortcuts (#300)

Also fixes foreign key constraint issue when manually wiping postgres + keeps track of accessed folders
danswer-ai · Aug 17, 2023 · 81a4934 · 81a4934 · vercel · Aug 17, 2023
1 parent 11c071d
commit 81a4934
Show file tree

Hide file tree

Showing 7 changed files with 137 additions and 33 deletions.
diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
@@ -93,6 +93,7 @@
 # Connector Configs
 #####
 GOOGLE_DRIVE_INCLUDE_SHARED = False
+GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
 FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
     "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
 )

diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
@@ -11,6 +11,7 @@
 from googleapiclient import discovery  # type: ignore
 from PyPDF2 import PdfReader
 
+from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
 from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
@@ -40,9 +41,7 @@
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 ]
 DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
-ID_KEY = "id"
-LINK_KEY = "link"
-TYPE_KEY = "type"
+DRIVE_SHORTCUT_TYPE = "application/vnd.google-apps.shortcut"
 
 GoogleDriveFileType = dict[str, Any]
 
@@ -51,6 +50,7 @@ def _run_drive_file_query(
     service: discovery.Resource,
     query: str,
     include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
+    follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
     batch_size: int = INDEX_BATCH_SIZE,
 ) -> Generator[GoogleDriveFileType, None, None]:
     next_page_token = ""
@@ -61,7 +61,11 @@ def _run_drive_file_query(
             .list(
                 pageSize=batch_size,
                 supportsAllDrives=include_shared,
-                fields="nextPageToken, files(mimeType, id, name, webViewLink)",
+                includeItemsFromAllDrives=include_shared,
+                fields=(
+                    "nextPageToken, files(mimeType, id, name, "
+                    "webViewLink, shortcutDetails)"
+                ),
                 pageToken=next_page_token,
                 q=query,
             )
@@ -70,45 +74,82 @@ def _run_drive_file_query(
         next_page_token = results.get("nextPageToken")
         files = results["files"]
         for file in files:
+            if follow_shortcuts and "shortcutDetails" in file:
+                file = service.files().get(
+                    fileId=file["shortcutDetails"]["targetId"],
+                    supportsAllDrives=include_shared,
+                    fields="mimeType, id, name, webViewLink, shortcutDetails",
+                )
+                file = file.execute()
             yield file
 
 
 def _get_folder_id(
-    service: discovery.Resource, parent_id: str, folder_name: str
+    service: discovery.Resource,
+    parent_id: str,
+    folder_name: str,
+    include_shared: bool,
+    follow_shortcuts: bool,
 ) -> str | None:
     """
     Get the ID of a folder given its name and the ID of its parent folder.
     """
-    query = (
-        f"'{parent_id}' in parents and name='{folder_name}' and "
-        f"mimeType='{DRIVE_FOLDER_TYPE}'"
-    )
+    query = f"'{parent_id}' in parents and name='{folder_name}' and "
+    if follow_shortcuts:
+        query += f"(mimeType='{DRIVE_FOLDER_TYPE}' or mimeType='{DRIVE_SHORTCUT_TYPE}')"
+    else:
+        query += f"mimeType='{DRIVE_FOLDER_TYPE}'"
+
     results = (
         service.files()
-        .list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
+        .list(
+            q=query,
+            spaces="drive",
+            fields="nextPageToken, files(id, name, shortcutDetails)",
+            supportsAllDrives=include_shared,
+            includeItemsFromAllDrives=include_shared,
+        )
         .execute()
     )
     items = results.get("files", [])
-    return items[0]["id"] if items else None
+
+    folder_id = None
+    if items:
+        if follow_shortcuts and "shortcutDetails" in items[0]:
+            folder_id = items[0]["shortcutDetails"]["targetId"]
+        else:
+            folder_id = items[0]["id"]
+    return folder_id
 
 
 def _get_folders(
     service: discovery.Resource,
     folder_id: str | None = None,  # if specified, only fetches files within this folder
     include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
+    follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
     batch_size: int = INDEX_BATCH_SIZE,
 ) -> Generator[GoogleDriveFileType, None, None]:
     query = f"mimeType = '{DRIVE_FOLDER_TYPE}' "
+    if follow_shortcuts:
+        query = "(" + query + f" or mimeType = '{DRIVE_SHORTCUT_TYPE}'" + ") "
+
     if folder_id:
         query += f"and '{folder_id}' in parents "
     query = query.rstrip()  # remove the trailing space(s)
 
-    yield from _run_drive_file_query(
+    for file in _run_drive_file_query(
         service=service,
         query=query,
         include_shared=include_shared,
+        follow_shortcuts=follow_shortcuts,
         batch_size=batch_size,
-    )
+    ):
+        # Need to check this since file may have been a target of a shortcut
+        # and not necessarily a folder
+        if file["mimeType"] == DRIVE_FOLDER_TYPE:
+            yield file
+        else:
+            pass
 
 
 def _get_files(
@@ -117,6 +158,7 @@ def _get_files(
     time_range_end: SecondsSinceUnixEpoch | None = None,
     folder_id: str | None = None,  # if specified, only fetches files within this folder
     include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
+    follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
     supported_drive_doc_types: list[str] = SUPPORTED_DRIVE_DOC_TYPES,
     batch_size: int = INDEX_BATCH_SIZE,
 ) -> Generator[GoogleDriveFileType, None, None]:
@@ -137,6 +179,7 @@ def _get_files(
         service=service,
         query=query,
         include_shared=include_shared,
+        follow_shortcuts=follow_shortcuts,
         batch_size=batch_size,
     )
     for file in files:
@@ -147,12 +190,15 @@ def _get_files(
 def get_all_files_batched(
     service: discovery.Resource,
     include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
+    follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
     batch_size: int = INDEX_BATCH_SIZE,
     time_range_start: SecondsSinceUnixEpoch | None = None,
     time_range_end: SecondsSinceUnixEpoch | None = None,
     folder_id: str | None = None,  # if specified, only fetches files within this folder
-    # if True, will fetch files in sub-folders of the specified folder ID. Only applies if folder_id is specified.
+    # if True, will fetch files in sub-folders of the specified folder ID.
+    # Only applies if folder_id is specified.
     traverse_subfolders: bool = True,
+    folder_ids_traversed: list[str] | None = None,
 ) -> Generator[list[GoogleDriveFileType], None, None]:
     """Gets all files matching the criteria specified by the args from Google Drive
     in batches of size `batch_size`.
@@ -163,6 +209,7 @@ def get_all_files_batched(
         time_range_end=time_range_end,
         folder_id=folder_id,
         include_shared=include_shared,
+        follow_shortcuts=follow_shortcuts,
         batch_size=batch_size,
     )
     yield from batch_generator(
@@ -174,23 +221,33 @@ def get_all_files_batched(
     )
 
     if traverse_subfolders:
+        folder_ids_traversed = folder_ids_traversed or []
         subfolders = _get_folders(
             service=service,
             folder_id=folder_id,
             include_shared=include_shared,
+            follow_shortcuts=follow_shortcuts,
             batch_size=batch_size,
         )
         for subfolder in subfolders:
-            logger.info("Fetching all files in subfolder: " + subfolder["name"])
-            yield from get_all_files_batched(
-                service=service,
-                include_shared=include_shared,
-                batch_size=batch_size,
-                time_range_start=time_range_start,
-                time_range_end=time_range_end,
-                folder_id=subfolder["id"],
-                traverse_subfolders=traverse_subfolders,
-            )
+            if subfolder["id"] not in folder_ids_traversed:
+                logger.info("Fetching all files in subfolder: " + subfolder["name"])
+                folder_ids_traversed.append(subfolder["id"])
+                yield from get_all_files_batched(
+                    service=service,
+                    include_shared=include_shared,
+                    follow_shortcuts=follow_shortcuts,
+                    batch_size=batch_size,
+                    time_range_start=time_range_start,
+                    time_range_end=time_range_end,
+                    folder_id=subfolder["id"],
+                    traverse_subfolders=traverse_subfolders,
+                    folder_ids_traversed=folder_ids_traversed,
+                )
+            else:
+                logger.debug(
+                    "Skipping subfolder since already traversed: " + subfolder["name"]
+                )
 
 
 def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
@@ -209,7 +266,6 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
             .execute()
             .decode("utf-8")
         )
-    # Default download to PDF since most types can be exported as a PDF
     elif (
         mime_type
         == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
@@ -220,7 +276,7 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
             temp.write(word_stream.getvalue())
             temp_path = temp.name
         return docx2txt.process(temp_path)
-
+    # Default download to PDF since most types can be exported as a PDF
     else:
         response = service.files().get_media(fileId=file["id"]).execute()
         pdf_stream = io.BytesIO(response)
@@ -236,15 +292,20 @@ def __init__(
         folder_paths: list[str] | None = None,
         batch_size: int = INDEX_BATCH_SIZE,
         include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
+        follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
     ) -> None:
         self.folder_paths = folder_paths or []
         self.batch_size = batch_size
         self.include_shared = include_shared
+        self.follow_shortcuts = follow_shortcuts
         self.creds: Credentials | None = None
 
     @staticmethod
     def _process_folder_paths(
-        service: discovery.Resource, folder_paths: list[str]
+        service: discovery.Resource,
+        folder_paths: list[str],
+        include_shared: bool,
+        follow_shortcuts: bool,
     ) -> list[str]:
         """['Folder/Sub Folder'] -> ['<FOLDER_ID>']"""
         folder_ids: list[str] = []
@@ -253,10 +314,19 @@ def _process_folder_paths(
             parent_id = "root"
             for folder_name in folder_names:
                 found_parent_id = _get_folder_id(
-                    service=service, parent_id=parent_id, folder_name=folder_name
+                    service=service,
+                    parent_id=parent_id,
+                    folder_name=folder_name,
+                    include_shared=include_shared,
+                    follow_shortcuts=follow_shortcuts,
                 )
                 if found_parent_id is None:
-                    raise ValueError(f"Folder path '{path}' not found in Google Drive")
+                    raise ValueError(
+                        (
+                            f"Folder '{folder_name}' in path '{path}' "
+                            "not found in Google Drive"
+                        )
+                    )
                 parent_id = found_parent_id
             folder_ids.append(parent_id)
 
@@ -283,7 +353,7 @@ def _fetch_docs_from_drive(
 
         service = discovery.build("drive", "v3", credentials=self.creds)
         folder_ids: Sequence[str | None] = self._process_folder_paths(
-            service, self.folder_paths
+            service, self.folder_paths, self.include_shared, self.follow_shortcuts
         )
         if not folder_ids:
             folder_ids = [None]
@@ -293,6 +363,7 @@ def _fetch_docs_from_drive(
                 get_all_files_batched(
                     service=service,
                     include_shared=self.include_shared,
+                    follow_shortcuts=self.follow_shortcuts,
                     batch_size=self.batch_size,
                     time_range_start=start,
                     time_range_end=end,
@@ -326,9 +397,10 @@ def load_from_state(self) -> GenerateDocumentsOutput:
     def poll_source(
         self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
     ) -> GenerateDocumentsOutput:
-        # need to subtract 10 minutes from start time to account for modifiedTime propogation
-        # if a document is modified, it takes some time for the API to reflect these changes
-        # if we do not have an offset, then we may "miss" the update when polling
+        # need to subtract 10 minutes from start time to account for modifiedTime
+        # propogation if a document is modified, it takes some time for the API to
+        # reflect these changes if we do not have an offset, then we may "miss" the
+        # update when polling
         yield from self._fetch_docs_from_drive(
             max(start - DRIVE_START_TIME_OFFSET, 0, 0), end
         )
diff --git a/backend/scripts/reset_postgres.py b/backend/scripts/reset_postgres.py
@@ -34,6 +34,7 @@ def wipe_all_rows(database: str) -> None:
     cur.execute(f"DELETE FROM document")
     cur.execute(f"DELETE FROM connector_credential_pair")
     cur.execute(f"DELETE FROM index_attempt")
+    cur.execute(f"DELETE FROM credential")
     conn.commit()
 
     for table_name in table_names:

diff --git a/web/src/app/admin/connectors/google-drive/ConnectorEditPopup.tsx b/web/src/app/admin/connectors/google-drive/ConnectorEditPopup.tsx
@@ -47,6 +47,10 @@ export const ConnectorEditPopup = ({ existingConnector, onSubmit }: Props) => {
                 name="include_shared"
                 label="Include Shared Files"
               />
+              <BooleanFormField
+                name="follow_shortcuts"
+                label="Follow Shortcuts"
+              />
             </div>
           )}
           validationSchema={Yup.object().shape({
@@ -58,6 +62,7 @@ export const ConnectorEditPopup = ({ existingConnector, onSubmit }: Props) => {
               )
               .required(),
             include_shared: Yup.boolean().required(),
+            follow_shortcuts: Yup.boolean().required(),
           })}
           onSubmit={onSubmit}
         />

diff --git a/web/src/app/admin/connectors/google-drive/GoogleDriveConnectorsTable.tsx b/web/src/app/admin/connectors/google-drive/GoogleDriveConnectorsTable.tsx
@@ -90,6 +90,10 @@ export const GoogleDriveConnectorsTable = ({
           header: "Include Shared",
           key: "include_shared",
         },
+        {
+          header: "Follow Shortcuts",
+          key: "follow_shortcuts",
+        },
         {
           header: "Status",
           key: "status",
@@ -132,6 +136,16 @@ export const GoogleDriveConnectorsTable = ({
               )}
             </div>
           ),
+          follow_shortcuts: (
+            <div>
+              {connectorIndexingStatus.connector.connector_specific_config
+                .follow_shortcuts ? (
+                <i>Yes</i>
+              ) : (
+                <i>No</i>
+              )}
+            </div>
+          ),
           status: (
             <StatusRow
               connectorIndexingStatus={connectorIndexingStatus}

diff --git a/web/src/app/admin/connectors/google-drive/page.tsx b/web/src/app/admin/connectors/google-drive/page.tsx
@@ -281,6 +281,14 @@ const GoogleDriveConnectorManagement = ({
                   "that match both criteria."
                 }
               />
+              <BooleanFormField
+                name="follow_shortcuts"
+                label="Follow Shortcuts"
+                subtext={
+                  "If checked, then will follow shortcuts to files and folder and " +
+                  "attempt to index those as well."
+                }
+              />
             </>
           )}
           validationSchema={Yup.object().shape({
@@ -292,10 +300,12 @@ const GoogleDriveConnectorManagement = ({
               )
               .required(),
             include_shared: Yup.boolean().required(),
+            follow_shortcuts: Yup.boolean().required(),
           })}
           initialValues={{
             folder_paths: [],
             include_shared: false,
+            follow_shortcuts: false,
           }}
           refreshFreq={10 * 60} // 10 minutes
           onSubmit={async (isSuccess, responseJson) => {