Skip to content

Commit

Permalink
Google drive shared files fix + shortcuts (#300)
Browse files Browse the repository at this point in the history
Also fixes foreign key constraint issue when manually wiping postgres + keeps track of accessed folders
  • Loading branch information
sidravi1 authored Aug 17, 2023
1 parent 11c071d commit 81a4934
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 33 deletions.
1 change: 1 addition & 0 deletions backend/danswer/configs/app_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
# Connector Configs
#####
GOOGLE_DRIVE_INCLUDE_SHARED = False
GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
)
Expand Down
138 changes: 105 additions & 33 deletions backend/danswer/connectors/google_drive/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from googleapiclient import discovery # type: ignore
from PyPDF2 import PdfReader

from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
Expand Down Expand Up @@ -40,9 +41,7 @@
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
ID_KEY = "id"
LINK_KEY = "link"
TYPE_KEY = "type"
DRIVE_SHORTCUT_TYPE = "application/vnd.google-apps.shortcut"

GoogleDriveFileType = dict[str, Any]

Expand All @@ -51,6 +50,7 @@ def _run_drive_file_query(
service: discovery.Resource,
query: str,
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
batch_size: int = INDEX_BATCH_SIZE,
) -> Generator[GoogleDriveFileType, None, None]:
next_page_token = ""
Expand All @@ -61,7 +61,11 @@ def _run_drive_file_query(
.list(
pageSize=batch_size,
supportsAllDrives=include_shared,
fields="nextPageToken, files(mimeType, id, name, webViewLink)",
includeItemsFromAllDrives=include_shared,
fields=(
"nextPageToken, files(mimeType, id, name, "
"webViewLink, shortcutDetails)"
),
pageToken=next_page_token,
q=query,
)
Expand All @@ -70,45 +74,82 @@ def _run_drive_file_query(
next_page_token = results.get("nextPageToken")
files = results["files"]
for file in files:
if follow_shortcuts and "shortcutDetails" in file:
file = service.files().get(
fileId=file["shortcutDetails"]["targetId"],
supportsAllDrives=include_shared,
fields="mimeType, id, name, webViewLink, shortcutDetails",
)
file = file.execute()
yield file


def _get_folder_id(
    service: discovery.Resource,
    parent_id: str,
    folder_name: str,
    include_shared: bool,
    follow_shortcuts: bool,
) -> str | None:
    """Get the ID of a folder given its name and the ID of its parent folder.

    Args:
        service: Drive API resource used to issue the `files().list` request.
        parent_id: ID of the folder whose direct children are searched.
        folder_name: Exact name of the child folder to look up.
        include_shared: Forwarded as both `supportsAllDrives` and
            `includeItemsFromAllDrives` on the list request.
        follow_shortcuts: If True, shortcuts with the given name also match,
            and the shortcut's target ID is returned instead of its own ID.

    Returns:
        The ID taken from the first matching item, or None if nothing matched.
    """

    def _quote(value: str) -> str:
        # Drive query literals are wrapped in single quotes, so backslashes
        # and single quotes inside the value must be backslash-escaped;
        # otherwise a name such as "Valentine's Day" yields an invalid query.
        return value.replace("\\", "\\\\").replace("'", "\\'")

    if follow_shortcuts:
        type_filter = (
            f"(mimeType='{DRIVE_FOLDER_TYPE}' or mimeType='{DRIVE_SHORTCUT_TYPE}')"
        )
    else:
        type_filter = f"mimeType='{DRIVE_FOLDER_TYPE}'"

    query = (
        f"'{_quote(parent_id)}' in parents and "
        f"name='{_quote(folder_name)}' and {type_filter}"
    )

    results = (
        service.files()
        .list(
            q=query,
            spaces="drive",
            fields="nextPageToken, files(id, name, shortcutDetails)",
            supportsAllDrives=include_shared,
            includeItemsFromAllDrives=include_shared,
        )
        .execute()
    )
    items = results.get("files", [])
    if not items:
        return None

    # Only the first match is considered, as before.
    first_match = items[0]
    if follow_shortcuts and "shortcutDetails" in first_match:
        # NOTE(review): the shortcut's target is not verified to be a folder
        # here; a shortcut to a regular file with this name would be returned
        # as if it were a folder ID.
        return first_match["shortcutDetails"]["targetId"]
    return first_match["id"]


def _get_folders(
    service: discovery.Resource,
    folder_id: str | None = None,  # if specified, only fetches files within this folder
    include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
    follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
    batch_size: int = INDEX_BATCH_SIZE,
) -> Generator[GoogleDriveFileType, None, None]:
    """Yield the Drive folders matching the query, optionally via shortcuts.

    Args:
        service: Drive API resource passed through to `_run_drive_file_query`.
        folder_id: If given, restricts results to direct children of this folder.
        include_shared: Forwarded to `_run_drive_file_query`.
        follow_shortcuts: If True, shortcut items are also queried so that
            `_run_drive_file_query` can replace them with their targets.
        batch_size: Page size forwarded to `_run_drive_file_query`.

    Yields:
        File dicts whose `mimeType` is the Drive folder type.
    """
    query = f"mimeType = '{DRIVE_FOLDER_TYPE}' "
    if follow_shortcuts:
        query = "(" + query + f" or mimeType = '{DRIVE_SHORTCUT_TYPE}'" + ") "

    if folder_id:
        query += f"and '{folder_id}' in parents "
    query = query.rstrip()  # remove the trailing space(s)

    for file in _run_drive_file_query(
        service=service,
        query=query,
        include_shared=include_shared,
        follow_shortcuts=follow_shortcuts,
        batch_size=batch_size,
    ):
        # A resolved shortcut may point at a regular file rather than a
        # folder, so non-folder results are dropped here.
        if file["mimeType"] == DRIVE_FOLDER_TYPE:
            yield file


def _get_files(
Expand All @@ -117,6 +158,7 @@ def _get_files(
time_range_end: SecondsSinceUnixEpoch | None = None,
folder_id: str | None = None, # if specified, only fetches files within this folder
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
supported_drive_doc_types: list[str] = SUPPORTED_DRIVE_DOC_TYPES,
batch_size: int = INDEX_BATCH_SIZE,
) -> Generator[GoogleDriveFileType, None, None]:
Expand All @@ -137,6 +179,7 @@ def _get_files(
service=service,
query=query,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
batch_size=batch_size,
)
for file in files:
Expand All @@ -147,12 +190,15 @@ def _get_files(
def get_all_files_batched(
service: discovery.Resource,
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
batch_size: int = INDEX_BATCH_SIZE,
time_range_start: SecondsSinceUnixEpoch | None = None,
time_range_end: SecondsSinceUnixEpoch | None = None,
folder_id: str | None = None, # if specified, only fetches files within this folder
# if True, will fetch files in sub-folders of the specified folder ID. Only applies if folder_id is specified.
# if True, will fetch files in sub-folders of the specified folder ID.
# Only applies if folder_id is specified.
traverse_subfolders: bool = True,
folder_ids_traversed: list[str] | None = None,
) -> Generator[list[GoogleDriveFileType], None, None]:
"""Gets all files matching the criteria specified by the args from Google Drive
in batches of size `batch_size`.
Expand All @@ -163,6 +209,7 @@ def get_all_files_batched(
time_range_end=time_range_end,
folder_id=folder_id,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
batch_size=batch_size,
)
yield from batch_generator(
Expand All @@ -174,23 +221,33 @@ def get_all_files_batched(
)

if traverse_subfolders:
folder_ids_traversed = folder_ids_traversed or []
subfolders = _get_folders(
service=service,
folder_id=folder_id,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
batch_size=batch_size,
)
for subfolder in subfolders:
logger.info("Fetching all files in subfolder: " + subfolder["name"])
yield from get_all_files_batched(
service=service,
include_shared=include_shared,
batch_size=batch_size,
time_range_start=time_range_start,
time_range_end=time_range_end,
folder_id=subfolder["id"],
traverse_subfolders=traverse_subfolders,
)
if subfolder["id"] not in folder_ids_traversed:
logger.info("Fetching all files in subfolder: " + subfolder["name"])
folder_ids_traversed.append(subfolder["id"])
yield from get_all_files_batched(
service=service,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
batch_size=batch_size,
time_range_start=time_range_start,
time_range_end=time_range_end,
folder_id=subfolder["id"],
traverse_subfolders=traverse_subfolders,
folder_ids_traversed=folder_ids_traversed,
)
else:
logger.debug(
"Skipping subfolder since already traversed: " + subfolder["name"]
)


def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
Expand All @@ -209,7 +266,6 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
.execute()
.decode("utf-8")
)
# Default download to PDF since most types can be exported as a PDF
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
Expand All @@ -220,7 +276,7 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
temp.write(word_stream.getvalue())
temp_path = temp.name
return docx2txt.process(temp_path)

# Default download to PDF since most types can be exported as a PDF
else:
response = service.files().get_media(fileId=file["id"]).execute()
pdf_stream = io.BytesIO(response)
Expand All @@ -236,15 +292,20 @@ def __init__(
folder_paths: list[str] | None = None,
batch_size: int = INDEX_BATCH_SIZE,
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
) -> None:
self.folder_paths = folder_paths or []
self.batch_size = batch_size
self.include_shared = include_shared
self.follow_shortcuts = follow_shortcuts
self.creds: Credentials | None = None

@staticmethod
def _process_folder_paths(
service: discovery.Resource, folder_paths: list[str]
service: discovery.Resource,
folder_paths: list[str],
include_shared: bool,
follow_shortcuts: bool,
) -> list[str]:
"""['Folder/Sub Folder'] -> ['<FOLDER_ID>']"""
folder_ids: list[str] = []
Expand All @@ -253,10 +314,19 @@ def _process_folder_paths(
parent_id = "root"
for folder_name in folder_names:
found_parent_id = _get_folder_id(
service=service, parent_id=parent_id, folder_name=folder_name
service=service,
parent_id=parent_id,
folder_name=folder_name,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
)
if found_parent_id is None:
raise ValueError(f"Folder path '{path}' not found in Google Drive")
raise ValueError(
(
f"Folder '{folder_name}' in path '{path}' "
"not found in Google Drive"
)
)
parent_id = found_parent_id
folder_ids.append(parent_id)

Expand All @@ -283,7 +353,7 @@ def _fetch_docs_from_drive(

service = discovery.build("drive", "v3", credentials=self.creds)
folder_ids: Sequence[str | None] = self._process_folder_paths(
service, self.folder_paths
service, self.folder_paths, self.include_shared, self.follow_shortcuts
)
if not folder_ids:
folder_ids = [None]
Expand All @@ -293,6 +363,7 @@ def _fetch_docs_from_drive(
get_all_files_batched(
service=service,
include_shared=self.include_shared,
follow_shortcuts=self.follow_shortcuts,
batch_size=self.batch_size,
time_range_start=start,
time_range_end=end,
Expand Down Expand Up @@ -326,9 +397,10 @@ def load_from_state(self) -> GenerateDocumentsOutput:
def poll_source(
    self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
    """Yield documents for the polling window, starting slightly before `start`.

    Args:
        start: Beginning of the polling window, in seconds since the Unix epoch.
        end: End of the polling window, in seconds since the Unix epoch.

    Yields:
        Whatever `_fetch_docs_from_drive` yields for the adjusted window.
    """
    # Subtract 10 minutes from the start time to account for modifiedTime
    # propagation: when a document is modified, it takes some time for the API
    # to reflect the change. Without this offset we may "miss" the update when
    # polling. The result is clamped at 0 so the start never goes negative.
    yield from self._fetch_docs_from_drive(
        max(start - DRIVE_START_TIME_OFFSET, 0), end
    )
1 change: 1 addition & 0 deletions backend/scripts/reset_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def wipe_all_rows(database: str) -> None:
cur.execute(f"DELETE FROM document")
cur.execute(f"DELETE FROM connector_credential_pair")
cur.execute(f"DELETE FROM index_attempt")
cur.execute(f"DELETE FROM credential")
conn.commit()

for table_name in table_names:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ export const ConnectorEditPopup = ({ existingConnector, onSubmit }: Props) => {
name="include_shared"
label="Include Shared Files"
/>
<BooleanFormField
name="follow_shortcuts"
label="Follow Shortcuts"
/>
</div>
)}
validationSchema={Yup.object().shape({
Expand All @@ -58,6 +62,7 @@ export const ConnectorEditPopup = ({ existingConnector, onSubmit }: Props) => {
)
.required(),
include_shared: Yup.boolean().required(),
follow_shortcuts: Yup.boolean().required(),
})}
onSubmit={onSubmit}
/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ export const GoogleDriveConnectorsTable = ({
header: "Include Shared",
key: "include_shared",
},
{
header: "Follow Shortcuts",
key: "follow_shortcuts",
},
{
header: "Status",
key: "status",
Expand Down Expand Up @@ -132,6 +136,16 @@ export const GoogleDriveConnectorsTable = ({
)}
</div>
),
follow_shortcuts: (
<div>
{connectorIndexingStatus.connector.connector_specific_config
.follow_shortcuts ? (
<i>Yes</i>
) : (
<i>No</i>
)}
</div>
),
status: (
<StatusRow
connectorIndexingStatus={connectorIndexingStatus}
Expand Down
10 changes: 10 additions & 0 deletions web/src/app/admin/connectors/google-drive/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,14 @@ const GoogleDriveConnectorManagement = ({
"that match both criteria."
}
/>
<BooleanFormField
name="follow_shortcuts"
label="Follow Shortcuts"
subtext={
"If checked, then will follow shortcuts to files and folder and " +
"attempt to index those as well."
}
/>
</>
)}
validationSchema={Yup.object().shape({
Expand All @@ -292,10 +300,12 @@ const GoogleDriveConnectorManagement = ({
)
.required(),
include_shared: Yup.boolean().required(),
follow_shortcuts: Yup.boolean().required(),
})}
initialValues={{
folder_paths: [],
include_shared: false,
follow_shortcuts: false,
}}
refreshFreq={10 * 60} // 10 minutes
onSubmit={async (isSuccess, responseJson) => {
Expand Down
Loading

1 comment on commit 81a4934

@vercel
Copy link

@vercel vercel bot commented on 81a4934 Aug 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.