From c2818595092c4e8dd96b3e6993b4a621d0e5e1e8 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Mon, 18 Dec 2023 23:39:45 -0800 Subject: [PATCH] Google Drive handle invalid PDFs (#838) --- .../cross_connector_utils/file_utils.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py index 3bab0d160ea..3da529e3960 100644 --- a/backend/danswer/connectors/cross_connector_utils/file_utils.py +++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py @@ -9,6 +9,7 @@ import chardet from pypdf import PdfReader +from pypdf.errors import PdfStreamError from danswer.utils.logger import setup_logger @@ -37,29 +38,34 @@ def extract_metadata(line: str) -> dict | None: def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str: - pdf_reader = PdfReader(file) - - # if marked as encrypted and a password is provided, try to decrypt - if pdf_reader.is_encrypted and pdf_pass is not None: - decrypt_success = False - if pdf_pass is not None: - try: - decrypt_success = pdf_reader.decrypt(pdf_pass) != 0 - except Exception: - logger.error(f"Unable to decrypt pdf {file_name}") - else: - logger.info(f"No Password available to to decrypt pdf {file_name}") + try: + pdf_reader = PdfReader(file) + + # If marked as encrypted and a password is provided, try to decrypt + if pdf_reader.is_encrypted and pdf_pass is not None: + decrypt_success = False + if pdf_pass is not None: + try: + decrypt_success = pdf_reader.decrypt(pdf_pass) != 0 + except Exception: + logger.error(f"Unable to decrypt pdf {file_name}") + else: + logger.info(f"No Password available to to decrypt pdf {file_name}") - if not decrypt_success: - # By user request, keep files that are unreadable just so they - # can be discoverable by title. - return "" + if not decrypt_success: + # By user request, keep files that are unreadable just so they + # can be discoverable by title. + return "" - try: return "\n".join(page.extract_text() for page in pdf_reader.pages) + except PdfStreamError: + logger.exception(f"PDF file {file_name} is not a valid PDF") except Exception: logger.exception(f"Failed to read PDF {file_name}") - return "" + + # File is still discoverable by title + # but the contents are not included as they cannot be parsed + return "" def is_macos_resource_fork_file(file_name: str) -> bool: