Skip to content

Commit

Permalink
Google Drive handle invalid PDFs (#838)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuhongsun96 authored Dec 19, 2023
1 parent 2180a40 commit c281859
Showing 1 changed file with 24 additions and 18 deletions.
42 changes: 24 additions & 18 deletions backend/danswer/connectors/cross_connector_utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import chardet
from pypdf import PdfReader
from pypdf.errors import PdfStreamError

from danswer.utils.logger import setup_logger

Expand Down Expand Up @@ -37,29 +38,34 @@ def extract_metadata(line: str) -> dict | None:


def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
    """Extract the text of a PDF, returning "" when it cannot be read.

    Unreadable files (invalid PDF, failed decryption, extraction errors) yield
    an empty string instead of raising so that, by user request, the file
    stays discoverable by title even though its contents cannot be indexed.

    Args:
        file: Binary file-like object positioned at the start of the PDF.
        file_name: Name used only in log messages.
        pdf_pass: Optional password to try if the PDF is marked as encrypted.

    Returns:
        The text of every page joined with newlines, or "" on any failure.
    """
    try:
        pdf_reader = PdfReader(file)

        if pdf_reader.is_encrypted:
            if pdf_pass is None:
                # Previously this message lived in an unreachable `else`
                # (the outer condition already required a password). Log it,
                # then still attempt extraction below exactly as before; any
                # failure there is handled by the outer except clauses.
                logger.info(f"No Password available to decrypt pdf {file_name}")
            else:
                decrypt_success = False
                try:
                    # decrypt() result is compared against 0 for "not
                    # decrypted", as in the original code.
                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
                except Exception:
                    logger.error(f"Unable to decrypt pdf {file_name}")

                if not decrypt_success:
                    # By user request, keep files that are unreadable just so
                    # they can be discoverable by title.
                    return ""

        return "\n".join(page.extract_text() for page in pdf_reader.pages)
    except PdfStreamError:
        logger.exception(f"PDF file {file_name} is not a valid PDF")
    except Exception:
        logger.exception(f"Failed to read PDF {file_name}")

    # File is still discoverable by title
    # but the contents are not included as they cannot be parsed
    return ""


def is_macos_resource_fork_file(file_name: str) -> bool:
Expand Down

1 comment on commit c281859

@vercel
Copy link

@vercel vercel bot commented on c281859 Dec 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.