Skip to content

Commit

Permalink
BUG: Title sometimes is bytes and not str.
Browse files Browse the repository at this point in the history
  • Loading branch information
reformy committed Nov 6, 2024
1 parent 5b50f47 commit e4a7808
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ def _get_text(self, key: str) -> Optional[str]:
retval = self.get(key, None)
if isinstance(retval, TextStringObject):
return retval
if isinstance(retval, ByteStringObject):
return str(retval)
return None

@property
Expand Down
9 changes: 9 additions & 0 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,15 @@ def write_to_stream(
stream.write(binascii.hexlify(self))
stream.write(b">")

def __str__(self) -> str:
    """
    Decode the raw bytes into text.

    Each charset listed in ``NameObject.CHARSETS`` is tried in order and the
    first successful decoding is returned as a plain ``str``.

    Returns:
        The decoded text. If none of the listed charsets can decode the
        bytes, they are decoded as latin-1, which maps every byte to exactly
        one code point and therefore cannot fail.
    """
    for enc in NameObject.CHARSETS:
        try:
            return self.decode(enc)
        except (UnicodeError, LookupError):
            # Not decodable with this charset (or the charset name is
            # unknown to the codec registry): try the next candidate.
            continue
    # ``__str__`` must always return a ``str``; returning None would make
    # ``str(obj)`` raise TypeError instead of yielding a usable value.
    return self.decode("latin-1")


class TextStringObject(str, PdfObject): # noqa: SLOT000
"""
Expand Down
9 changes: 9 additions & 0 deletions pypdf/generic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,15 @@ def create_string_object(
retval.autodetect_pdfdocencoding = True
return retval
except UnicodeDecodeError:
# Try utf-8.
try:
text = string.decode("utf-8")
retval = TextStringObject(text)
retval._original_bytes = string
return retval
except UnicodeDecodeError:
pass

return ByteStringObject(string)
else:
raise TypeError("create_string_object should have str or unicode arg")
Expand Down
Binary file added resources/bytes.pdf
Binary file not shown.
9 changes: 9 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,18 @@ def test_read_metadata(pdf_path, expected):
docinfo.modification_date
docinfo.modification_date_raw
if "/Title" in metadict:
assert isinstance(docinfo.title, str)
assert metadict["/Title"] == docinfo.title


def test_read_metadata_title_is_utf8():
    """The title read from ``bytes.pdf`` must be exposed as text, not bytes."""
    with open(RESOURCE_ROOT / "bytes.pdf", "rb") as inputfile:
        reader = PdfReader(inputfile)
        title = reader.metadata.title
        # Check the type explicitly, mirroring test_read_metadata, so a
        # regression to a bytes-like title is reported as a type failure.
        assert isinstance(title, str)
        assert title == "Microsoft Word - トランスバース社買収電話会議英語Final.docx"


def test_iss1943():
with PdfReader(RESOURCE_ROOT / "crazyones.pdf") as reader:
docinfo = reader.metadata
Expand Down

0 comments on commit e4a7808

Please sign in to comment.