diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index 4505861f1..69789d712 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -121,6 +121,8 @@ def _get_text(self, key: str) -> Optional[str]:
         retval = self.get(key, None)
         if isinstance(retval, TextStringObject):
             return retval
+        if isinstance(retval, ByteStringObject):
+            return str(retval)
         return None
 
     @property
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index 21aa558fe..adeb3f6ad 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -619,6 +619,16 @@ def write_to_stream(
         stream.write(binascii.hexlify(self))
         stream.write(b">")
 
+    def __str__(self) -> str:
+        """Return the bytes decoded with the first NameObject.CHARSETS codec that fits."""
+        for enc in NameObject.CHARSETS:
+            try:
+                return self.decode(enc)
+            except UnicodeDecodeError:
+                continue
+        # __str__ must return a str, never None: fall back to a lossy decode.
+        return self.decode("utf-8", errors="replace")
+
 
 class TextStringObject(str, PdfObject):  # noqa: SLOT000
     """
diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py
index a53b31799..af6fe4429 100644
--- a/pypdf/generic/_utils.py
+++ b/pypdf/generic/_utils.py
@@ -189,6 +189,15 @@ def create_string_object(
                     retval.autodetect_pdfdocencoding = True
                     return retval
             except UnicodeDecodeError:
+                # Try utf-8 before falling back to a raw ByteStringObject.
+                try:
+                    text = string.decode("utf-8")
+                    retval = TextStringObject(text)
+                    retval._original_bytes = string
+                    return retval
+                except UnicodeDecodeError:
+                    pass
+
                 return ByteStringObject(string)
     else:
         raise TypeError("create_string_object should have str or unicode arg")
diff --git a/resources/bytes.pdf b/resources/bytes.pdf
new file mode 100644
index 000000000..9ad7d7a14
Binary files /dev/null and b/resources/bytes.pdf differ
diff --git a/tests/test_reader.py b/tests/test_reader.py
index e1b7fc26d..b01dc1add 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -108,9 +108,19 @@ def test_read_metadata(pdf_path, expected):
         docinfo.modification_date
         docinfo.modification_date_raw
         if "/Title" in metadict:
+            assert isinstance(docinfo.title, str)
             assert metadict["/Title"] == docinfo.title
 
 
+def test_read_metadata_title_is_utf8():
+    with open(RESOURCE_ROOT / "bytes.pdf", "rb") as inputfile:
+        reader = PdfReader(inputfile)
+        # Read the title while the underlying file is still open.
+        title = reader.metadata.title
+    assert isinstance(title, str)
+    assert title == "Microsoft Word - トランスバース社買収電話会議英語Final.docx"
+
+
 def test_iss1943():
     with PdfReader(RESOURCE_ROOT / "crazyones.pdf") as reader:
         docinfo = reader.metadata