Skip to content

Commit

Permalink
BUG: Title is sometimes bytes instead of str.
Browse files Browse the repository at this point in the history
  • Loading branch information
reformy committed Nov 7, 2024
1 parent 5b50f47 commit a4fa82d
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ def _get_text(self, key: str) -> Optional[str]:
retval = self.get(key, None)
if isinstance(retval, TextStringObject):
return retval
if isinstance(retval, ByteStringObject):
return str(retval)
return None

@property
Expand Down
9 changes: 9 additions & 0 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,15 @@ def write_to_stream(
stream.write(binascii.hexlify(self))
stream.write(b">")

def __str__(self) -> str:
    """
    Return this object's content decoded to text.

    The codecs are tried in order: "utf-16" first, then every entry of
    ``NameObject.CHARSETS``. The first codec that decodes without a
    ``UnicodeDecodeError`` wins.

    Returns:
        The decoded string.

    Raises:
        PdfReadError: If none of the candidate codecs can decode the content.

    """
    # NOTE(review): presumably ``self`` is a bytes subclass, so ``decode``
    # is ``bytes.decode`` -- confirm against the enclosing class.
    for codec in ("utf-16", *NameObject.CHARSETS):
        try:
            decoded = self.decode(codec)
        except UnicodeDecodeError:
            # This codec does not fit; fall through to the next candidate.
            continue
        return decoded
    raise PdfReadError("Cannot decode ByteStringObject.")


class TextStringObject(str, PdfObject): # noqa: SLOT000
"""
Expand Down
Binary file added resources/bytes.pdf
Binary file not shown.
9 changes: 9 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,18 @@ def test_read_metadata(pdf_path, expected):
docinfo.modification_date
docinfo.modification_date_raw
if "/Title" in metadict:
assert isinstance(docinfo.title, str)
assert metadict["/Title"] == docinfo.title


def test_read_metadata_title_is_utf8():
    """Check that the title read from ``bytes.pdf`` equals the expected text."""
    expected_title = "Microsoft Word - トランスバース社買収電話会議英語Final.docx"
    pdf_path = RESOURCE_ROOT / "bytes.pdf"
    with open(pdf_path, "rb") as pdf_stream:
        pdf = PdfReader(pdf_stream)
        actual_title = pdf.metadata.title
    # Should be a str.
    assert actual_title == expected_title


def test_iss1943():
with PdfReader(RESOURCE_ROOT / "crazyones.pdf") as reader:
docinfo = reader.metadata
Expand Down

0 comments on commit a4fa82d

Please sign in to comment.