Skip to content

Commit

Permalink
MAINT: Unnecessary character mapping process (#2888)
Browse files Browse the repository at this point in the history
This fixes a problem introduced by the changes in #2882.

The string length of each character was checked after conversion by the cmap, but there are cases where a character becomes more than one character long after cmap conversion, so its length cannot be measured accurately at that point.

This is necessary, for example, when deciding whether to measure the distance from a ligature or from the base character corresponding to that ligature while fixing #1351.

The change to handle_tj was made because the code otherwise does not pass Ruff's check:
Error: PLR0915 Too many statements (nnn > 176)

The following code is used only to get the character code for a space.
However, I think it would be better to split out the part of the code that obtains the character code.
Style changes will be considered in another PR.
  • Loading branch information
ssjkamei authored Oct 4, 2024
1 parent e825ac0 commit abb62ac
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 109 deletions.
40 changes: 10 additions & 30 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def build_char_map_from_dict(
Font sub-type, space_width criteria(50% of width), encoding, map character-map.
The font-dictionary itself is suitable for the curious.
"""
font_type: str = cast(str, ft["/Subtype"])
font_type = cast(str, ft["/Subtype"].get_object())

space_code = 32
encoding, space_code = parse_encoding(ft, space_code)
Expand All @@ -75,21 +75,12 @@ def build_char_map_from_dict(
for x in int_entry:
if x <= 255:
encoding[x] = chr(x)
# I consider the space_code is available on one byte
if isinstance(space_code, str):
try: # one byte
sp = space_code.encode("charmap")[0]
except Exception:
sp = space_code.encode("utf-16-be")
sp = sp[0] + 256 * sp[1]
try:
sp = ord(map_dict[chr(sp)])
except KeyError:
pass
else:
sp = space_code
font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0)
half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0
else:
sp = chr(space_code)
font_width_map = build_font_width_map(ft, space_width * 2.0)
half_space_width = compute_space_width(font_width_map, sp) / 2.0

return (
font_type,
Expand Down Expand Up @@ -403,17 +394,14 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->


def build_font_width_map(
ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float
ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
font_width_map: Dict[Any, float] = {}
st: int = 0
en: int = 0
if ft is None:
font_width_map["default"] = default_font_width
return font_width_map
try:
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
except Exception:
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object)] * 2.0
except KeyError:
pass
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
# §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
Expand All @@ -435,21 +423,13 @@ def build_font_width_map(
# C_first C_last same_W
en = second
for c_code in range(st, en + 1):
try:
conversion_char = map_dict[chr(c_code)]
font_width_map[conversion_char] = w[2]
except KeyError:
pass
font_width_map[chr(c_code)] = w[2]
w = w[3:]
elif isinstance(second, list):
# Starting_C [W1 W2 ... Wn]
c_code = st
for width in second:
try:
conversion_char = map_dict[chr(c_code)]
font_width_map[conversion_char] = width
except KeyError:
pass
font_width_map[chr(c_code)] = width
c_code += 1
w = w[2:]
else:
Expand Down
109 changes: 86 additions & 23 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,21 @@
overload,
)

from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map
from ._cmap import (
build_char_map,
build_font_width_map,
compute_font_width,
parse_encoding,
parse_to_unicode,
unknown_char_map,
)
from ._protocols import PdfCommonDocProtocol
from ._text_extraction import (
OrientationNotFoundError,
_layout_mode,
crlf_space_check,
handle_tj,
get_display_str,
get_text_operands,
mult,
)
from ._utils import (
Expand Down Expand Up @@ -84,6 +92,7 @@
PdfObject,
RectangleObject,
StreamObject,
TextStringObject,
is_null_or_none,
)

Expand Down Expand Up @@ -496,7 +505,7 @@ def __init__(
if not is_null_or_none(indirect_reference):
assert indirect_reference is not None, "mypy"
self.update(cast(DictionaryObject, indirect_reference.get_object()))
self._font_width_maps: Dict[str, Dict[str, float]] = {}
self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {}

def hash_bin(self) -> int:
"""
Expand Down Expand Up @@ -1722,19 +1731,78 @@ def _get_acutual_font_widths(
cmap: Tuple[
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
],
add_text: str,
text_operands: str,
font_size: float,
default_space_width: float
space_width: float
) -> Tuple[float, float, float]:
font_widths: float = 0
font_name: str = cmap[2]
if font_name not in self._font_width_maps:
self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1], default_space_width * 2)
font_width_map: Dict[Any, float] = self._font_width_maps[font_name]
if add_text:
for char in add_text:
if cmap[3] is None:
font_width_map: Dict[Any, float] = {}
space_char = " "
actual_space_width: float = space_width
font_width_map["default"] = actual_space_width * 2
else:
space_code = 32
_, space_code = parse_encoding(cmap[3], space_code)
_, space_code, _ = parse_to_unicode(cmap[3], space_code)
if isinstance(space_code, str):
space_char = space_code
else:
space_char = chr(space_code)
font_width_map = build_font_width_map(cmap[3], space_width * 2)
actual_space_width = compute_font_width(font_width_map, space_char)
if actual_space_width == 0:
actual_space_width = space_width
self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
font_width_map = self._font_width_maps[font_name][0]
space_char = self._font_width_maps[font_name][1]
actual_space_width = self._font_width_maps[font_name][2]

if text_operands:
for char in text_operands:
if char == space_char:
font_widths += actual_space_width
continue
font_widths += compute_font_width(font_width_map, char)
return (font_widths * font_size, default_space_width * font_size, font_size)
return (font_widths * font_size, space_width * font_size, font_size)

def _handle_tj(
self,
text: str,
operands: List[Union[str, TextStringObject]],
cm_matrix: List[float],
tm_matrix: List[float],
cmap: Tuple[
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
],
orientations: Tuple[int, ...],
font_size: float,
rtl_dir: bool,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
space_width: float,
actual_str_size: Dict[str, float]
) -> Tuple[str, bool, Dict[str, float]]:
text_operands, is_str_operands = get_text_operands(
operands, cm_matrix, tm_matrix, cmap, orientations)
if is_str_operands:
text += text_operands
else:
text, rtl_dir = get_display_str(
text,
cm_matrix,
tm_matrix, # text matrix
cmap,
text_operands,
font_size,
rtl_dir,
visitor_text)
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
self._get_acutual_font_widths(cmap, text_operands, font_size, space_width))
actual_str_size["str_widths"] += font_widths

return text, rtl_dir, actual_str_size

def _extract_text(
self,
Expand Down Expand Up @@ -1818,11 +1886,8 @@ def _extract_text(
TL = 0.0
font_size = 12.0 # init just in case of

def current_spacewidth() -> float:
return _space_width / 1000.0

def current_strwidths() -> float:
return _actual_str_size["str_widths"] / 1000.0
def compute_strwidths(str_widths: float) -> float:
return str_widths / 1000.0

def process_operation(operator: bytes, operands: List[Any]) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
Expand Down Expand Up @@ -1945,7 +2010,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
ty = float(operands[1])
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
str_widths = current_strwidths()
str_widths = compute_strwidths(_actual_str_size["str_widths"])
_actual_str_size["str_widths"] = 0.0
elif operator == b"Tm":
check_crlf_space = True
Expand All @@ -1957,28 +2022,26 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
float(operands[4]),
float(operands[5]),
]
str_widths = current_strwidths()
str_widths = compute_strwidths(_actual_str_size["str_widths"])
_actual_str_size["str_widths"] = 0.0
elif operator == b"T*":
check_crlf_space = True
tm_matrix[5] -= TL
elif operator == b"Tj":
check_crlf_space = True
text, rtl_dir, add_text = handle_tj(
text, rtl_dir, _actual_str_size = self._handle_tj(
text,
operands,
cm_matrix,
tm_matrix, # text matrix
cmap,
orientations,
output,
font_size,
rtl_dir,
visitor_text,
_space_width,
_actual_str_size,
)
current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = (
self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth()))
_actual_str_size["str_widths"] += current_font_widths
else:
return None
if check_crlf_space:
Expand All @@ -1994,7 +2057,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
font_size,
visitor_text,
str_widths,
_actual_str_size["space_width"],
compute_strwidths(_actual_str_size["space_width"]),
_actual_str_size["str_height"]
)
if text == "":
Expand Down
Loading

0 comments on commit abb62ac

Please sign in to comment.