Merge pull request #1029 from jsvine/develop

Merge v0.10.3 into stable
jsvine · Oct 26, 2023 · d9561d1 · d9561d1
2 parents ceef47b + 2e838d1
commit d9561d1
Show file tree

Hide file tree

Showing 16 changed files with 184 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,17 @@
 
 All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
+## [0.10.3] - 2023-10-26
+
+### Added
+
+- Add support for marked-content sequences, represented by `mcid` and `tag` attributes on `char`/`rect`/`line`/`curve`/`image` objects (h/t @dhdaines). ([#961](https://github.com/jsvine/pdfplumber/pulls/961))
+- Add `gs_path` argument to `pdfplumber.open(...)` and `pdfplumber.repair(...)`, to allow passing a custom Ghostscript path to be used for repairing. ([#953](https://github.com/jsvine/pdfplumber/issues/953))
+
+### Fixed
+
+- Respect `use_text_flow` in `extract_text` (h/t @dhdaines). ([#983](https://github.com/jsvine/pdfplumber/pulls/983))
+
 ## [0.10.2] - 2023-07-29
 
 ### Added

diff --git a/CITATION.cff b/CITATION.cff
@@ -1,8 +1,8 @@
 cff-version: 1.2.0
 title: pdfplumber
 type: software
-version: 0.10.2
-date-released: "2023-07-29"
+version: 0.10.3
+date-released: "2023-10-26"
 authors:
  - family-names: "Singer-Vine"
  given-names: "Jeremy"

diff --git a/README.md b/README.md
@@ -158,6 +158,8 @@ Each object is represented as a simple Python `dict`, with the following propert
 |`bottom`| Distance of bottom of the character from top of page.|
 |`doctop`| Distance of top of character from top of document.|
 |`matrix`| The "current transformation matrix" for this character. (See below for details.)|
+|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this character if any (otherwise `None`). *Experimental attribute.*|
+|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this character if any (otherwise `None`). *Experimental attribute.*|
 |`ncs`|TKTK|
 |`stroking_pattern`|TKTK|
 |`non_stroking_pattern`|TKTK|
@@ -191,6 +193,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`linewidth`| Thickness of line.|
 |`stroking_color`|The color of the line. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The non-stroking color specified for the line’s path. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this line if any (otherwise `None`). *Experimental attribute.*|
+|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this line if any (otherwise `None`). *Experimental attribute.*|
 |`object_type`| "line"|
 
 #### `rect` properties
@@ -210,6 +214,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`linewidth`| Thickness of line.|
 |`stroking_color`|The color of the rectangle's outline. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The rectangle’s fill color. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this rect if any (otherwise `None`). *Experimental attribute.*|
+|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this rect if any (otherwise `None`). *Experimental attribute.*|
 |`object_type`| "rect"|
 
 #### `curve` properties
@@ -231,6 +237,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`fill`| Whether the shape defined by the curve's path is filled.|
 |`stroking_color`|The color of the curve's outline. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The curve’s fill color. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this curve if any (otherwise `None`). *Experimental attribute.*|
+|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this curve if any (otherwise `None`). *Experimental attribute.*|
 |`object_type`| "curve"|
 
 #### Derived properties
@@ -531,6 +539,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [Shannon Shen](https://github.com/lolipopshock)
 - [Matsumoto Toshi](https://github.com/toshi1127)
 - [John West](https://github.com/jwestwsj)
+- [David Huggins-Daines](https://github.com/dhdaines)
 - [Jeremy B. Merrill](https://github.com/jeremybmerrill)
 
 ## Contributing

diff --git a/docs/repairing.md b/docs/repairing.md
@@ -9,3 +9,7 @@ Malformed PDFs can often be [fixed via Ghostscript](https://superuser.com/questi
 - `pdfplumber.open(..., repair=True)` will repair your PDF on the fly (but not save the repaired version to disk).
 - `pdfplumber.repair(path_to_pdf)` will return a `BytesIO` object holding the bytes of a repaired version of the original file.
 - `pdfplumber.repair(path_to_pdf, outfile="path/to/repaired.pdf")` will write a repaired version of the original file to the indicated `outfile` path.
+
+## Custom parameters
+
+- `gs_path=...`: You can pass a custom path for the Ghostscript executable, helpful in case `pdfplumber` is unable to auto-detect your copy of Ghostscript.
diff --git a/pdfplumber/_version.py b/pdfplumber/_version.py
@@ -1,2 +1,2 @@
-version_info = (0, 10, 2)
+version_info = (0, 10, 3)
 __version__ = ".".join(map(str, version_info))
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -22,7 +22,7 @@
  LTPage,
  LTTextContainer,
 )
-from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
 from pdfminer.pdfpage import PDFPage
 from pdfminer.psparser import PSLiteral
 
@@ -62,6 +62,8 @@
  "stream",
  "stroke",
  "stroking_color",
+ "mcid",
+ "tag",
  ]
 )
 
@@ -115,6 +117,56 @@ def normalize_color(
  return separate_pattern(tuplefied)
 
 
+class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
+ """Extract layout from a specific page, adding marked-content IDs to
+ objects where found."""
+
+ cur_mcid: Optional[int] = None
+ cur_tag: Optional[str] = None
+
+ def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
+ """Handle beginning of tag, setting current MCID if any."""
+ self.cur_tag = decode_text(tag.name)
+ if isinstance(props, dict) and "MCID" in props:
+ self.cur_mcid = props["MCID"]
+ else:
+ self.cur_mcid = None
+
+ def end_tag(self) -> None:
+ """Handle end of tag, clearing current tag and MCID."""
+ self.cur_tag = None
+ self.cur_mcid = None
+
+ def tag_cur_item(self) -> None:
+ """Add current MCID to what we hope to be the most recent object created
+ by pdfminer.six."""
+ # This is somewhat hacky and would not be necessary if
+ # pdfminer.six supported MCIDs. In reading the code it's
+ # clear that the `render_*` methods will only ever
+ # create one object, but that is far from being guaranteed.
+ # Even if pdfminer.six's API would just return the objects it
+ # creates, we wouldn't have to do this.
+ cur_obj = self.cur_item._objs[-1]
+ cur_obj.mcid = self.cur_mcid # type: ignore
+ cur_obj.tag = self.cur_tag # type: ignore
+
+ def render_char(self, *args, **kwargs) -> float: # type: ignore
+ """Hook for rendering characters, adding the `mcid` attribute."""
+ adv = super().render_char(*args, **kwargs)
+ self.tag_cur_item()
+ return adv
+
+ def render_image(self, *args, **kwargs) -> None: # type: ignore
+ """Hook for rendering images, adding the `mcid` attribute."""
+ super().render_image(*args, **kwargs)
+ self.tag_cur_item()
+
+ def paint_path(self, *args, **kwargs) -> None: # type: ignore
+ """Hook for rendering lines and curves, adding the `mcid` attribute."""
+ super().paint_path(*args, **kwargs)
+ self.tag_cur_item()
+
+
 class Page(Container):
  cached_properties: List[str] = Container.cached_properties + ["_layout"]
  is_original: bool = True
@@ -174,7 +226,7 @@ def height(self) -> T_num:
  def layout(self) -> LTPage:
  if hasattr(self, "_layout"):
  return self._layout
- device = PDFPageAggregator(
+ device = PDFPageAggregatorWithMarkedContent(
  self.pdf.rsrcmgr,
  pageno=self.page_number,
  laparams=self.pdf.laparams,

diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py
@@ -70,12 +70,13 @@ def open(
  password: Optional[str] = None,
  strict_metadata: bool = False,
  repair: bool = False,
+ gs_path: Optional[Union[str, pathlib.Path]] = None,
  ) -> "PDF":
 
  stream: Union[BufferedReader, BytesIO]
 
  if repair:
- stream = _repair(path_or_fp, password=password)
+ stream = _repair(path_or_fp, password=password, gs_path=gs_path)
  stream_is_external = False
  # Although the original file has a path,
  # the repaired version does not

diff --git a/pdfplumber/repair.py b/pdfplumber/repair.py
@@ -8,9 +8,10 @@
 def _repair(
  path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
  password: Optional[str] = None,
+ gs_path: Optional[Union[str, pathlib.Path]] = None,
 ) -> BytesIO:
 
- executable = shutil.which("gs") or shutil.which("gswin32c")
+ executable = gs_path or shutil.which("gs") or shutil.which("gswin32c")
  if executable is None: # pragma: nocover
  raise Exception(
  "Cannot find Ghostscript, which is required for repairs.\n"
@@ -52,8 +53,9 @@ def repair(
  path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
  outfile: Optional[Union[str, pathlib.Path]] = None,
  password: Optional[str] = None,
+ gs_path: Optional[Union[str, pathlib.Path]] = None,
 ) -> Optional[BytesIO]:
- repaired = _repair(path_or_fp, password)
+ repaired = _repair(path_or_fp, password, gs_path=gs_path)
  if outfile:
  with open(outfile, "wb") as f:
  f.write(repaired.read())

diff --git a/pdfplumber/utils/clustering.py b/pdfplumber/utils/clustering.py
@@ -40,7 +40,10 @@ def make_cluster_dict(values: Iterable[T_num], tolerance: T_num) -> Dict[T_num,
 
 
 def cluster_objects(
- xs: List[R], key_fn: Union[Hashable, Callable[[R], T_num]], tolerance: T_num
+ xs: List[R],
+ key_fn: Union[Hashable, Callable[[R], T_num]],
+ tolerance: T_num,
+ preserve_order: bool = False,
 ) -> List[List[R]]:
 
  if not callable(key_fn):
@@ -51,7 +54,12 @@ def cluster_objects(
 
  get_0, get_1 = itemgetter(0), itemgetter(1)
 
- cluster_tuples = sorted(((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1)
+ if preserve_order:
+ cluster_tuples = [(x, cluster_dict.get(key_fn(x))) for x in xs]
+ else:
+ cluster_tuples = sorted(
+ ((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1
+ )
 
  grouped = itertools.groupby(cluster_tuples, key=get_1)
 

diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py
@@ -225,7 +225,10 @@ def to_textmap(
 
  for i, ws in enumerate(
  cluster_objects(
- words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
+ words_sorted_doctop,
+ lambda x: float(x[0]["doctop"]),
+ y_tolerance,
+ preserve_order=presorted or use_text_flow,
  )
  ):
  y_dist = (

diff --git a/tests/pdfs/issue-982-example.pdf b/tests/pdfs/issue-982-example.pdf
diff --git a/tests/pdfs/mcid_example.pdf b/tests/pdfs/mcid_example.pdf
diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -70,7 +70,7 @@ def test_csv(self):
  assert c.split("\r\n")[9] == (
  "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
  '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
- ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
+ ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,'
  )
 
  io = StringIO()
@@ -125,7 +125,7 @@ def test_cli_csv(self):
  assert res.decode("utf-8").split("\r\n")[9] == (
  "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
  '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
- ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
+ ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,'
  )
 
  def test_cli_csv_exclude(self):
@@ -141,6 +141,7 @@ def test_cli_csv_exclude(self):
  "3",
  "--exclude-attrs",
  "matrix",
+ "mcid",
  "ncs",
  "non_stroking_pattern",
  "stroking_pattern",
@@ -150,7 +151,7 @@ def test_cli_csv_exclude(self):
  assert res.decode("utf-8").split("\r\n")[9] == (
  "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
  "18.0,12.996,,,,,,TimesNewRomanPSMT,"
- ',,"(0, 0, 0)",,18.0,,,,,Y,,1,'
+ ',,"(0, 0, 0)",,18.0,,,,,,Y,,1,'
  )
 
  def test_cli_csv_include(self):

diff --git a/tests/test_issues.py b/tests/test_issues.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import logging
 import os
+import re
 import unittest
 
 import pdfplumber
@@ -257,3 +258,20 @@ def test_issue_683(self):
  with pdfplumber.open(path) as pdf:
  page = pdf.pages[0]
  page.search(r"\d+", regex=True)
+
+ def test_issue_982(self):
+ """
+ extract_text(use_text_flow=True) apparently does nothing
+
+ This is because, while we took care not to sort the words by
+ `doctop` in `WordExtractor` and `WordMap`, no such precaution
+ was taken in `cluster_objects`. We thus add an option to
+ `cluster_objects` to preserve the ordering (which could come
+ from `use_text_flow` or from `presorted`) of the input objects.
+ """
+ path = os.path.join(HERE, "pdfs/issue-982-example.pdf")
+ with pdfplumber.open(path) as pdf:
+ page = pdf.pages[0]
+ text = re.sub(r"\s+", " ", page.extract_text(use_text_flow=True))
+ words = " ".join(w["text"] for w in page.extract_words(use_text_flow=True))
+ assert text[0:100] == words[0:100]
diff --git a/tests/test_mcids.py b/tests/test_mcids.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import os
+import unittest
+
+import pdfplumber
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+
+class TestMCIDs(unittest.TestCase):
+ """Test MCID extraction."""
+
+ def test_mcids(self):
+ path = os.path.join(HERE, "pdfs/mcid_example.pdf")
+
+ pdf = pdfplumber.open(path)
+ page = pdf.pages[0]
+ # Check text of MCIDs
+ mcids = []
+ for c in page.chars:
+ if "mcid" in c:
+ while len(mcids) <= c["mcid"]:
+ mcids.append("")
+ if not mcids[c["mcid"]]:
+ mcids[c["mcid"]] = c["tag"] + ": "
+ mcids[c["mcid"]] += c["text"]
+ assert mcids == [
+ "Standard: Test of figures",
+ "",
+ "P: 1 ligne",
+ "P: 2 ligne",
+ "P: 3 ligne",
+ "P: 4 ligne",
+ "P: 0",
+ "P: 2",
+ "P: 4",
+ "P: 6",
+ "P: 8",
+ "P: 10",
+ "P: 12",
+ "P: Figure 1: Chart",
+ "",
+ "P: 1 colonne",
+ "P: 2 colonne",
+ "P: 3 colonne",
+ ]
+ # Check line and curve MCIDs
+ line_mcids = set(x["mcid"] for x in page.lines)
+ curve_mcids = set(x["mcid"] for x in page.curves)
+ assert all(x["tag"] == "Figure" for x in page.lines)
+ assert all(x["tag"] == "Figure" for x in page.curves)
+ assert line_mcids & {1, 14}
+ assert curve_mcids & {1, 14}
+ # No rects to test unfortunately!
diff --git a/tests/test_repair.py b/tests/test_repair.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import os
+import shutil
 import tempfile
 import unittest
 
@@ -56,3 +57,8 @@ def test_repair_password(self):
  path = os.path.join(HERE, "pdfs/password-example.pdf")
  with pdfplumber.open(path, repair=True, password="test") as pdf:
  assert len(pdf.pages[0].chars)
+
+ def test_repair_custom_path(self):
+ path = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf")
+ with pdfplumber.open(path, repair=True, gs_path=shutil.which("gs")) as pdf:
+ assert len(pdf.pages[0].chars)