Merge branch 'develop' of github.com:jsvine/pdfplumber into develop
jsvine committed Mar 7, 2024
2 parents 53306dc + 207312e commit 147f2c4
Showing 4 changed files with 271 additions and 12 deletions.
15 changes: 15 additions & 0 deletions docs/structure.md
@@ -59,3 +59,18 @@ In this case, because marked content IDs are specific to a given page,
each element will also have a `page_number` attribute, which is the
number of the page containing (partially or completely) this element,
indexed from 1 (for consistency with `pdfplumber.Page`).

You can also access the underlying `PDFStructTree` object for more
flexibility, including visual debugging. For instance, to plot the
bounding boxes of the contents of all of the `TD` elements on the
first page of a document:

from pdfplumber.structure import PDFStructTree

page = pdf.pages[0]
stree = PDFStructTree(pdf, page)
img = page.to_image()
img.draw_rects(stree.element_bbox(td) for td in stree.find_all("TD"))

The `find_all` method works rather like the same method in
[BeautifulSoup](https://beautiful-soup-4.readthedocs.io/en/latest/#searching-the-tree) -
it takes an element name, a regular expression, or a matching
function.
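
For example, the three matcher forms can be used interchangeably (a minimal
sketch, assuming `stree` is the `PDFStructTree` built above and the document
is a tagged PDF containing a table):

import re

tables = list(stree.find_all("Table"))                   # match by element name
cells = list(stree.find_all(re.compile(r"T[DH]")))       # match by regular expression
rows = list(stree.find_all(lambda el: el.type == "TR"))  # match by custom function
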
165 changes: 154 additions & 11 deletions pdfplumber/structure.py
@@ -1,15 +1,29 @@
import itertools
import logging
import re
from collections import deque
from dataclasses import asdict, dataclass, field
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Pattern,
Tuple,
Union,
)

from pdfminer.data_structures import NumberTree
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjRef, resolve1
from pdfminer.psparser import PSLiteral

from .utils import decode_text
from ._typing import T_bbox, T_obj
from .utils import decode_text, geometry

logger = logging.getLogger(__name__)

@@ -19,8 +33,73 @@
from .pdf import PDF


MatchFunc = Callable[["PDFStructElement"], bool]


def _find_all(
elements: Iterable["PDFStructElement"],
matcher: Union[str, Pattern[str], MatchFunc],
) -> Iterator["PDFStructElement"]:
"""
Common code for `find_all()` in trees and elements.
"""

def match_tag(x: "PDFStructElement") -> bool:
"""Match an element name."""
return x.type == matcher

def match_regex(x: "PDFStructElement") -> bool:
"""Match an element name by regular expression."""
return matcher.match(x.type) # type: ignore

if isinstance(matcher, str):
match_func = match_tag
elif isinstance(matcher, re.Pattern):
match_func = match_regex
else:
match_func = matcher # type: ignore
d = deque(elements)
while d:
el = d.popleft()
if match_func(el):
yield el
d.extendleft(reversed(el.children))


class Findable:
"""find() and find_all() methods that can be inherited to avoid
repeating oneself"""

children: List["PDFStructElement"]

def find_all(
self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Iterator["PDFStructElement"]:
"""Iterate depth-first over matching elements in subtree.
The `matcher` argument is either an element name, a regular
expression, or a function taking a `PDFStructElement` and
returning `True` if the element matches.
"""
return _find_all(self.children, matcher)

def find(
self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Optional["PDFStructElement"]:
"""Find the first matching element in subtree.
The `matcher` argument is either an element name, a regular
expression, or a function taking a `PDFStructElement` and
returning `True` if the element matches.
"""
try:
return next(_find_all(self.children, matcher))
except StopIteration:
return None


@dataclass
class PDFStructElement:
class PDFStructElement(Findable):
type: str
revision: Optional[int]
id: Optional[str]
@@ -36,9 +115,24 @@ class PDFStructElement:
def __iter__(self) -> Iterator["PDFStructElement"]:
return iter(self.children)

def all_mcids(self) -> Iterator[Tuple[Optional[int], int]]:
"""Collect all MCIDs (with their page numbers, if there are
multiple pages in the tree) inside a structure element.
"""
# Collect them depth-first to preserve ordering
for mcid in self.mcids:
yield self.page_number, mcid
d = deque(self.children)
while d:
el = d.popleft()
for mcid in el.mcids:
yield el.page_number, mcid
d.extendleft(reversed(el.children))

def to_dict(self) -> Dict[str, Any]:
"""Return a compacted dict representation."""
r = asdict(self)
# Prune empty values (does not matter in which order)
d = deque([r])
while d:
el = d.popleft()
@@ -54,7 +148,7 @@ class StructTreeMissing(ValueError):
pass


class PDFStructTree:
class PDFStructTree(Findable):
"""Parse the structure tree of a PDF.
The constructor takes a `pdfplumber.PDF` and optionally a
@@ -72,7 +166,7 @@ class PDFStructTree:
"""

page: Optional[PDFPage]
page: Optional["Page"]

def __init__(self, doc: "PDF", page: Optional["Page"] = None):
self.doc = doc.doc
@@ -88,7 +182,8 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None):
# span multiple pages, and the "Pg" attribute is *optional*,
# so this is the approved way to get a page's structure...
if page is not None:
self.page = page.page_obj
self.page = page
self.pages = {page.page_number: page}
self.page_dict = None
# ...EXCEPT that the ParentTree is sometimes missing, in which
# case we fall back to the non-approved way.
@@ -102,9 +197,9 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None):
# structure tree) then there is no `StructParents`.
# Note however that if there are XObjects in a page,
# *they* may have `StructParent` (not `StructParents`)
if "StructParents" not in self.page.attrs:
if "StructParents" not in self.page.page_obj.attrs:
return
parent_id = self.page.attrs["StructParents"]
parent_id = self.page.page_obj.attrs["StructParents"]
# NumberTree should have a `get` method like it does in pdf.js...
parent_array = resolve1(
next(array for num, array in parent_tree.values if num == parent_id)
@@ -113,8 +208,9 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None):
else:
self.page = None
# Overhead of creating pages shouldn't be too bad we hope!
self.pages = {page.page_number: page for page in doc.pages}
self.page_dict = {
page.page_obj.pageid: page.page_number for page in doc.pages
page.page_obj.pageid: page.page_number for page in self.pages.values()
}
self._parse_struct_tree()

@@ -246,7 +342,7 @@ def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
return page_objid in self.page_dict
if self.page is not None:
# We have to do this to satisfy mypy
if page_objid != self.page.pageid:
if page_objid != self.page.page_obj.pageid:
return False
return True

@@ -364,3 +460,50 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None:

def __iter__(self) -> Iterator[PDFStructElement]:
return iter(self.children)

def element_bbox(self, el: PDFStructElement) -> T_bbox:
"""Get the bounding box for an element for visual debugging."""
page = None
if self.page is not None:
page = self.page
elif el.page_number is not None:
page = self.pages[el.page_number]
bbox = el.attributes.get("BBox", None)
if page is not None and bbox is not None:
from .page import CroppedPage, _invert_box, _normalize_box

# Use secret knowledge of CroppedPage (cannot use
# page.height because it is the *cropped* dimension, but
# cropping does not actually translate coordinates)
bbox = _invert_box(
_normalize_box(bbox), page.mediabox[3] - page.mediabox[1]
)
# Use more secret knowledge of CroppedPage
if isinstance(page, CroppedPage):
rect = geometry.bbox_to_rect(bbox)
rects = page._crop_fn([rect])
if not rects:
raise IndexError("Element no longer on page")
return geometry.obj_to_bbox(rects[0])
else:
# Not sure why mypy complains here
return bbox # type: ignore
else:
mcid_objs = []
for page_number, mcid in el.all_mcids():
objects: Iterable[T_obj]
if page_number is None:
if page is not None:
objects = itertools.chain.from_iterable(page.objects.values())
else:
objects = [] # pragma: nocover
else:
objects = itertools.chain.from_iterable(
self.pages[page_number].objects.values()
)
for c in objects:
if c["mcid"] == mcid:
mcid_objs.append(c)
if not mcid_objs:
raise IndexError("No objects found") # pragma: nocover
return geometry.objects_to_bbox(mcid_objs)
3 changes: 2 additions & 1 deletion pdfplumber/utils/geometry.py
@@ -84,7 +84,8 @@ def clip_obj(obj: T_obj, bbox: T_bbox) -> Optional[T_obj]:
copy[attr] = dims[attr]

diff = dims["top"] - obj["top"]
copy["doctop"] = obj["doctop"] + diff
if "doctop" in copy:
copy["doctop"] = obj["doctop"] + diff
copy["width"] = copy["x1"] - copy["x0"]
copy["height"] = copy["bottom"] - copy["top"]

100 changes: 100 additions & 0 deletions tests/test_structure.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

import os
import re
import unittest
from collections import deque

@@ -863,6 +864,105 @@ def test_structure_tree_class(self):
doc_elem = next(iter(stree))
assert [k.type for k in doc_elem] == ["P", "P", "Figure"]

def test_find_all_tree(self):
"""
Test find_all() and find() on trees
"""
path = os.path.join(HERE, "pdfs/image_structure.pdf")
pdf = pdfplumber.open(path)
stree = PDFStructTree(pdf, pdf.pages[0])
figs = list(stree.find_all("Figure"))
assert len(figs) == 1
fig = stree.find("Figure")
assert fig == figs[0]
assert stree.find("Fogure") is None
figs = list(stree.find_all(re.compile(r"Fig.*")))
assert len(figs) == 1
figs = list(stree.find_all(lambda x: x.type == "Figure"))
assert len(figs) == 1
figs = list(stree.find_all("Foogure"))
assert len(figs) == 0
figs = list(stree.find_all(re.compile(r"Fog.*")))
assert len(figs) == 0
figs = list(stree.find_all(lambda x: x.type == "Flogger"))
assert len(figs) == 0

def test_find_all_element(self):
"""
Test find_all() and find() on elements
"""
path = os.path.join(HERE, "pdfs/pdf_structure.pdf")
pdf = pdfplumber.open(path)
stree = PDFStructTree(pdf)
for list_elem in stree.find_all("L"):
items = list(list_elem.find_all("LI"))
assert items
for item in items:
body = list(item.find_all("LBody"))
assert body
body1 = item.find("LBody")
assert body1 == body[0]
assert item.find("Loonie") is None

def test_all_mcids(self):
"""
Test all_mcids()
"""
path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf")
pdf = pdfplumber.open(path)
# Make sure we can get them with page numbers
stree = PDFStructTree(pdf)
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
pages = set(page for page, mcid in mcids)
assert 1 in pages
assert 2 in pages
# If we take only a single page there are no page numbers
# (FIXME: may wish to reconsider this API decision...)
page = pdf.pages[1]
stree = PDFStructTree(pdf, page)
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
pages = set(page for page, mcid in mcids)
assert None in pages
assert 1 not in pages
assert 2 not in pages
# Assure that we get the MCIDs for a content element
for p in sect.find_all("P"):
assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids)

def test_element_bbox(self):
"""
Test various ways of getting element bboxes
"""
path = os.path.join(HERE, "pdfs/pdf_structure.pdf")
pdf = pdfplumber.open(path)
stree = PDFStructTree(pdf)
# As BBox attribute
table = next(stree.find_all("Table"))
assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 555.3, 542.25)
# With child elements
tr = next(table.find_all("TR"))
assert tuple(stree.element_bbox(tr)) == (56.8, 495.9, 328.312, 507.9)
# From a specific page it should also work
stree = PDFStructTree(pdf, pdf.pages[0])
table = next(stree.find_all("Table"))
assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 555.3, 542.25)
tr = next(table.find_all("TR"))
assert tuple(stree.element_bbox(tr)) == (56.8, 495.9, 328.312, 507.9)
# Yeah but what happens if you crop the page?
page = pdf.pages[0].crop((10, 400, 500, 500))
stree = PDFStructTree(pdf, page)
table = next(stree.find_all("Table"))
# The element gets cropped too
assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 500, 500)
# And if you crop it out of the page?
page = pdf.pages[0].crop((0, 0, 560, 400))
stree = PDFStructTree(pdf, page)
table = next(stree.find_all("Table"))
with self.assertRaises(IndexError):
_ = stree.element_bbox(table)


class TestUnparsed(unittest.TestCase):
"""Test handling of PDFs with unparsed pages."""
