Patch notes (free text placed before the first "diff --git" header):

  * papermage/magelib/entity.py -- Entity.__getattr__ no longer calls
    intersect_by_span unconditionally. If the entity has spans it intersects
    by span first and retries with intersect_by_box only when that result is
    empty and the entity has boxes; if it has no spans it calls
    intersect_by_box directly. The existing "except ValueError" fallback to
    __getattribute__ still wraps the whole lookup.
  * pyproject.toml -- version bumped from 0.17.0 to 0.18.0.
  * tests/test_magelib/test_document.py -- adds
    test_cross_referencing_with_no_spans, which annotates span-less "chunks"
    and checks that `chunk.tokens` is resolved through boxes, including a
    chunk on another page that matches nothing.

  NOTE(review): this patch was recovered from a copy whose newlines and
  indentation had been collapsed to single spaces. Line breaks follow the
  hunk headers' line counts; indentation depths are reconstructed from the
  Python structure -- confirm against the target files before applying.

diff --git a/papermage/magelib/entity.py b/papermage/magelib/entity.py
index 8447019..adb8394 100644
--- a/papermage/magelib/entity.py
+++ b/papermage/magelib/entity.py
@@ -101,7 +101,13 @@ def __getattr__(self, name: str) -> List["Entity"]:
                 "Please use Entity.intersect_by_span or Entity.intersect_by_box instead."
             )
         try:
-            return self.intersect_by_span(name=name)
+            if len(self.spans) > 0:
+                intersection = self.intersect_by_span(name=name)
+                if len(intersection) == 0 and len(self.boxes) > 0:
+                    intersection = self.intersect_by_box(name=name)
+                return intersection
+            else:
+                return self.intersect_by_box(name=name)
         except ValueError:
             # maybe users just want some attribute of the Entity object
             return self.__getattribute__(name)
diff --git a/pyproject.toml b/pyproject.toml
index e1a6f9d..b45ec74 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = 'papermage'
-version = '0.17.0'
+version = '0.18.0'
 description = 'Papermage. Casting magic over scientific PDFs.'
 license = {text = 'Apache-2.0'}
 readme = 'README.md'
diff --git a/tests/test_magelib/test_document.py b/tests/test_magelib/test_document.py
index 20c2aed..fdf7ef0 100644
--- a/tests/test_magelib/test_document.py
+++ b/tests/test_magelib/test_document.py
@@ -150,6 +150,33 @@ def test_cross_referencing(self):
         self.assertListEqual(doc.intersect_by_box(query=doc.tokens[4], name="chunks"), [chunks[1]])
         self.assertListEqual(doc.intersect_by_box(query=doc.tokens[5], name="chunks"), [chunks[1]])
 
+    def test_cross_referencing_with_no_spans(self):
+        doc = Document("This is a test document!")
+        # boxes are in a top-left to bottom-right diagonal fashion (same page)
+        tokens = [
+            Entity.from_json({"spans": [[0, 4]], "boxes": [[0, 0, 0.5, 0.5, 0]]}),
+            Entity.from_json({"spans": [[5, 7]], "boxes": [[1, 1, 0.5, 0.5, 0]]}),
+            Entity.from_json({"spans": [[8, 9]], "boxes": [[2, 2, 0.5, 0.5, 0]]}),
+            Entity.from_json({"spans": [[10, 14]], "boxes": [[3, 3, 0.5, 0.5, 0]]}),
+            Entity.from_json({"spans": [[15, 23]], "boxes": [[4, 4, 0.5, 0.5, 0]]}),
+            Entity.from_json({"spans": [[23, 24]], "boxes": [[5, 5, 0.5, 0.5, 0]]}),
+        ]
+        # chunks have no spans
+        chunks = [
+            Entity.from_json({"boxes": [[0, 0, 2.01, 2.01, 0]]}),
+            Entity.from_json({"boxes": [[3.0, 3.0, 4.0, 4.0, 0]]}),
+            Entity.from_json({"boxes": [[0, 0, 10.0, 10.0, 1]]}),
+        ]
+        doc.annotate_layer(name="tokens", entities=tokens)
+        doc.annotate_layer(name="chunks", entities=chunks)
+
+        # getattr() should still work when no spans; defers to boxes
+        self.assertListEqual(doc.chunks[0].tokens, tokens[:3])
+        self.assertListEqual(doc.chunks[1].tokens, tokens[3:])
+
+        # last chunk is on a different page; intersects nothing
+        self.assertListEqual(doc.chunks[2].tokens, [])
+
     def test_cross_referencing_with_missing_entity_fields(self):
         """What happens when annotate a Doc with entiites missing spans or boxes?
         How does the cross-referencing operation behave?"""