Skip to content

Commit

Permalink
Make Entity.layers a little more magical by checking if spans and box…
Browse files Browse the repository at this point in the history
…es exist on both layers (#74)

* Make Entity.layers a little more magical by checking if spans and boxes exist

* add tests for new getattr() behavior

* increment

---------

Co-authored-by: kyleclo <kyleclo@uw.edu>
  • Loading branch information
josephcc and kyleclo committed Mar 17, 2024
1 parent 4cb681e commit 0e8c796
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 2 deletions.
8 changes: 7 additions & 1 deletion papermage/magelib/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,13 @@ def __getattr__(self, name: str) -> List["Entity"]:
"Please use Entity.intersect_by_span or Entity.intersect_by_box instead."
)
try:
return self.intersect_by_span(name=name)
if len(self.spans) > 0:
intersection = self.intersect_by_span(name=name)
if len(intersection) == 0 and len(self.boxes) > 0:
intersection = self.intersect_by_box(name=name)
return intersection
else:
return self.intersect_by_box(name=name)
except ValueError:
# maybe users just want some attribute of the Entity object
return self.__getattribute__(name)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = 'papermage'
version = '0.17.0'
version = '0.18.0'
description = 'Papermage. Casting magic over scientific PDFs.'
license = {text = 'Apache-2.0'}
readme = 'README.md'
Expand Down
27 changes: 27 additions & 0 deletions tests/test_magelib/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,33 @@ def test_cross_referencing(self):
self.assertListEqual(doc.intersect_by_box(query=doc.tokens[4], name="chunks"), [chunks[1]])
self.assertListEqual(doc.intersect_by_box(query=doc.tokens[5], name="chunks"), [chunks[1]])

def test_cross_referencing_with_no_spans(self):
    """Attribute-style layer lookup still works for entities that carry only boxes."""
    doc = Document("This is a test document!")

    # Tokens have both spans and boxes. Token i gets a 0.5 x 0.5 box anchored
    # at (i, i), so the boxes run along the top-left to bottom-right diagonal,
    # all on the same page (last box field is 0).
    token_spans = [(0, 4), (5, 7), (8, 9), (10, 14), (15, 23), (23, 24)]
    tokens = [
        Entity.from_json({"spans": [[start, end]], "boxes": [[i, i, 0.5, 0.5, 0]]})
        for i, (start, end) in enumerate(token_spans)
    ]

    # Chunks deliberately carry boxes only -- no spans at all.
    chunk_boxes = [
        [0, 0, 2.01, 2.01, 0],
        [3.0, 3.0, 4.0, 4.0, 0],
        [0, 0, 10.0, 10.0, 1],  # different page from every token
    ]
    chunks = [Entity.from_json({"boxes": [box]}) for box in chunk_boxes]

    doc.annotate_layer(name="tokens", entities=tokens)
    doc.annotate_layer(name="chunks", entities=chunks)

    # With no spans on the chunk, `chunk.tokens` should defer to box
    # intersection. The last chunk is on another page and so matches nothing.
    expected_tokens = [tokens[:3], tokens[3:], []]
    for idx, expected in enumerate(expected_tokens):
        self.assertListEqual(doc.chunks[idx].tokens, expected)

def test_cross_referencing_with_missing_entity_fields(self):
"""What happens when annotate a Doc with entiites missing spans or boxes?
How does the cross-referencing operation behave?"""
Expand Down

0 comments on commit 0e8c796

Please sign in to comment.