From 6439810846f605f89cc090326fc8c86a8dbee8ed Mon Sep 17 00:00:00 2001 From: kyleclo Date: Sun, 6 Aug 2023 15:00:23 -0700 Subject: [PATCH 1/6] add find() method --- papermage/magelib/document.py | 24 ++++++++++++------------ papermage/magelib/indexer.py | 4 ++-- tests/test_magelib/test_document.py | 6 ++++-- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/papermage/magelib/document.py b/papermage/magelib/document.py index 8286b51..612f2cc 100644 --- a/papermage/magelib/document.py +++ b/papermage/magelib/document.py @@ -44,27 +44,27 @@ def __init__(self, symbols: str, metadata: Optional[Metadata] = None): def fields(self) -> List[str]: return list(self.__entity_span_indexers.keys()) + self.SPECIAL_FIELDS - def find_by_span(self, query: Union[Entity, Span], field_name: str) -> List[Entity]: - if isinstance(query, Entity): - return self.__entity_span_indexers[field_name].find(query=query) - elif isinstance(query, Span): + def find(self, query: Union[Span, Box], field_name: str) -> List[Entity]: + if isinstance(query, Span): return self.__entity_span_indexers[field_name].find(query=Entity(spans=[query])) - else: - raise TypeError(f"Unsupported query type {type(query)}") - - def find_by_box(self, query: Union[Entity, Box], field_name: str) -> List[Entity]: - if isinstance(query, Entity): - return self.__entity_box_indexers[field_name].find(query=query) elif isinstance(query, Box): return self.__entity_box_indexers[field_name].find(query=Entity(boxes=[query])) else: raise TypeError(f"Unsupported query type {type(query)}") + def find_by_span(self, query: Entity, field_name: str) -> List[Entity]: + # TODO: will rename this to `intersect_by_span` + return self.__entity_span_indexers[field_name].find(query=query) + + def find_by_box(self, query: Entity, field_name: str) -> List[Entity]: + # TODO: will rename this to `intersect_by_box` + return self.__entity_box_indexers[field_name].find(query=query) + def check_field_name_availability(self, field_name: str) 
-> None: if field_name in self.SPECIAL_FIELDS: raise AssertionError(f"{field_name} not allowed Document.SPECIAL_FIELDS.") if field_name in self.__entity_span_indexers.keys(): - raise AssertionError(f"{field_name} already exists. Try `is_overwrite=True`") + raise AssertionError(f"{field_name} already exists. Try `doc.remove_entity({field_name})` first.") if field_name in dir(self): raise AssertionError(f"{field_name} clashes with Document class properties.") @@ -78,9 +78,9 @@ def annotate_entity(self, field_name: str, entities: List[Entity]) -> None: entity.doc = self entity.id = i - setattr(self, field_name, entities) self.__entity_span_indexers[field_name] = EntitySpanIndexer(entities=entities) self.__entity_box_indexers[field_name] = EntityBoxIndexer(entities=entities) + setattr(self, field_name, entities) def remove_entity(self, field_name: str): for entity in getattr(self, field_name): diff --git a/papermage/magelib/indexer.py b/papermage/magelib/indexer.py index 463aae2..f4dd478 100644 --- a/papermage/magelib/indexer.py +++ b/papermage/magelib/indexer.py @@ -78,7 +78,7 @@ def _ensure_disjoint(self) -> None: def find(self, query: Entity) -> List[Entity]: if not isinstance(query, Entity): - raise ValueError(f"EntityIndexer only works with `query` that is Entity type") + raise TypeError(f"EntityIndexer only works with `query` that is Entity type") if not query.spans: return [] @@ -159,7 +159,7 @@ def _ensure_disjoint(self) -> None: def find(self, query: Entity) -> List[Entity]: if not isinstance(query, Entity): - raise ValueError(f"EntityBoxIndexer only works with `query` that is Entity type") + raise TypeError(f"EntityBoxIndexer only works with `query` that is Entity type") if not query.boxes: return [] diff --git a/tests/test_magelib/test_document.py b/tests/test_magelib/test_document.py index 76db0a0..17399ac 100644 --- a/tests/test_magelib/test_document.py +++ b/tests/test_magelib/test_document.py @@ -203,15 +203,17 @@ def test_query(self): # test query by 
span self.assertListEqual( doc.find_by_span(query=doc.chunks[0], field_name="tokens"), - doc.find_by_span(query=doc.chunks[0].spans[0], field_name="tokens"), + doc.find(query=doc.chunks[0].spans[0], field_name="tokens"), ) # test query by box self.assertListEqual( doc.find_by_box(query=doc.chunks[0], field_name="tokens"), - doc.find_by_box(query=doc.chunks[0].boxes[0], field_name="tokens"), + doc.find(query=doc.chunks[0].boxes[0], field_name="tokens"), ) # calling wrong method w input type should fail with self.assertRaises(TypeError): doc.find_by_box(query=doc.chunks[0].spans[0], field_name="tokens") with self.assertRaises(TypeError): doc.find_by_span(query=doc.chunks[0].boxes[0], field_name="tokens") + with self.assertRaises(TypeError): + doc.find(query=doc.chunks[0], field_name="tokens") From 5ecf0ec4b9b62a8bac919ae9e5e2d9244cae5e5d Mon Sep 17 00:00:00 2001 From: kyleclo Date: Sun, 6 Aug 2023 15:20:31 -0700 Subject: [PATCH 2/6] add recipe run() --- papermage/magelib/__init__.py | 8 +++++-- papermage/magelib/document.py | 2 ++ papermage/recipes/core_recipe.py | 38 ++++++++++++++++++++++++++++---- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/papermage/magelib/__init__.py b/papermage/magelib/__init__.py index b1c8626..d8a19c8 100644 --- a/papermage/magelib/__init__.py +++ b/papermage/magelib/__init__.py @@ -25,7 +25,9 @@ RowsFieldName, BlocksFieldName, ImagesFieldName, - WordsFieldName + WordsFieldName, + SentencesFieldName, + ParagraphsFieldName ) __all__ = [ @@ -47,5 +49,7 @@ "TokensFieldName", "RowsFieldName", "BlocksFieldName", - "WordsFieldName" + "WordsFieldName", + "SentencesFieldName", + "ParagraphsFieldName", ] diff --git a/papermage/magelib/document.py b/papermage/magelib/document.py index 612f2cc..77245de 100644 --- a/papermage/magelib/document.py +++ b/papermage/magelib/document.py @@ -29,6 +29,8 @@ RowsFieldName = "rows" BlocksFieldName = "blocks" WordsFieldName = "words" +SentencesFieldName = "sentences" +ParagraphsFieldName = 
"paragraphs" class Document: diff --git a/papermage/recipes/core_recipe.py b/papermage/recipes/core_recipe.py index d11a680..3aa6acd 100644 --- a/papermage/recipes/core_recipe.py +++ b/papermage/recipes/core_recipe.py @@ -5,10 +5,27 @@ """ import logging +from pathlib import Path +from typing import Union logger = logging.getLogger(__name__) -from papermage.magelib import Document, Entity +from papermage.magelib import ( + BlocksFieldName, + Document, + EntitiesFieldName, + Entity, + ImagesFieldName, + MetadataFieldName, + PagesFieldName, + ParagraphsFieldName, + RelationsFieldName, + RowsFieldName, + SentencesFieldName, + SymbolsFieldName, + TokensFieldName, + WordsFieldName, +) from papermage.parsers.pdfplumber_parser import PDFPlumberParser from papermage.predictors import ( HFBIOTaggerPredictor, @@ -46,6 +63,18 @@ def __init__( self.sent_predictor = PysbdSentencePredictor() logger.info("Finished instantiating recipe") + def run(self, pdf: Union[str, Path, Document]) -> Document: + if isinstance(pdf, str): + pdf = Path(pdf) + assert pdf.exists(), f"File {pdf} does not exist." + assert isinstance( + pdf, (Document, Path) + ), f"Unsupported type {type(pdf)} for pdf; should be a Document or a path to a PDF file." 
+ if isinstance(pdf, Path): + self.from_path(str(pdf)) + else: + raise NotImplementedError("Document input not yet supported.") + def from_path(self, pdfpath: str) -> Document: logger.info("Parsing document...") doc = self.parser.parse(input_pdf_path=pdfpath) @@ -53,14 +82,15 @@ def from_path(self, pdfpath: str) -> Document: logger.info("Rasterizing document...") images = self.rasterizer.rasterize(input_pdf_path=pdfpath, dpi=72) doc.annotate_images(images=list(images)) + self.rasterizer.attach_images(images=images, doc=doc) logger.info("Predicting words...") words = self.word_predictor.predict(doc=doc) - doc.annotate_entity(field_name="words", entities=words) + doc.annotate_entity(field_name=WordsFieldName, entities=words) logger.info("Predicting sentences...") sentences = self.sent_predictor.predict(doc=doc) - doc.annotate_entity(field_name="sentences", entities=sentences) + doc.annotate_entity(field_name=SentencesFieldName, entities=sentences) logger.info("Predicting blocks...") layout = self.effdet_publaynet_predictor.predict(doc=doc) @@ -75,7 +105,7 @@ def from_path(self, pdfpath: str) -> Document: # blocks are used by IVILA, so we need to annotate them as well # copy the entities because they already have docs attached blocks = [Entity.from_json(ent.to_json()) for ent in layout + equations] - doc.annotate_entity(field_name="blocks", entities=blocks) + doc.annotate_entity(field_name=BlocksFieldName, entities=blocks) logger.info("Predicting vila...") vila_entities = self.ivila_predictor.predict(doc=doc) From f354d82231f906013934b6c58791c8e30f0dcc3a Mon Sep 17 00:00:00 2001 From: kyleclo Date: Sun, 6 Aug 2023 15:20:42 -0700 Subject: [PATCH 3/6] quickstart notebook --- examples/quick_start_demo.ipynb | 158 ++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 examples/quick_start_demo.ipynb diff --git a/examples/quick_start_demo.ipynb b/examples/quick_start_demo.ipynb new file mode 100644 index 0000000..cfc1a77 --- /dev/null +++ 
b/examples/quick_start_demo.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quick start demo" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's familiarize ourselves with the core data classes --- Document, Layers, and Entities." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from papermage.magelib import Document\n", + "\n", + "doc = Document(\"This is a sentence. This is another sentence.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'This is a sentence. This is another sentence.'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc.symbols" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But documents are highly-structured " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Recipes to get Documents\n", + "\n", + "Documents are annoying to build from scratch. But we don't have to! Let's use a predefined recipe:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kylel/miniconda3/envs/papermage/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/Users/kylel/miniconda3/envs/papermage/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator OneHotEncoder from version 1.2.2 when using version 1.1.2. 
This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", + "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", + " warnings.warn(\n", + "/Users/kylel/miniconda3/envs/papermage/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator MaxAbsScaler from version 1.2.2 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", + "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", + " warnings.warn(\n", + "/Users/kylel/miniconda3/envs/papermage/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator LinearSVC from version 1.2.2 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", + "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", + " warnings.warn(\n", + " 0%| | 0/4 [00:00" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc.pages[0].images[0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "papermage", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f5dc25250912d164fa9a612a4de391d392588e94 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Sun, 6 Aug 2023 15:21:00 -0700 Subject: [PATCH 4/6] toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 95a80a6..0d549dc 100644 --- a/pyproject.toml 
+++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 'papermage' -version = '0.9.0' +version = '0.10.0' description = 'Papermage. Casting magic over scientific PDFs.' license = {text = 'Apache-2.0'} readme = 'README.md' From 6bb7349d63f6408229137c76fa93df8397899c00 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Sun, 6 Aug 2023 15:25:57 -0700 Subject: [PATCH 5/6] doc.annotate() --- papermage/magelib/document.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/papermage/magelib/document.py b/papermage/magelib/document.py index 77245de..a9db62d 100644 --- a/papermage/magelib/document.py +++ b/papermage/magelib/document.py @@ -73,6 +73,12 @@ def check_field_name_availability(self, field_name: str) -> None: def get_entity(self, field_name: str) -> List[Entity]: return getattr(self, field_name) + def annotate(self, field_name: str, entiites: List[Entity]) -> None: + if isinstance(entiites[0], Entity): + self.annotate_entity(field_name=field_name, entities=entiites) + else: + raise NotImplementedError(f"Unsupported entity type {type(entiites[0])}") + def annotate_entity(self, field_name: str, entities: List[Entity]) -> None: self.check_field_name_availability(field_name=field_name) From c9da4efb2345726dd2603960a7a89767a4c71c2e Mon Sep 17 00:00:00 2001 From: kyleclo Date: Sun, 6 Aug 2023 15:42:33 -0700 Subject: [PATCH 6/6] pr fixes --- papermage/magelib/document.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/papermage/magelib/document.py b/papermage/magelib/document.py index a9db62d..666b9a8 100644 --- a/papermage/magelib/document.py +++ b/papermage/magelib/document.py @@ -66,18 +66,18 @@ def check_field_name_availability(self, field_name: str) -> None: if field_name in self.SPECIAL_FIELDS: raise AssertionError(f"{field_name} not allowed Document.SPECIAL_FIELDS.") if field_name in self.__entity_span_indexers.keys(): - raise AssertionError(f"{field_name} already exists. 
Try `doc.remove_entity({field_name})` first.") + raise AssertionError(f'{field_name} already exists. Try `doc.remove_entity("{field_name}")` first.') if field_name in dir(self): raise AssertionError(f"{field_name} clashes with Document class properties.") def get_entity(self, field_name: str) -> List[Entity]: return getattr(self, field_name) - def annotate(self, field_name: str, entiites: List[Entity]) -> None: - if isinstance(entiites[0], Entity): - self.annotate_entity(field_name=field_name, entities=entiites) + def annotate(self, field_name: str, entities: List[Entity]) -> None: + if all(isinstance(e, Entity) for e in entities): + self.annotate_entity(field_name=field_name, entities=entities) else: - raise NotImplementedError(f"Unsupported entity type {type(entiites[0])}") + raise NotImplementedError(f"entity list contains non-entities: {[type(e) for e in entities]}") def annotate_entity(self, field_name: str, entities: List[Entity]) -> None: self.check_field_name_availability(field_name=field_name)