Skip to content

Commit

Permalink
Merge pull request #43 from allenai/soldni/example
Browse files Browse the repository at this point in the history
demo
  • Loading branch information
soldni committed Aug 7, 2023
2 parents c9a4084 + 47f6f66 commit 89514dc
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 58 deletions.
215 changes: 162 additions & 53 deletions examples/quick_start_demo.ipynb

Large diffs are not rendered by default.

Binary file added examples/res/fig1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/res/fig3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
38 changes: 33 additions & 5 deletions papermage/recipes/core_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

import logging
from pathlib import Path
from typing import Union
from typing import Dict, List, Union
import warnings
from papermage.predictors.sklearn_predictors.word_predictor import make_text

from papermage.utils.annotate import group_by
Expand All @@ -21,6 +22,7 @@
Document,
EntitiesFieldName,
Entity,
Box,
EquationsFieldName,
FiguresFieldName,
FootersFieldName,
Expand Down Expand Up @@ -90,7 +92,10 @@ def __init__(
self.parser = PDFPlumberParser()
self.rasterizer = PDF2ImageRasterizer()

self.word_predictor = SVMWordPredictor.from_path(svm_word_predictor_path)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.word_predictor = SVMWordPredictor.from_path(svm_word_predictor_path)

self.effdet_publaynet_predictor = LPBlockPredictor.from_pretrained(effdet_publaynet_predictor_path)
# self.effdet_mfd_predictor = LPBlockPredictor.from_pretrained(effdet_mfd_predictor_path)
self.ivila_predictor = IVILATokenClassificationPredictor.from_pretrained(ivila_predictor_path)
Expand Down Expand Up @@ -126,6 +131,7 @@ def from_path(self, pdfpath: str) -> Document:

def from_doc(self, doc: Document) -> Document:
self.logger.info("Predicting words...")

words = self.word_predictor.predict(doc=doc)
doc.annotate_entity(field_name=WordsFieldName, entities=words)

Expand All @@ -134,15 +140,37 @@ def from_doc(self, doc: Document) -> Document:
doc.annotate_entity(field_name=SentencesFieldName, entities=sentences)

self.logger.info("Predicting blocks...")
blocks = self.effdet_publaynet_predictor.predict(doc=doc)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
blocks = self.effdet_publaynet_predictor.predict(doc=doc)
doc.annotate_entity(field_name=BlocksFieldName, entities=blocks)

self.logger.info("Predicting vila...")
vila_entities = self.ivila_predictor.predict(doc=doc)
doc.annotate_entity(field_name="vila_entities", entities=vila_entities)

for entity in vila_entities:
entity.boxes = [
Box.create_enclosing_box(
[b for t in doc.find_by_span(entity, field_name=TokensFieldName) for b in t.boxes]
)
]
entity.text = make_text(entity=entity, document=doc)
doc.annotate_entity(field_name="vila_entities", entities=vila_entities)
preds = group_by(entities=vila_entities, metadata_field="label", metadata_values_map=VILA_LABELS_MAP)
doc.annotate(*preds)

return doc


if __name__ == "__main__":
    # Command-line entry point: run the core recipe on a single PDF and
    # serialize the resulting document as indented JSON.
    import argparse
    import json
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--pdf", required=True, type=str, help="Path to PDF file.")
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Path to output JSON file. If omitted, JSON is written to stdout.",
    )
    args = parser.parse_args()

    recipe = CoreRecipe()
    doc = recipe.from_path(pdfpath=args.pdf)
    payload = doc.to_json()

    if args.output is None:
        # --output is optional; without this branch open(None, "w") raised
        # TypeError only after the whole recipe had already run.
        json.dump(payload, sys.stdout, indent=2)
        sys.stdout.write("\n")
    else:
        with open(args.output, "w") as f:
            json.dump(payload, f, indent=2)
3 changes: 3 additions & 0 deletions papermage/visualizers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .visualizer import plot_entities_on_page

__all__ = ["plot_entities_on_page"]
Binary file added tests/fixtures/papermage.pdf
Binary file not shown.

0 comments on commit 89514dc

Please sign in to comment.