Skip to content

Commit

Permalink
Add function to group contiguous objects
Browse files Browse the repository at this point in the history
  • Loading branch information
turicas committed Apr 28, 2019
1 parent 74ed56e commit e9619f3
Showing 1 changed file with 59 additions and 17 deletions.
76 changes: 59 additions & 17 deletions rows/plugins/plugin_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,56 @@
from pytesseract import image_to_boxes
from PIL import Image

from rows.plugins.plugin_pdf import PDFBackend, TextObject, pdf_table_lines
from rows.plugins.plugin_pdf import group_objects, PDFBackend, TextObject, pdf_table_lines
from rows.plugins.utils import create_table


def join_text_group(group):
    """Join a list of `TextObject`s into one.

    The objects are concatenated in the order given. A single space is
    inserted between two consecutive objects when the horizontal gap between
    the previous object's `x1` and the next object's `x0` is at least the
    average letter width of the first object.

    Args:
        group: non-empty sequence of objects exposing `x0`, `y0`, `x1`, `y1`
            and `text` attributes.

    Returns:
        A new `TextObject` whose bounding box encloses every object in
        `group` and whose `text` is the joined text.

    Raises:
        IndexError: if `group` is an empty sequence.
    """

    first = group[0]
    # Average letter size of the first object. `max(..., 1)` keeps an object
    # with empty text from raising ZeroDivisionError.
    max_between = (first.x1 - first.x0) / max(len(first.text), 1)

    parts, last_x1 = [], None
    for position, obj in enumerate(group):
        # Only compare against a *previous* object: the first one has no
        # predecessor, so it must never be preceded by a space.
        if position and last_x1 + max_between <= obj.x0:
            parts.append(" ")
        parts.append(obj.text)
        last_x1 = obj.x1

    return TextObject(
        x0=min(obj.x0 for obj in group),
        y0=min(obj.y0 for obj in group),
        x1=max(obj.x1 for obj in group),
        y1=max(obj.y1 for obj in group),
        text="".join(parts),
    )


def group_contiguous_objects(objs, x_threshold, y_threshold):
    """Merge contiguous objects if they're close enough.

    The objects are sorted by `y0` and handed to `group_objects` together
    with `y_threshold` and the axis name "y" (presumably bucketing them into
    text lines -- confirm against `group_objects`). Inside each bucket the
    objects are sorted by `x0` and chained together for as long as the next
    object starts no further than `x_threshold` after the previous object's
    `x1`. Every chain is merged with `join_text_group`.

    Args:
        objs: iterable of objects exposing `x0`, `x1` and `y0` (plus whatever
            `join_text_group` reads). The iterable itself is not modified.
        x_threshold: maximum horizontal distance between the end of one
            object and the start of the next for them to be merged.
        y_threshold: threshold forwarded to `group_objects`.

    Yields:
        One merged object (as returned by `join_text_group`) per chain.
    """

    # `sorted` rather than `list.sort`: do not reorder the caller's list
    # behind its back, and accept any iterable.
    ordered = sorted(objs, key=lambda obj: obj.y0)
    y_groups = group_objects(ordered, y_threshold, "y")
    for y_items in y_groups.values():
        current_group, last_x1 = [], None
        for obj in sorted(y_items, key=lambda obj: obj.x0):
            # Gap too wide: the running chain is complete, flush it.
            if current_group and last_x1 + x_threshold < obj.x0:
                yield join_text_group(current_group)
                current_group = []
            current_group.append(obj)
            last_x1 = obj.x1
        if current_group:
            yield join_text_group(current_group)


class TesseractBackend(PDFBackend):

name = "tesseract"
Expand All @@ -36,12 +82,7 @@ def __init__(self, filename_or_fobj, language):

@cached_property
def document(self):
if hasattr(self.filename_or_fobj, "read"):
image = Image.open(self.filename_or_fobj)
else:
image = self.filename_or_fobj

return image
return Image.open(self.filename_or_fobj)

@cached_property
def number_of_pages(self):
Expand All @@ -51,6 +92,7 @@ def extract_text(self, page_numbers=None):
return "" # TODO: image_to_string

def objects(self, page_numbers=None, starts_after=None, ends_before=None):
_, total_y = self.document.size
header = "char left bottom right top page".split()
boxes = image_to_boxes(self.document, lang=self.language).splitlines()
text_objs = []
Expand All @@ -60,18 +102,18 @@ def objects(self, page_numbers=None, starts_after=None, ends_before=None):
if key != "char":
value = int(value)
row[key] = value
obj = TextObject(
x0=row["left"],
y0=row["bottom"],
x1=row["right"],
y1=row["top"],
text=row["char"],
text_objs.append(
TextObject(
x0=row["left"],
y0=total_y - row["bottom"],
x1=row["right"],
y1=total_y - row["top"],
text=row["char"],
)
)
text_objs.append(obj)

text_objs.sort(key=lambda obj: (obj.y0, obj.x0))
# TODO: group contiguous objects before yielding
yield text_objs
# TODO: custom thresholds
yield list(group_contiguous_objects(text_objs, 30, 12))

text_objects = objects

Expand Down

0 comments on commit e9619f3

Please sign in to comment.