Skip to content

Commit

Permalink
Tabby remove frame (#501)
Browse files Browse the repository at this point in the history
Co-authored-by: alexander1999-hub <golodkov.ao@phystech.edu>
Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
Co-authored-by: Belyaeva Oksana <belyaeva@ispras.ru>
  • Loading branch information
4 people authored Nov 14, 2024
1 parent 75739f4 commit cb701b0
Show file tree
Hide file tree
Showing 13 changed files with 232 additions and 137 deletions.
6 changes: 3 additions & 3 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional

from dedoc.api.schema.cell_with_meta import CellWithMeta as ApiCellWithMeta
from dedoc.data_structures.annotation import Annotation
Expand All @@ -20,14 +20,14 @@ class CellWithMeta(Serializable):
:vartype rowspan: int
:vartype invisible: bool
"""
def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
def __init__(self, lines: Optional[List[LineWithMeta]], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
"""
:param lines: textual lines of the cell
:param colspan: number of columns to span like in HTML format
:param rowspan: number of rows to span like in HTML format
:param invisible: indicator for displaying or hiding cell text
"""
self.lines: List[LineWithMeta] = lines
self.lines: List[LineWithMeta] = [] if lines is None else lines
self.colspan: int = colspan
self.rowspan: int = rowspan
self.invisible: bool = invisible
Expand Down
25 changes: 9 additions & 16 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from dedocutils.data_structures import BBox

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.line_with_meta import LineWithMeta


class Cell:
class Cell(CellWithMeta):

@staticmethod
def copy_from(cell: "Cell",
Expand Down Expand Up @@ -41,35 +42,27 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)
if self.con_coord:
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)

def __init__(self,
x_top_left: int,
x_bottom_right: int,
y_top_left: int,
y_bottom_right: int,
id_con: int = -1,
lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False,
is_attribute_required: bool = False,
rotated_angle: int = 0,
uid: str = None,
def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None,
contour_coord: Optional[BBox] = None) -> None:

import uuid

assert x_top_left <= x_bottom_right
assert y_top_left <= y_bottom_right

self.lines = [] if lines is None else lines
super().__init__(lines)

self.x_top_left = x_top_left
self.x_bottom_right = x_bottom_right
self.y_top_left = y_top_left
self.y_bottom_right = y_bottom_right
self.id_con = id_con
self.lines = [] if lines is None else lines
self.is_attribute = is_attribute
self.is_attribute_required = is_attribute_required
self.rotated_angle = rotated_angle
self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid
self.colspan = 1
self.rowspan = 1
self.invisible = False
self.con_coord = contour_coord or BBox(0, 0, 0, 0)

def __str__(self) -> str:
Expand Down
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
self.page_number = page_number
self.bbox = bbox
self.name = name
# TODO put self.order (change LineWithLocation, PdfImageAttachment, ScanTable)
self.rotated_angle = rotated_angle

def shift(self, shift_x: int, shift_y: int) -> None:
Expand Down
14 changes: 12 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, List
from typing import Any, List, Optional

from dedocutils.data_structures import BBox

Expand All @@ -10,7 +10,8 @@


class ScanTable:
def __init__(self, page_number: int, matrix_cells: List[List[Cell]] = None, bbox: BBox = None, name: str = "", order: int = -1) -> None:
def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
name: str = "", order: int = -1) -> None:
self.matrix_cells = matrix_cells
self.page_number = page_number
self.locations = []
Expand All @@ -27,6 +28,15 @@ def extended(self, table: "ScanTable") -> None:
# extend order
self.order = max(self.order, table.order)

def check_on_cell_instance(self) -> bool:
    """
    Check whether the table matrix is filled with `Cell` objects.

    Only the first cell of the first row is inspected.

    :return: True if `self.matrix_cells` has a non-empty first row whose first element is a `Cell` instance,
        False otherwise (including the case when the matrix is None or empty)
    """
    # matrix_cells is None by default in the constructor, so calling len() on it unconditionally would raise TypeError
    if not self.matrix_cells or not self.matrix_cells[0]:
        return False
    return isinstance(self.matrix_cells[0][0], Cell)

def to_table(self) -> Table:
metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]
Expand Down
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable


ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"orient_analysis_cells",
"orient_cell_angle",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import logging
from typing import List

Expand Down Expand Up @@ -155,24 +156,26 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
# condition 2. Exclusion of the duplicated header (if any)
attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells)
attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells)
t2_update = copy.deepcopy(t2)
if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
t2.matrix_cells = t2.matrix_cells[len(attr2):]
t2_update.matrix_cells = t2_update.matrix_cells[len(attr2):]

if len(t2.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
if len(t2_update.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
return False

TableAttributeExtractor.clear_attributes(t2.matrix_cells)
TableAttributeExtractor.clear_attributes(t2_update.matrix_cells)

# condition 3. Number of columns should be equal
if len(t1.matrix_cells[-1]) != len(t2.matrix_cells[0]):
if len(t1.matrix_cells[-1]) != len(t2_update.matrix_cells[0]):
if self.config.get("debug_mode", False):
self.logger.debug("Different count column")
return False

# condition 4. Comparison of the widths of last and first rows
if not self.__is_equal_width_cells(t1.matrix_cells, t2.matrix_cells):
if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.matrix_cells, t2_update.matrix_cells):
if self.config.get("debug_mode", False):
self.logger.debug("Different width columns")
return False

t2.matrix_cells = copy.deepcopy(t2_update.matrix_cells) # save changes
return True
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,22 @@ def is_equal_attributes(attr1: List[List[Cell]], attr2: List[List[Cell]], thr_si

return True

@staticmethod
def check_have_attributes(matrix_table: List[List[Cell]]) -> bool:
    """
    Check whether the cells of the table matrix carry the `is_attribute` field.

    Only the first cell of the first row is inspected.

    :param matrix_table: matrix of table cells (list of rows)
    :return: True if the first cell of the first row has the `is_attribute` attribute,
        False otherwise (including the case when the matrix is None or has no cells in the first row)
    """
    # a missing (None) matrix is treated the same way as an empty one instead of failing on len()
    if not matrix_table or not matrix_table[0]:
        return False
    return hasattr(matrix_table[0][0], "is_attribute")

@staticmethod
def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:

if not TableAttributeExtractor.check_have_attributes(matrix_table):
return matrix_table[:1]

header_rows = len(matrix_table)
for (i, row) in enumerate(matrix_table):
attrs = [cell for cell in row if cell.is_attribute]
Expand All @@ -44,6 +58,9 @@ def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:

@staticmethod
def clear_attributes(matrix_table: List[List[Cell]]) -> None:
if not TableAttributeExtractor.check_have_attributes(matrix_table):
return

for row in matrix_table:
for cell in row:
cell.is_attribute = False
Expand Down
107 changes: 76 additions & 31 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os.path
from typing import List, Optional, Tuple

from dedocutils.data_structures import BBox
Expand Down Expand Up @@ -62,13 +63,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
warnings = []

with tempfile.TemporaryDirectory() as tmp_dir:
lines, tables, tables_on_images, attachments, document_metadata = self.__extract(
path=file_path,
parameters=parameters,
warnings=warnings,
tmp_dir=tmp_dir
)
lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=attachments)
lines, tables, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings, tmp_dir=tmp_dir)

if get_param_with_attachments(parameters) and self.attachment_extractor.can_extract(file_path):
attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
Expand All @@ -79,14 +74,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure

return self._postprocess(result)

def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
def __extract(self, path: str, parameters: dict, warnings: List[str], tmp_dir: str)\
-> Tuple[List[LineWithMeta], List[Table], List[PdfImageAttachment], Optional[dict]]:
import math
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import calculate_file_hash
from dedoc.utils.parameter_utils import get_param_page_slice, get_param_with_attachments
from dedoc.utils.parameter_utils import get_param_need_gost_frame_analysis

all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
all_lines, all_tables, all_scan_tables, all_attached_images = [], [], [], []
with_attachments = get_param_with_attachments(parameters)
document_metadata = None

Expand All @@ -104,40 +100,70 @@ def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
document_metadata["last_page"] = last_page

if empty_page_limit:
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata
return all_lines, all_tables, all_attached_images, document_metadata

remove_gost_frame = get_param_need_gost_frame_analysis(parameters)
gost_json_path = self.__save_gost_frame_boxes_to_json(first_page=first_page, last_page=last_page, page_count=page_count, tmp_dir=tmp_dir, path=path) \
if remove_gost_frame else ""

# in java tabby reader page numeration starts with 1, end_page is included
first_tabby_page = first_page + 1 if first_page is not None else 1
last_tabby_page = page_count if (last_page is None) or (last_page is not None and last_page > page_count) else last_page
self.logger.info(f"Reading PDF pages from {first_tabby_page} to {last_tabby_page}")
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page, tmp_dir=tmp_dir)
document = self.__process_pdf(path=path,
start_page=first_tabby_page,
end_page=last_tabby_page,
tmp_dir=tmp_dir,
gost_json_path=gost_json_path,
remove_frame=remove_gost_frame)

pages = document.get("pages", [])
for page in pages:
page_lines = self.__get_lines_with_location(page, file_hash)
if page_lines:
all_lines.extend(page_lines)
page_tables, table_on_images = self.__get_tables(page)
assert len(page_tables) == len(table_on_images)
if page_tables:
all_tables.extend(page_tables)
all_tables_on_images.extend(table_on_images)
scan_tables = self.__get_tables(page)
all_scan_tables.extend(scan_tables)

attached_images = self.__get_attached_images(page=page, parameters=parameters, path=path) if with_attachments else []
if attached_images:
all_attached_images.extend(attached_images)

return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata
mp_tables = self.table_recognizer.convert_to_multipages_tables(all_scan_tables, lines_with_meta=all_lines)
all_lines = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=all_attached_images)

tables = [scan_table.to_table() for scan_table in mp_tables]

return all_lines, tables, all_attached_images, document_metadata

def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: Optional[int], page_count: int, path: str, tmp_dir: str) -> str:
from joblib import Parallel, delayed
import json

first_page = 0 if first_page is None or first_page < 0 else first_page
last_page = page_count if (last_page is None) or (last_page is not None and last_page > page_count) else last_page
images = self._get_images(path, first_page, last_page)

gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)

result_dict = {
page_number: {**page_data[1].to_dict(), **{"original_image_width": page_data[2][1], "original_image_height": page_data[2][0]}}
for page_number, page_data in enumerate(gost_analyzed_images, start=first_page)
}

def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
result_json_path = os.path.join(tmp_dir, "gost_frame_bboxes.json")
with open(result_json_path, "w") as f:
json.dump(result_dict, f)

return result_json_path

def __get_tables(self, page: dict) -> List[ScanTable]:
import uuid
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.table_metadata import TableMetadata

tables = []
tables_on_image = []
scan_tables = []
page_number = page["number"]
page_width = int(page["width"])
page_height = int(page["height"])
Expand All @@ -149,7 +175,7 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
cell_properties = table["cell_properties"]
assert len(rows) == len(cell_properties)

result_cells = []
cells = []
for num_row, row in enumerate(rows):
assert len(row) == len(cell_properties[num_row])

Expand All @@ -161,20 +187,22 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
for c in cell_blocks:
cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"]))
annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height))
"""
TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable"
https://jira.intra.ispras.ru/browse/TLDR-851
"""

result_row.append(CellWithMeta(
lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)],
colspan=cell_properties[num_row][num_col]["col_span"],
rowspan=cell_properties[num_row][num_col]["row_span"],
invisible=bool(cell_properties[num_row][num_col]["invisible"])
))
result_cells.append(result_row)
cells.append(result_row)

table_name = str(uuid.uuid4())
tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, uid=table_name)))
tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order))
scan_tables.append(ScanTable(page_number=page_number, matrix_cells=cells, bbox=table_bbox, name=str(uuid.uuid4()), order=order))

return tables, tables_on_image
return scan_tables

def __get_attached_images(self, page: dict, parameters: dict, path: str) -> List[PdfImageAttachment]:
import os
Expand Down Expand Up @@ -291,10 +319,20 @@ def __jar_path(self) -> str:
import os
return os.environ.get("TABBY_JAR", self.default_config["JAR_PATH"])

def __run(self, path: str, tmp_dir: str, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes:
def __run(self,
path: str,
tmp_dir: str,
encoding: str = "utf-8",
start_page: int = None,
end_page: int = None,
remove_frame: bool = False,
gost_json_path: str = ""
) -> bytes:
import subprocess

args = ["java"] + ["-jar", self.__jar_path(), "-i", path, "-tmp", f"{tmp_dir}/"]
if remove_frame:
args += ["-rf", gost_json_path]
if start_page is not None and end_page is not None:
args += ["-sp", str(start_page), "-ep", str(end_page)]
try:
Expand All @@ -307,11 +345,18 @@ def __run(self, path: str, tmp_dir: str, encoding: str = "utf-8", start_page: in
except subprocess.CalledProcessError as e:
raise TabbyPdfError(e.stderr.decode(encoding))

def __process_pdf(self, path: str, tmp_dir: str, start_page: int = None, end_page: int = None) -> dict:
def __process_pdf(self,
path: str,
tmp_dir: str,
start_page: int = None,
end_page: int = None,
gost_json_path: str = "",
remove_frame: bool = False) -> dict:
import json
import os

self.__run(path=path, start_page=start_page, end_page=end_page, tmp_dir=tmp_dir)
self.__run(path=path, start_page=start_page, end_page=end_page, tmp_dir=tmp_dir, remove_frame=remove_frame, gost_json_path=gost_json_path)

with open(os.path.join(tmp_dir, "data.json"), "r") as response:
document = json.load(response)

Expand Down
Binary file not shown.
Loading

0 comments on commit cb701b0

Please sign in to comment.