Skip to content

Commit

Permalink
Esl 137 added boxes into table (#333)
Browse files Browse the repository at this point in the history
* ESL-137 added box extraction skeleton into scan table extraction

* ESL-138 ESL-137 a lot of table changes

- added CellWithMeta
- change output table structure, remove CellProperties in output
- change logic bbox extraction from image tables after debugging
- change output in CSV, HTML, TABBY, PDF, SCAN readers
- change all tests with tables
- fixed styles

* ESL-137 changed draw table script

* ESL-148 added script of table word boxes drawing

* TLDR-471 added angle rotation from PdfImageReader and Tables

* ESL-137 fixed unit-tests

* ESL-137 fixed after review; removing some unused functions

- fixed after review
- removing some unused functions

* ESL-137 update docs

* ESL-137 after review
  • Loading branch information
oksidgy authored and sunveil committed Oct 11, 2023
1 parent a106293 commit 4379fa7
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 54 deletions.
13 changes: 13 additions & 0 deletions dedoc/readers/docx_reader/data_structures/cell_property.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@


class CellProperty:
    """
    This class holds information about the table cell.

    It is a plain value holder: the constructor stores its three arguments
    unchanged as the attributes ``colspan``, ``rowspan`` and ``invisible``.
    """
    def __init__(self, colspan: int, rowspan: int, invisible: bool) -> None:
        """
        :param colspan: column span of the cell, stored as ``self.colspan``
        :param rowspan: row span of the cell, stored as ``self.rowspan``
        :param invisible: visibility flag of the cell, stored as ``self.invisible``
            (presumably set for cells hidden by a merge - confirm against the callers)
        """
        self.colspan = colspan
        self.rowspan = rowspan
        self.invisible = invisible

    def __repr__(self) -> str:
        # Readable representation for logs and debugging; does not affect equality or hashing.
        return f"{type(self).__name__}(colspan={self.colspan!r}, rowspan={self.rowspan!r}, invisible={self.invisible!r})"
96 changes: 43 additions & 53 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
import math
import os
import subprocess
from collections import namedtuple
from typing import List, Optional, Tuple

import numpy as np

from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError
from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
Expand All @@ -35,8 +35,6 @@
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.utils import calculate_file_hash

CellPropertyInfo = namedtuple("NamedTuple", "colspan, rowspan, invisible")


class PdfTabbyReader(PdfBaseReader):
"""
Expand Down Expand Up @@ -80,7 +78,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
lines, scan_tables, tables_cell_properties = self.__extract(path=path)
lines, tables, tables_on_images = self.__extract(path=path)
warnings = []
document_metadata = None

Expand All @@ -95,15 +93,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
if last_page != math.inf:
document_metadata["last_page"] = last_page

lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[])
tables = []
assert len(scan_tables) == len(tables_cell_properties)
for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties):
cell_properties = [[cellp for cellp in row] for row in table_cells_property]
metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
cells = [[cell for cell in row] for row in scan_table.matrix_cells]
table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties)
tables.append(table)
lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=[])

attachments = []
if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters):
Expand All @@ -117,56 +107,56 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

return self._postprocess(result)

def __extract(self, path: str, start_page: int = None, end_page: int = None) \
-> Tuple[List[LineWithMeta], List[ScanTable], List[List[List[CellPropertyInfo]]]]:
def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[Table], List[ScanTable]]:
file_hash = calculate_file_hash(path=path)
document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)

all_lines = []
all_tables = []
all_cell_properties = []
all_tables_on_images = []
for page in document.get("pages", []):
lines = self.__get_lines_with_location(page, file_hash)
if lines:
all_lines.extend(lines)
tables, cell_properties = self.__get_tables(page, file_hash)
if tables:
all_tables.extend(tables)
all_cell_properties.extend(cell_properties)

return all_lines, all_tables, all_cell_properties

def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[ScanTable], List[List[List[CellPropertyInfo]]]]:
page_lines = self.__get_lines_with_location(page, file_hash)
if page_lines:
all_lines.extend(page_lines)
page_tables, table_on_images = self.__get_tables(page, file_hash)
assert len(page_tables) == len(table_on_images)
if page_tables:
all_tables.extend(page_tables)
all_tables_on_images.extend(table_on_images)

return all_lines, all_tables, all_tables_on_images

def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[Table], List[ScanTable]]:
tables = []
cell_properties = []
tables_on_image = []
page_number = page["number"]
i = 0
for table in page["tables"]:
i += 1
for table_num, table in enumerate(page["tables"]):
x_top_left = table["x_top_left"]
y_top_left = table["y_top_left"]
x_bottom_right = x_top_left + table["width"]
y_bottom_right = y_top_left + table["height"]
order = table["order"]
order = table["order"] # TODO add table order into TableMetadata
rows = table["rows"]
cell_properties_json = table["cell_properties"]
cell_property_list = []

for cell_properties_row in cell_properties_json:
cell_property_row_list = []

for cell_property in cell_properties_row:
cell_property_info = CellPropertyInfo(cell_property["col_span"], cell_property["row_span"], bool(cell_property["invisible"]))
cell_property_row_list.append(cell_property_info)

cell_property_list.append(cell_property_row_list)

cells = [row for row in rows]
bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))

tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order))
cell_properties.append(cell_property_list)

return tables, cell_properties
cell_properties = table["cell_properties"]
assert len(rows) == len(cell_properties)

result_cells = []
for num_row, row in enumerate(rows):
assert len(row) == len(cell_properties[num_row])
result_row = []
for num_col, cell_text in enumerate(row):
result_row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=page_number, line_id=0))],
colspan=cell_properties[num_row][num_col]["col_span"],
rowspan=cell_properties[num_row][num_col]["row_span"],
invisible=bool(cell_properties[num_row][num_col]["invisible"])))

result_cells.append(result_row)
table_bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) # noqa TODO add table location into TableMetadata
tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, is_inserted=False)))
table_name = file_hash + str(page_number) + str(table_num)
tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order))

return tables, tables_on_image

def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
Expand Down Expand Up @@ -274,6 +264,6 @@ def _process_one_page(self,
image: np.ndarray,
parameters: ParametersForParseDoc,
page_number: int,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:

return [], [], []
return [], [], [], []
2 changes: 1 addition & 1 deletion dedoc/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = ""
__version__ = "0.11.2"

0 comments on commit 4379fa7

Please sign in to comment.