Skip to content

Commit

Permalink
Use Cell class in tabby
Browse files (browse the repository at this point in the history)
  • Loading branch information
NastyBoget committed Dec 5, 2024
1 parent 829a86d commit 1511bac
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 14 deletions.
7 changes: 5 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ def copy_from(cell: "Cell",
y_bottom_right=y_bottom_right,
id_con=cell.id_con,
lines=cell.lines,
colspan=cell.colspan,
rowspan=cell.rowspan,
invisible=cell.invisible,
is_attribute=cell.is_attribute,
is_attribute_required=cell.is_attribute_required,
rotated_angle=cell.rotated_angle,
Expand All @@ -44,15 +47,15 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)

def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None,
contour_coord: Optional[BBox] = None) -> None:
contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:

import uuid

assert x_top_left <= x_bottom_right
assert y_top_left <= y_bottom_right

self.lines = [] if lines is None else lines
super().__init__(lines)
super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)

self.x_top_left = x_top_left
self.x_bottom_right = x_bottom_right
Expand Down
20 changes: 12 additions & 8 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page:

def __get_tables(self, page: dict) -> List[ScanTable]:
import uuid
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.line_metadata import LineMetadata

Expand Down Expand Up @@ -188,15 +188,19 @@ def __get_tables(self, page: dict) -> List[ScanTable]:
cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"]))
annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height))
"""
TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable"
https://jira.intra.ispras.ru/browse/TLDR-851
TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable"
https://jira.intra.ispras.ru/browse/TLDR-851
"""

result_row.append(CellWithMeta(
current_cell_properties = cell_properties[num_row][num_col]
result_row.append(Cell(
lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)],
colspan=cell_properties[num_row][num_col]["col_span"],
rowspan=cell_properties[num_row][num_col]["row_span"],
invisible=bool(cell_properties[num_row][num_col]["invisible"])
colspan=current_cell_properties["col_span"],
rowspan=current_cell_properties["row_span"],
invisible=bool(current_cell_properties["invisible"]),
x_top_left=int(current_cell_properties["x_top_left"]),
x_bottom_right=int(current_cell_properties["x_top_left"]) + int(current_cell_properties["width"]),
y_top_left=int(current_cell_properties["y_top_left"]),
y_bottom_right=int(current_cell_properties["y_top_left"]) + int(current_cell_properties["height"])
))
cells.append(result_row)

Expand Down
6 changes: 2 additions & 4 deletions tests/api_tests/test_api_misc_multipage_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,10 @@ def test_api_ml_table_recognition_synthetic_data_1(self) -> None:

def test_api_ml_table_recognition_synthetic_data_3(self) -> None:
file_name = "example_mp_table_with_repeate_header_2.pdf"
for pdf_param in ["false", "true"]:
# "tabby" does not work here: the table output must first be unified into matrix form and the attribute cells set;
# without this, the tables won't be merged.
for pdf_param in ["false", "true", "tabby"]:
tables = self._get_tables(file_name, pdf_with_text_layer=pdf_param)

self.assertEqual(len(tables), 1)
self.assertEqual(len(tables), 1, f"Error when pdf_with_text_layer={pdf_param}")
table = tables[0]["cells"]

self.assertListEqual(
Expand Down

0 comments on commit 1511bac

Please sign in to comment.