diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index 0d42dc37..8665eeaa 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -25,6 +25,9 @@ def copy_from(cell: "Cell", y_bottom_right=y_bottom_right, id_con=cell.id_con, lines=cell.lines, + colspan=cell.colspan, + rowspan=cell.rowspan, + invisible=cell.invisible, is_attribute=cell.is_attribute, is_attribute_required=cell.is_attribute_required, rotated_angle=cell.rotated_angle, @@ -44,7 +47,7 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None, - contour_coord: Optional[BBox] = None) -> None: + contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: import uuid @@ -52,7 +55,7 @@ def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bott assert y_top_left <= y_bottom_right self.lines = [] if lines is None else lines - super().__init__(lines) + super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible) self.x_top_left = x_top_left self.x_bottom_right = x_bottom_right diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 1d0d594d..9e258b5e 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -159,7 +159,7 @@ def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: def __get_tables(self, page: dict) -> List[ScanTable]: import uuid - from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.line_metadata import LineMetadata @@ -188,15 +188,19 @@ def __get_tables(self, page: dict) -> List[ScanTable]: cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"])) annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height)) """ - TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable" - https://jira.intra.ispras.ru/browse/TLDR-851 + TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable" + https://jira.intra.ispras.ru/browse/TLDR-851 """ - - result_row.append(CellWithMeta( + current_cell_properties = cell_properties[num_row][num_col] + result_row.append(Cell( lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)], - colspan=cell_properties[num_row][num_col]["col_span"], - rowspan=cell_properties[num_row][num_col]["row_span"], - invisible=bool(cell_properties[num_row][num_col]["invisible"]) + colspan=current_cell_properties["col_span"], + rowspan=current_cell_properties["row_span"], + invisible=bool(current_cell_properties["invisible"]), + x_top_left=int(current_cell_properties["x_top_left"]), + x_bottom_right=int(current_cell_properties["x_top_left"]) + int(current_cell_properties["width"]), + y_top_left=int(current_cell_properties["y_top_left"]), + y_bottom_right=int(current_cell_properties["y_top_left"]) + int(current_cell_properties["height"]) )) cells.append(result_row) diff --git a/tests/api_tests/test_api_misc_multipage_table.py b/tests/api_tests/test_api_misc_multipage_table.py index 5c3c0d2e..c7431247 100644 --- a/tests/api_tests/test_api_misc_multipage_table.py +++ b/tests/api_tests/test_api_misc_multipage_table.py @@ -47,12 +47,10 @@ def test_api_ml_table_recognition_synthetic_data_1(self) -> None: def test_api_ml_table_recognition_synthetic_data_3(self) -> None: file_name = "example_mp_table_with_repeate_header_2.pdf" - for pdf_param in ["false", "true"]: - # for "tabby" doesn't work because need to unify the output of table in matrix form and set attribute cells, - # without this tables won't be merge. + for pdf_param in ["false", "true", "tabby"]: tables = self._get_tables(file_name, pdf_with_text_layer=pdf_param) - self.assertEqual(len(tables), 1) + self.assertEqual(len(tables), 1, f"Error when pdf_with_text_layer={pdf_param}") table = tables[0]["cells"] self.assertListEqual(