diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index fe1cae98..e918c4fa 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -131,11 +131,11 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]: tables = [] tables_on_image = [] page_number = page["number"] + page_width = int(page["width"]) + page_height = int(page["height"]) + for table in page["tables"]: - x_top_left = table["x_top_left"] - y_top_left = table["y_top_left"] - x_bottom_right = x_top_left + table["width"] - y_bottom_right = y_top_left + table["height"] + table_bbox = BBox(x_top_left=table["x_top_left"], y_top_left=table["y_top_left"], width=table["width"], height=table["height"]) order = table["order"] # TODO add table order into TableMetadata rows = table["rows"] cell_properties = table["cell_properties"] @@ -144,15 +144,24 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]: result_cells = [] for num_row, row in enumerate(rows): assert len(row) == len(cell_properties[num_row]) - result_row = [] - for num_col, cell_text in enumerate(row): - result_row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=page_number, line_id=0))], - colspan=cell_properties[num_row][num_col]["col_span"], - rowspan=cell_properties[num_row][num_col]["row_span"], - invisible=bool(cell_properties[num_row][num_col]["invisible"]))) + result_row = [] + for num_col, cell in enumerate(row): + annotations = [] + cell_blocks = cell["cell_blocks"] + + for c in cell_blocks: + cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"])) + annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height)) + + result_row.append(CellWithMeta( + lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)], + colspan=cell_properties[num_row][num_col]["col_span"], + rowspan=cell_properties[num_row][num_col]["row_span"], + invisible=bool(cell_properties[num_row][num_col]["invisible"]) + )) result_cells.append(result_row) - table_bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) # noqa TODO add table location into TableMetadata + table_name = str(uuid.uuid4()) tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, uid=table_name))) tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order)) @@ -161,55 +170,38 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]: def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]: lines = [] - page_number = page["number"] - page_width = int(page["width"]) - page_height = int(page["height"]) + page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"]) prev_line = None for block in page["blocks"]: annotations = [] order = block["order"] block_text = block["text"] - bx_top_left = int(block["x_top_left"]) - by_top_left = int(block["y_top_left"]) - bx_bottom_right = bx_top_left + int(block["width"]) - by_bottom_right = by_top_left + int(block["height"]) - indent = block["indent"] - spacing = block["spacing"] len_block = len(block_text) - annotations.append(IndentationAnnotation(0, len_block, str(indent))) - annotations.append(SpacingAnnotation(0, len_block, str(spacing))) + annotations.append(IndentationAnnotation(0, len_block, str(block["indent"]))) + annotations.append(SpacingAnnotation(0, len_block, str(block["spacing"]))) for annotation in block["annotations"]: - is_bold = annotation["is_bold"] - is_italic = annotation["is_italic"] - font_name = annotation["font_name"] - font_size = annotation["font_size"] - link = annotation["metadata"] - url = annotation["url"] start = annotation["start"] end = annotation["end"] - x_top_left = int(annotation["x_top_left"]) - y_top_left = int(annotation["y_top_left"]) - x_bottom_right = bx_top_left + int(annotation["width"]) - y_bottom_right = by_top_left + int(annotation["height"]) - box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) + box = BBox(x_top_left=int(annotation["x_top_left"]), y_top_left=int(annotation["y_top_left"]), + width=int(annotation["width"]), height=int(annotation["height"])) annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height)) - annotations.append(SizeAnnotation(start, end, str(font_size))) - annotations.append(StyleAnnotation(start, end, font_name)) + annotations.append(SizeAnnotation(start, end, str(annotation["font_size"]))) + annotations.append(StyleAnnotation(start, end, annotation["font_name"])) - if is_bold: + if annotation["is_bold"]: annotations.append(BoldAnnotation(start, end, "True")) - if is_italic: + if annotation["is_italic"]: annotations.append(ItalicAnnotation(start, end, "True")) - if link == "LINK": - annotations.append(LinkedTextAnnotation(start, end, url)) + if annotation["metadata"] == "LINK": + annotations.append(LinkedTextAnnotation(start, end, annotation["url"])) meta = block["metadata"].lower() uid = f"txt_{file_hash}_{order}" - bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right)) + bbox = BBox(x_top_left=int(block["x_top_left"]), y_top_left=int(block["y_top_left"]), width=int(block["width"]), height=int(block["height"])) annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height)) metadata = LineMetadata(page_id=page_number, line_id=order) diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar index b3d5eae8..411ded10 100644 Binary files a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar differ diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py index 19e917cd..61385b4f 100644 --- a/dedoc/scripts/test_words_bbox_extraction.py +++ b/dedoc/scripts/test_words_bbox_extraction.py @@ -167,7 +167,6 @@ def test_table_word_extraction(self): for file_name in file_names: result = self._send_request(file_name, data=dict()) - table0 = result["content"]["tables"][0] page_angle = result["metadata"]["other_fields"]["rotated_page_angles"][0] image = cv2.imread(self._get_abs_path(file_name)) diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index 5f9ce3ab..1d194988 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -180,9 +180,9 @@ def test_pdf_with_tables(self) -> None: table = tables[3]["cells"] self.assertListEqual(["", "2016", "2017", "2018", "2019"], self._get_text_of_row(table[0])) - self.assertListEqual(["", "Прогноз", "Прогноз бюджета", "Прогноз бюджета", "Прогноз бюджета"], self._get_text_of_row(table[1])) - self.assertListEqual(["Ненефтегазов\nые доходы", "10,4", "9,6", "9,6", "9,6"], self._get_text_of_row(table[19])) - self.assertListEqual(["Сальдо\nбюджета", "-3,7", "-3,2", "-2,2", "-1,2"], self._get_text_of_row(table[20])) + self.assertListEqual(["", "Прогноз", "Прогноз бюджета"], self._get_text_of_row(table[1])) + self.assertListEqual(["Ненефтегазов\nые доходы", "10,4", "9,6", "9,6", "9,6"], self._get_text_of_row(table[21])) + self.assertListEqual(["Сальдо\nбюджета", "-3,7", "-3,2", "-2,2", "-1,2"], self._get_text_of_row(table[22])) tree = content["structure"] self._check_tree_sanity(tree) @@ -225,7 +225,7 @@ def test_tables_with_merged_cells(self) -> None: result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby")) table = result["content"]["tables"][0]["cells"] - hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 5), 5]] + hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 1), 5]] for (i, j), k in hidden_cells_big_table_with_colspan: self.assertFalse(table[i][j]["invisible"])