ispras · NastyBoget · Oct 12, 2023 · Oct 12, 2023 · Oct 12, 2023 · Oct 12, 2023
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -131,6 +131,8 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
         tables = []
         tables_on_image = []
         page_number = page["number"]
+        page_width = int(page["width"])
+        page_height = int(page["height"])
         for table in page["tables"]:
             x_top_left = table["x_top_left"]
             y_top_left = table["y_top_left"]
@@ -145,13 +147,32 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
             for num_row, row in enumerate(rows):
                 assert len(row) == len(cell_properties[num_row])
                 result_row = []
-                for num_col, cell_text in enumerate(row):
-                    result_row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=page_number, line_id=0))],
-                                                   colspan=cell_properties[num_row][num_col]["col_span"],
-                                                   rowspan=cell_properties[num_row][num_col]["row_span"],
-                                                   invisible=bool(cell_properties[num_row][num_col]["invisible"])))
-
+                for num_col, cell in enumerate(row):
+                    # Block coordinates get their own names: the table-level x_top_left/y_top_left/
+                    # x_bottom_right/y_bottom_right are still needed for table_bbox after the loops.
+                    annotations = []
+                    for block in cell["cell_blocks"]:
+                        block_x_left = int(block["x_top_left"])
+                        block_y_top = int(block["y_top_left"])
+                        block_x_right = block_x_left + int(block["width"])
+                        block_y_bottom = block_y_top + int(block["height"])
+                        block_bbox = BBox.from_two_points((block_x_left, block_y_top), (block_x_right, block_y_bottom))
+                        annotations.append(
+                            BBoxAnnotation(block["start"], block["end"], block_bbox, page_width=page_width, page_height=page_height)
+                        )
+
+                    cell_line = LineWithMeta(
+                        line=cell["text"],
+                        metadata=LineMetadata(page_id=page_number, line_id=0),
+                        annotations=annotations,
+                    )
+                    properties = cell_properties[num_row][num_col]
+                    result_row.append(CellWithMeta(lines=[cell_line],
+                                                   colspan=properties["col_span"],
+                                                   rowspan=properties["row_span"],
+                                                   invisible=bool(properties["invisible"])))
                 result_cells.append(result_row)
+
             table_bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))  # noqa TODO add table location into TableMetadata
             table_name = str(uuid.uuid4())
             tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, uid=table_name)))

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar
diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py
@@ -178,6 +178,22 @@ def test_table_word_extraction(self):
 
             cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image)
 
+        file_name = "pdf_with_text_layer/english_doc.pdf"
+        result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby"))
+        structure = result["content"]["structure"]
+        image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
+        word_annotations = self.__get_words_annotation(structure)
+        ann = word_annotations[0]
+        if ann is not None:
+            bbox = json.loads(ann.bbox)
+            image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)
+
+        image = self.__draw_word_annotations(image, word_annotations)
+        table0 = result["content"]["tables"][0]
+        word_annotations = self.__get_words_annotation_from_cell(table0)
+        image = self.__draw_word_annotations(image, word_annotations, angle=0)
+        cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)
+
     def test_document_image_reader(self) -> None:
         filename_to_parameters = {
             "scanned/scan_orient_1.jpg": {},

diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py
@@ -180,9 +180,9 @@ def test_pdf_with_tables(self) -> None:
 
         table = tables[3]["cells"]
         self.assertListEqual(["", "2016", "2017", "2018", "2019"], self._get_text_of_row(table[0]))
-        self.assertListEqual(["", "Прогноз", "Прогноз бюджета", "Прогноз бюджета", "Прогноз бюджета"], self._get_text_of_row(table[1]))
-        self.assertListEqual(["Ненефтегазов\nые доходы", "10,4", "9,6", "9,6", "9,6"], self._get_text_of_row(table[19]))
-        self.assertListEqual(["Сальдо\nбюджета", "-3,7", "-3,2", "-2,2", "-1,2"], self._get_text_of_row(table[20]))
+        self.assertListEqual(["", "Прогноз", "Прогноз бюджета"], self._get_text_of_row(table[1]))
+        self.assertListEqual(["Ненефтегазов\nые доходы", "10,4", "9,6", "9,6", "9,6"], self._get_text_of_row(table[21]))
+        self.assertListEqual(["Сальдо\nбюджета", "-3,7", "-3,2", "-2,2", "-1,2"], self._get_text_of_row(table[22]))
 
         tree = content["structure"]
         self._check_tree_sanity(tree)
@@ -225,7 +225,7 @@ def test_tables_with_merged_cells(self) -> None:
         result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby"))
         table = result["content"]["tables"][0]["cells"]
 
-        hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 5), 5]]
+        hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 1), 5]]
 
         for (i, j), k in hidden_cells_big_table_with_colspan:
             self.assertFalse(table[i][j]["invisible"])