TLDR-471 added angle rotation from PdfImageReader and Tables

ispras · Sep 22, 2023 · 459c065 · 459c065
1 parent b5253b2
commit 459c065
Show file tree

Hide file tree

Showing 10 changed files with 84 additions and 50 deletions.
diff --git a/dedoc/data_structures/bbox.py b/dedoc/data_structures/bbox.py
@@ -1,3 +1,4 @@
+import math
 from collections import OrderedDict
 from typing import Dict, Tuple
 
@@ -51,6 +52,24 @@ def y_bottom_right(self) -> int:
     def crop_image_by_box(image: np.ndarray, bbox: "BBox") -> np.ndarray:
         return image[bbox.y_top_left:bbox.y_bottom_right, bbox.x_top_left:bbox.x_bottom_right]
 
+    @staticmethod
+    def rotate_coordinates(bbox: "BBox", angle_rotate: float, image_shape: Tuple[int]) -> "BBox":
+        xb, yb = bbox.x_top_left, bbox.y_top_left
+        # Rotate the top-left and bottom-right corners of bbox by angle_rotate (degrees) and build a new BBox from them.
+        xe, ye = bbox.x_bottom_right, bbox.y_bottom_right  # TODO check: was x_begin + width, x_begin + height
+        rad = angle_rotate * math.pi / 180
+        # The rotation centre is the middle of the image: index 1 of image_shape is used for x, index 0 for y.
+        xc = image_shape[1] // 2
+        yc = image_shape[0] // 2
+        # Each rotated offset is truncated by int() and the result is capped from above by the image size; there is no lower cap.
+        bbox_xb = min((int(float(xb - xc) * math.cos(rad) - float(yb - yc) * math.sin(rad)) + xc), image_shape[1])
+        bbox_yb = min((int(float(yb - yc) * math.cos(rad) + float(xb - xc) * math.sin(rad)) + yc), image_shape[0])
+        bbox_xe = min((int(float(xe - xc) * math.cos(rad) - float(ye - yc) * math.sin(rad)) + xc), image_shape[1])
+        bbox_ye = min((int(float(ye - yc) * math.cos(rad) + float(xe - xc) * math.sin(rad)) + yc), image_shape[0])
+        bbox_new = BBox(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb)
+
+        return bbox_new
+
     def __str__(self) -> str:
         return f"BBox(x = {self.x_top_left} y = {self.y_top_left}, w = {self.width}, h = {self.height})"
 

diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py
@@ -11,7 +11,7 @@ class TableMetadata(Serializable):
     """
     This class holds the information about the table location in the document and information about cell properties.
     """
-    def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False) -> None:
+    def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False, rotated_angle: float = 0.0) -> None:
         """
         :param page_id: number of the page where table starts
         :param uid: unique identifier of the table
@@ -21,18 +21,21 @@ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserte
         self.page_id = page_id
         self.uid = str(uuid.uuid1()) if not uid else uid
         self.is_inserted = is_inserted
+        self.rotated_angle = rotated_angle
 
     def to_dict(self) -> dict:
         res = OrderedDict()
         res["uid"] = self.uid
         res["page_id"] = self.page_id
         res["is_inserted"] = self.is_inserted
+        res["rotated_angle"] = self.rotated_angle
         return res
 
     @staticmethod
     def get_api_dict(api: Api) -> Model:
         return api.model("TableMetadata", {
             "page_id": fields.Integer(readonly=False, description="table start page number"),
             "uid": fields.String(description="table unique id"),
-            "is_inserted": fields.Boolean(description="was the table inserted into document body")
+            "is_inserted": fields.Boolean(description="was the table inserted into document body"),
+            "rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes?")
         })
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py
@@ -1,37 +1,24 @@
-import math
 from collections import OrderedDict
 from functools import total_ordering
-from typing import Any, Dict, Tuple
+from typing import Any, Dict
 
 from dedoc.data_structures.bbox import BBox
 
 
 @total_ordering
 class Location:
-    def __init__(self, page_number: int, bbox: BBox, name: str = "") -> None:
+    def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0) -> None:
         self.page_number = page_number
         self.bbox = bbox
         self.name = name
-
-    def rotate_coordinates(self, angle_rotate: float, image_shape: Tuple[int]) -> None:
-        xb, yb = self.bbox.x_top_left, self.bbox.y_top_left
-        # TODO check!!! was xe, ye = self.bbox.x_begin + self.bbox.width, self.bbox.x_begin + self.bbox.height
-        xe, ye = self.bbox.x_bottom_right, self.bbox.y_bottom_right  # self.bbox.x_top_left + self.bbox.height
-        rad = angle_rotate * math.pi / 180
-
-        bbox_xb = min((int(float(xb) * math.cos(rad) - float(yb) * math.sin(rad))), image_shape[1])
-        bbox_yb = min((int(float(yb) * math.cos(rad) + float(xb) * math.sin(rad))), image_shape[0])
-        bbox_xe = min((int(float(xe) * math.cos(rad) - float(ye) * math.sin(rad))), image_shape[1])
-        bbox_ye = min((int(float(ye) * math.cos(rad) + float(xe) * math.sin(rad))), image_shape[0])
-        bbox_new = BBox(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb)
-
-        self.bbox = bbox_new
+        self.rotated_angle = rotated_angle
 
     def to_dict(self) -> Dict[str, Any]:
         res = OrderedDict()
         res["page_number"] = self.page_number
         res["bbox"] = self.bbox.to_dict()  # [x_begin, y_begin, width, height]
         res["name"] = self.name
+        res["rotated_angle"] = self.rotated_angle
         return res
 
     def __eq__(self, other: "Location") -> bool:

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -90,7 +90,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse)
         tables = []
         for scan_table in scan_tables:
-            metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
+            metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle)
             cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row]
                                for row in scan_table.matrix_cells]
 
@@ -133,14 +133,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple
                 metadata["last_page"] = last_page
         else:
             warnings = []
-            metadata = None
+            metadata = {}
 
         if len(result) == 0:
-            all_lines, unref_tables, attachments = [], [], []
+            all_lines, unref_tables, attachments, page_angles = [], [], [], []
         else:
-            all_lines, unref_tables, attachments = map(list, map(flatten, zip(*result)))
+            all_lines, unref_tables, attachments, page_angles = map(list, map(flatten, zip(*result)))
         if parameters.need_header_footers_analysis:
-            lines = [lines for lines, _, _ in result]
+            lines = [lines for lines, _, _, _ in result]
             lines, headers, footers = footer_header_analysis(lines)
             all_lines = list(flatten(lines))
         mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
@@ -152,11 +152,13 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple
             prev_line = line
 
         all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
+        if page_angles:
+            metadata["rotated_page_angles"] = page_angles
         return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata
 
     @abstractmethod
     def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \
-            -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
+            -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
         pass
 
     def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]:

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -70,9 +70,11 @@ def _process_one_page(self,
                           image: np.ndarray,
                           parameters: ParametersForParseDoc,
                           page_number: int,
-                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
+                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
         #  --- Step 1: correct orientation and detect column count ---
-        rotated_image, is_one_column_document = self._detect_column_count_and_orientation(image, parameters)
+        rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
+        if self.config.get("debug_mode"):
+            self.logger.info(f"Angle page rotation = {angle}")
 
         #  --- Step 2: do binarization ---
         if parameters.need_binarization:
@@ -100,9 +102,9 @@ def _process_one_page(self,
         if self.config.get("labeling_mode"):
             save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))
 
-        return lines, tables, page.attachments
+        return lines, tables, page.attachments, [angle]
 
-    def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool]:
+    def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, int]:
         """
         Function :
             - detects the number of page columns
@@ -120,10 +122,10 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa
         angle = angle if parameters.document_orientation is None else 0
         self.logger.info(f"Final orientation angle = {angle}, is_one_column_document = {is_one_column_document}")
 
-        rotated_image, _ = self.scan_rotator.auto_rotate(image, angle)
+        rotated_image, result_angle = self.scan_rotator.auto_rotate(image, angle)
         if self.config.get("debug_mode"):
             img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
             self.logger.info(f"Save image to {img_path}")
             cv2.imwrite(img_path, rotated_image)
 
-        return rotated_image, is_one_column_document
+        return rotated_image, is_one_column_document, result_angle
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py
@@ -144,7 +144,7 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
         union_cell[col_id].y_bottom_right = y_bottom_split
 
         cell_image = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right])
-        result_row[col_id].lines = __get_ocr_lines(cell_image, language)
+        result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image)
 
         col_id -= 1
 

diff --git a/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -5,14 +5,16 @@
 
 import numpy as np
 
+from dedoc.data_structures import BBox
 from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
 from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor
-from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import \
+    TableAttributeExtractor
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours
 
 
@@ -33,7 +35,7 @@ def extract_onepage_tables_from_image(self,
                                           page_number: int,
                                           language: str,
                                           orient_analysis_cells: bool,
-                                          orient_cell_angle: int,
+                                          orient_cell_angle: int,  # TODO remove
                                           table_type: str) -> List[ScanTable]:
         """
         extracts tables from input image
@@ -59,17 +61,14 @@ def extract_onepage_tables_from_image(self,
 
         for matrix in tables:
             for location in matrix.locations:
-                location.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape)
+                location.bbox = BBox.rotate_coordinates(bbox=location.bbox, angle_rotate=-angle_rotate, image_shape=image.shape)
+                location.rotated_angle = angle_rotate
 
         tables = self.__select_attributes_matrix_tables(tables=tables)
 
-        """
-        TODO: fix in the future
-        if orient_analysis_cells:
-            tables = self.__analyze_header_cell_with_diff_orient(tables, language, orient_cell_angle)"""
         return tables
 
-    """ TODO fix in the future
+    """ TODO fix in the future (REMOVE)
     def __detect_diff_orient(self, cell_text: str) -> bool:
         # 1 - разбиваем на строки длины которых состоят хотя бы из одного символа
         parts = cell_text.split("\n")
@@ -165,7 +164,8 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type:
                     cur_table.matrix_cells = self.splitter.split(cells=cur_table.matrix_cells)
 
                     # Эвристика 2: таблица должна иметь больше одного столбца
-                    if len(cur_table.matrix_cells[0]) > 1 or (self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
+                    if len(cur_table.matrix_cells[0]) > 1 or (
+                            self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
                         tables.append(cur_table)
 
                     if self.table_options.split_last_column in table_type:

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -129,7 +129,7 @@ def __extract(self, path: str, start_page: int = None, end_page: int = None) ->
 
         return all_lines, all_tables, all_tables_on_images
 
-    def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
+    def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[Table], List[ScanTable]]:
         tables = []
         tables_on_image = []
         page_number = page["number"]
@@ -272,6 +272,6 @@ def _process_one_page(self,
                           image: np.ndarray,
                           parameters: ParametersForParseDoc,
                           page_number: int,
-                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
+                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
 
-        return [], [], []
+        return [], [], [], []
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -43,7 +43,7 @@ def _process_one_page(self,
                           image: np.ndarray,
                           parameters: ParametersForParseDoc,
                           page_number: int,
-                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
+                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
         if parameters.need_pdf_table_analysis:
             gray_image = self._convert_to_gray(image)
             cleaned_image, tables = self.table_recognizer.recognize_tables_from_image(
@@ -59,15 +59,15 @@ def _process_one_page(self,
 
         page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number)
         if page is None:
-            return [], [], []
+            return [], [], [], []
         unreadable_blocks = [location.bbox for table in tables for location in table.locations]
         page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
         lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
 
         if self.config.get("labeling_mode"):
             save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))
 
-        return lines, tables, page.attachments
+        return lines, tables, page.attachments, []
 
     def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool:
         """

diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py
@@ -8,6 +8,7 @@
 import numpy as np
 
 from dedoc.api.dedoc_api import config
+from dedoc.utils.image_utils import rotate_image
 from dedoc.utils.pdf_utils import get_page_image
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
 
@@ -104,14 +105,27 @@ def normalize_font_thickness(image: np.ndarray) -> Tuple[float, int]:
         return font_scale, thickness
 
     @staticmethod
-    def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType]) -> np.ndarray:
+    def rotate_coordinate(x: int, y: int, xc: float, yc: float, angle: float) -> Tuple[int, int]:
+        rad = angle * math.pi / 180  # angle is given in degrees
+        x_rotated = int(float(x - xc) * math.cos(rad) - float(y - yc) * math.sin(rad) + xc)
+        y_rotated = int(float(y - yc) * math.cos(rad) + float(x - xc) * math.sin(rad) + yc)
+        # (x, y) rotated about the centre (xc, yc); both results are truncated to int
+        return x_rotated, y_rotated
+
+    @staticmethod
+    def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType], angle: float) -> np.ndarray:
 
         font_scale, thickness = TestWordExtraction.normalize_font_thickness(image)
 
         for ann in word_annotations:
             bbox = json.loads(ann.bbox)
             p1 = (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"]))
             p2 = (int((bbox["x_top_left"] + bbox["width"]) * bbox["page_width"]), int((bbox["y_top_left"] + bbox["height"]) * bbox["page_height"]))
+            x_c = image.shape[1] / 2
+            y_c = image.shape[0] / 2
+            p1 = TestWordExtraction.rotate_coordinate(p1[0], p1[1], x_c, y_c, angle)
+            p2 = TestWordExtraction.rotate_coordinate(p2[0], p2[1], x_c, y_c, angle)
+
             cv2.rectangle(image, p1, p2, (0, 255, 0) if ann.text_type == "typewritten" else (255, 0, 0))
             text = ",".join(ann.confs) if ann.confs != [] else "None"
             cv2.putText(image, text, (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])),
@@ -148,12 +162,19 @@ def test_tabby_document(self):
     def test_table_word_extraction(self):
         output_path = os.path.join(self.output_path)
         os.makedirs(output_path, exist_ok=True)
-        file_names = ["tables/example_with_table3.png", "tables/example_with_table4.jpg", "tables/example_with_table5.png", "tables/example_with_table6.png"]
+        file_names = ["tables/example_with_table5.png", "tables/example_with_table3.png", "tables/example_with_table4.jpg",
+                      "tables/example_with_table6.png", "tables/example_with_table_horizontal_union.jpg"]
         for file_name in file_names:
             result = self._send_request(file_name, data=dict())
             table0 = result["content"]["tables"][0]
+            page_angle = result["metadata"]["other_fields"]["rotated_page_angles"][0]
+            table_angle = table0["metadata"]["rotated_angle"]
+
             word_annotations = TestWordExtraction.get_words_annotation_from_cell(table0)
             image = cv2.imread(self._get_abs_path(file_name))
-            image = TestWordExtraction.draw_word_annotations(image, word_annotations)
+            image = rotate_image(image, page_angle)
+            # table_angle is not applied to the image; it is passed to draw_word_annotations, which rotates the box corners
+
+            image = TestWordExtraction.draw_word_annotations(image, word_annotations, angle=table_angle)
             cv2.imwrite(os.path.join(output_path, file_name), image)