diff --git a/dedoc/data_structures/bbox.py b/dedoc/data_structures/bbox.py index 2c45eddf..69122ab6 100644 --- a/dedoc/data_structures/bbox.py +++ b/dedoc/data_structures/bbox.py @@ -1,3 +1,4 @@ +import math from collections import OrderedDict from typing import Dict, Tuple @@ -51,6 +52,24 @@ def y_bottom_right(self) -> int: def crop_image_by_box(image: np.ndarray, bbox: "BBox") -> np.ndarray: return image[bbox.y_top_left:bbox.y_bottom_right, bbox.x_top_left:bbox.x_bottom_right] + @staticmethod + def rotate_coordinates(bbox: "BBox", angle_rotate: float, image_shape: Tuple[int]) -> "BBox": + xb, yb = bbox.x_top_left, bbox.y_top_left + # TODO check: the old code derived ye from x_begin and height (not y_begin); confirm y_bottom_right is the intended value + xe, ye = bbox.x_bottom_right, bbox.y_bottom_right + rad = angle_rotate * math.pi / 180 + + xc = image_shape[1] // 2 + yc = image_shape[0] // 2 + + bbox_xb = min((int(float(xb - xc) * math.cos(rad) - float(yb - yc) * math.sin(rad)) + xc), image_shape[1]) + bbox_yb = min((int(float(yb - yc) * math.cos(rad) + float(xb - xc) * math.sin(rad)) + yc), image_shape[0]) + bbox_xe = min((int(float(xe - xc) * math.cos(rad) - float(ye - yc) * math.sin(rad)) + xc), image_shape[1]) + bbox_ye = min((int(float(ye - yc) * math.cos(rad) + float(xe - xc) * math.sin(rad)) + yc), image_shape[0]) + bbox_new = BBox(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb) + + return bbox_new + def __str__(self) -> str: return f"BBox(x = {self.x_top_left} y = {self.y_top_left}, w = {self.width}, h = {self.height})" diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index 8e7a25e3..d36a6261 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -11,7 +11,7 @@ class TableMetadata(Serializable): """ This class holds the information about the table location in the document and information about cell properties.
""" - def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False) -> None: + def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False, rotated_angle: float = 0.0) -> None: """ :param page_id: number of the page where table starts :param uid: unique identifier of the table @@ -21,12 +21,14 @@ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserte self.page_id = page_id self.uid = str(uuid.uuid1()) if not uid else uid self.is_inserted = is_inserted + self.rotated_angle = rotated_angle def to_dict(self) -> dict: res = OrderedDict() res["uid"] = self.uid res["page_id"] = self.page_id res["is_inserted"] = self.is_inserted + res["rotated_angle"] = self.rotated_angle return res @staticmethod @@ -34,5 +36,6 @@ def get_api_dict(api: Api) -> Model: return api.model("TableMetadata", { "page_id": fields.Integer(readonly=False, description="table start page number"), "uid": fields.String(description="table unique id"), - "is_inserted": fields.Boolean(description="was the table inserted into document body") + "is_inserted": fields.Boolean(description="was the table inserted into document body"), + "rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes?") }) diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py index 86ed6d26..4a2e4d58 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/location.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/location.py @@ -1,37 +1,24 @@ -import math from collections import OrderedDict from functools import total_ordering -from typing import Any, Dict, Tuple +from typing import Any, Dict from dedoc.data_structures.bbox import BBox @total_ordering class Location: - def __init__(self, page_number: int, bbox: BBox, name: str = "") -> None: + def __init__(self, page_number: int, bbox: BBox, name: str = "", 
rotated_angle: float = 0.0) -> None: self.page_number = page_number self.bbox = bbox self.name = name - - def rotate_coordinates(self, angle_rotate: float, image_shape: Tuple[int]) -> None: - xb, yb = self.bbox.x_top_left, self.bbox.y_top_left - # TODO check!!! was xe, ye = self.bbox.x_begin + self.bbox.width, self.bbox.x_begin + self.bbox.height - xe, ye = self.bbox.x_bottom_right, self.bbox.y_bottom_right # self.bbox.x_top_left + self.bbox.height - rad = angle_rotate * math.pi / 180 - - bbox_xb = min((int(float(xb) * math.cos(rad) - float(yb) * math.sin(rad))), image_shape[1]) - bbox_yb = min((int(float(yb) * math.cos(rad) + float(xb) * math.sin(rad))), image_shape[0]) - bbox_xe = min((int(float(xe) * math.cos(rad) - float(ye) * math.sin(rad))), image_shape[1]) - bbox_ye = min((int(float(ye) * math.cos(rad) + float(xe) * math.sin(rad))), image_shape[0]) - bbox_new = BBox(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb) - - self.bbox = bbox_new + self.rotated_angle = rotated_angle def to_dict(self) -> Dict[str, Any]: res = OrderedDict() res["page_number"] = self.page_number res["bbox"] = self.bbox.to_dict() # [x_begin, y_begin, width, height] res["name"] = self.name + res["rotated_angle"] = self.rotated_angle return res def __eq__(self, other: "Location") -> bool: diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 0cbfb808..4a541f7a 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -90,7 +90,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse) tables = [] for scan_table in scan_tables: - metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name) + metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle) cells_with_meta = 
[[CellWithMeta.create_from_cell(cell) for cell in row] for row in scan_table.matrix_cells] @@ -133,14 +133,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple metadata["last_page"] = last_page else: warnings = [] - metadata = None + metadata = {} if len(result) == 0: - all_lines, unref_tables, attachments = [], [], [] + all_lines, unref_tables, attachments, page_angles = [], [], [], [] else: - all_lines, unref_tables, attachments = map(list, map(flatten, zip(*result))) + all_lines, unref_tables, attachments, page_angles = map(list, map(flatten, zip(*result))) if parameters.need_header_footers_analysis: - lines = [lines for lines, _, _ in result] + lines = [lines for lines, _, _, _ in result] lines, headers, footers = footer_header_analysis(lines) all_lines = list(flatten(lines)) mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines) @@ -152,11 +152,13 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple prev_line = line all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links) + if page_angles: + metadata["rotated_page_angles"] = page_angles return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata @abstractmethod def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \ - -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]: + -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]: pass def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index 0cab0781..bf8099cd 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -70,9 +70,11 @@ def 
_process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, - path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]: + path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]: # --- Step 1: correct orientation and detect column count --- - rotated_image, is_one_column_document = self._detect_column_count_and_orientation(image, parameters) + rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters) + if self.config.get("debug_mode"): + self.logger.info(f"Angle page rotation = {angle}") # --- Step 2: do binarization --- if parameters.need_binarization: @@ -100,9 +102,9 @@ def _process_one_page(self, if self.config.get("labeling_mode"): save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path)) - return lines, tables, page.attachments + return lines, tables, page.attachments, [angle] - def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool]: + def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, int]: """ Function : - detects the number of page columns @@ -120,10 +122,10 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa angle = angle if parameters.document_orientation is None else 0 self.logger.info(f"Final orientation angle = {angle}, is_one_column_document = {is_one_column_document}") - rotated_image, _ = self.scan_rotator.auto_rotate(image, angle) + rotated_image, result_angle = self.scan_rotator.auto_rotate(image, angle) if self.config.get("debug_mode"): img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg") self.logger.info(f"Save image to {img_path}") cv2.imwrite(img_path, rotated_image) - return rotated_image, is_one_column_document + 
return rotated_image, is_one_column_document, result_angle diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index 1a7ff1cf..b88d9205 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -144,7 +144,7 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image union_cell[col_id].y_bottom_right = y_bottom_split cell_image = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right]) - result_row[col_id].lines = __get_ocr_lines(cell_image, language) + result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image) col_id -= 1 diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index 102854a1..47c8c0e5 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -5,6 +5,7 @@ import numpy as np +from dedoc.data_structures import BBox from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree @@ -12,7 +13,8 @@ from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column from 
dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import \ + TableAttributeExtractor from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours @@ -33,7 +35,7 @@ def extract_onepage_tables_from_image(self, page_number: int, language: str, orient_analysis_cells: bool, - orient_cell_angle: int, + orient_cell_angle: int, # TODO remove table_type: str) -> List[ScanTable]: """ extracts tables from input image @@ -59,17 +61,14 @@ def extract_onepage_tables_from_image(self, for matrix in tables: for location in matrix.locations: - location.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape) + location.bbox = BBox.rotate_coordinates(bbox=location.bbox, angle_rotate=-angle_rotate, image_shape=image.shape) + location.rotated_angle = angle_rotate tables = self.__select_attributes_matrix_tables(tables=tables) - """ - TODO: fix in the future - if orient_analysis_cells: - tables = self.__analyze_header_cell_with_diff_orient(tables, language, orient_cell_angle)""" return tables - """ TODO fix in the future + """ TODO fix in the future (REMOVE) def __detect_diff_orient(self, cell_text: str) -> bool: # 1 - разбиваем на строки длины которых состоят хотя бы из одного символа parts = cell_text.split("\n") @@ -165,7 +164,8 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type: cur_table.matrix_cells = self.splitter.split(cells=cur_table.matrix_cells) # Эвристика 2: таблица должна иметь больше одного столбца - if len(cur_table.matrix_cells[0]) > 1 or (self.table_options.detect_one_cell_table in table_type and 
cur_table.matrix_cells[0] != []): + if len(cur_table.matrix_cells[0]) > 1 or ( + self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []): tables.append(cur_table) if self.table_options.split_last_column in table_type: diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 27871475..de5ca03a 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -129,7 +129,7 @@ def __extract(self, path: str, start_page: int = None, end_page: int = None) -> return all_lines, all_tables, all_tables_on_images - def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]: + def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[Table], List[ScanTable]]: tables = [] tables_on_image = [] page_number = page["number"] @@ -272,6 +272,6 @@ def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, - path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]: + path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]: - return [], [], [] + return [], [], [], [] diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 27b76d5f..827b16c8 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -43,7 +43,7 @@ def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, - path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]: + path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]: if parameters.need_pdf_table_analysis: gray_image = 
self._convert_to_gray(image) cleaned_image, tables = self.table_recognizer.recognize_tables_from_image( @@ -59,7 +59,7 @@ def _process_one_page(self, page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number) if page is None: - return [], [], [] + return [], [], [], [] unreadable_blocks = [location.bbox for table in tables for location in table.locations] page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)] lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False) @@ -67,7 +67,7 @@ def _process_one_page(self, if self.config.get("labeling_mode"): save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path)) - return lines, tables, page.attachments + return lines, tables, page.attachments, [] def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool: """ diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py index 53212738..0d750831 100644 --- a/dedoc/scripts/test_words_bbox_extraction.py +++ b/dedoc/scripts/test_words_bbox_extraction.py @@ -8,6 +8,7 @@ import numpy as np from dedoc.api.dedoc_api import config +from dedoc.utils.image_utils import rotate_image from dedoc.utils.pdf_utils import get_page_image from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -104,7 +105,15 @@ def normalize_font_thickness(image: np.ndarray) -> Tuple[float, int]: return font_scale, thickness @staticmethod - def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType]) -> np.ndarray: + def rotate_coordinate(x: int, y: int, xc: float, yc: float, angle: float) -> Tuple[int, int]: + rad = angle * math.pi / 180 + x_rotated = int(float(x - xc) * math.cos(rad) - float(y - yc) * math.sin(rad) + xc) + y_rotated = int(float(y - yc) * math.cos(rad) + float(x - xc) * math.sin(rad) + yc) + + return 
x_rotated, y_rotated + + @staticmethod + def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType], angle: float) -> np.ndarray: font_scale, thickness = TestWordExtraction.normalize_font_thickness(image) @@ -112,6 +121,11 @@ def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConf bbox = json.loads(ann.bbox) p1 = (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])) p2 = (int((bbox["x_top_left"] + bbox["width"]) * bbox["page_width"]), int((bbox["y_top_left"] + bbox["height"]) * bbox["page_height"])) + x_c = image.shape[1] / 2 + y_c = image.shape[0] / 2 + p1 = TestWordExtraction.rotate_coordinate(p1[0], p1[1], x_c, y_c, angle) + p2 = TestWordExtraction.rotate_coordinate(p2[0], p2[1], x_c, y_c, angle) + cv2.rectangle(image, p1, p2, (0, 255, 0) if ann.text_type == "typewritten" else (255, 0, 0)) text = ",".join(ann.confs) if ann.confs != [] else "None" cv2.putText(image, text, (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])), @@ -148,12 +162,19 @@ def test_tabby_document(self): def test_table_word_extraction(self): output_path = os.path.join(self.output_path) os.makedirs(output_path, exist_ok=True) - file_names = ["tables/example_with_table3.png", "tables/example_with_table4.jpg", "tables/example_with_table5.png", "tables/example_with_table6.png"] + file_names = ["tables/example_with_table5.png", "tables/example_with_table3.png", "tables/example_with_table4.jpg", + "tables/example_with_table6.png", "tables/example_with_table_horizontal_union.jpg"] for file_name in file_names: result = self._send_request(file_name, data=dict()) table0 = result["content"]["tables"][0] + page_angle = result["metadata"]["other_fields"]["rotated_page_angles"][0] + table_angle = table0["metadata"]["rotated_angle"] + word_annotations = TestWordExtraction.get_words_annotation_from_cell(table0) image = cv2.imread(self._get_abs_path(file_name)) - image =
TestWordExtraction.draw_word_annotations(image, word_annotations) + image = rotate_image(image, page_angle) + #image = rotate_image(image, table_angle) + + image = TestWordExtraction.draw_word_annotations(image, word_annotations, angle=table_angle) cv2.imwrite(os.path.join(output_path, file_name), image)