ESL-137 fixed after review; removing some unused functions

- fixed after review - removing some unused functions
ispras · Sep 22, 2023 · 156586a · 156586a
1 parent fc71c10
commit 156586a
Show file tree

Hide file tree

Showing 13 changed files with 29 additions and 134 deletions.
diff --git a/dedoc/data_structures/__init__.py b/dedoc/data_structures/__init__.py
@@ -3,6 +3,7 @@
 from .annotation import Annotation
 from .attached_file import AttachedFile
 from .bbox import BBox
+from .cell_with_meta import CellWithMeta
 from .concrete_annotations import *
 from .document_content import DocumentContent
 from .document_metadata import DocumentMetadata
@@ -16,6 +17,5 @@
 from .tree_node import TreeNode
 from .unstructured_document import UnstructuredDocument
 
-__all__ = (['Annotation', 'AttachedFile', 'BBox', 'CellProperty', 'DocumentContent', 'DocumentMetadata', 'HierarchyLevel', 'LineMetadata',
-           'LineWithMeta', 'ParsedDocument', 'Serializable', 'Table', 'TableMetadata', 'TreeNode', 'UnstructuredDocument']
-           + annotations.__all__)
+__all__ = ['Annotation', 'AttachedFile', 'BBox', 'DocumentContent', 'DocumentMetadata', 'HierarchyLevel', 'LineMetadata',
+           'LineWithMeta', 'ParsedDocument', 'Serializable', 'Table', 'TableMetadata', 'CellWithMeta', 'TreeNode', 'UnstructuredDocument', *annotations.__all__]
diff --git a/dedoc/data_structures/bbox.py b/dedoc/data_structures/bbox.py
@@ -52,23 +52,19 @@ def y_bottom_right(self) -> int:
     def crop_image_by_box(image: np.ndarray, bbox: "BBox") -> np.ndarray:
         return image[bbox.y_top_left:bbox.y_bottom_right, bbox.x_top_left:bbox.x_bottom_right]
 
-    @staticmethod
-    def rotate_coordinates(bbox: "BBox", angle_rotate: float, image_shape: Tuple[int]) -> "BBox":
-        xb, yb = bbox.x_top_left, bbox.y_top_left
-        # TODO check!!! was xe, ye = self.bbox.x_begin + self.bbox.width, self.bbox.x_begin + self.bbox.height
-        xe, ye = bbox.x_bottom_right, bbox.y_bottom_right  # self.bbox.x_top_left + self.bbox.height
+    def rotate_coordinates(self, angle_rotate: float, image_shape: Tuple[int]) -> None:
+        xb, yb = self.x_top_left, self.y_top_left
+        xe, ye = self.x_bottom_right, self.y_bottom_right
         rad = angle_rotate * math.pi / 180
 
-        xc = image_shape[1] // 2
-        yc = image_shape[0] // 2
-
-        bbox_xb = min((int(float(xb - xc) * math.cos(rad) - float(yb - yc) * math.sin(rad)) + xc), image_shape[1])
-        bbox_yb = min((int(float(yb - yc) * math.cos(rad) + float(xb - xc) * math.sin(rad)) + yc), image_shape[0])
-        bbox_xe = min((int(float(xe - xc) * math.cos(rad) - float(ye - yc) * math.sin(rad)) + xc), image_shape[1])
-        bbox_ye = min((int(float(ye - yc) * math.cos(rad) + float(xe - xc) * math.sin(rad)) + yc), image_shape[0])
-        bbox_new = BBox(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb)
+        xc = image_shape[1] / 2
+        yc = image_shape[0] / 2
 
-        return bbox_new
+        bbox_xb = min((int(float(xb - xc) * math.cos(rad) - float(yb - yc) * math.sin(rad) + xc)), image_shape[1])
+        bbox_yb = min((int(float(yb - yc) * math.cos(rad) + float(xb - xc) * math.sin(rad) + yc)), image_shape[0])
+        bbox_xe = min((int(float(xe - xc) * math.cos(rad) - float(ye - yc) * math.sin(rad) + xc)), image_shape[1])
+        bbox_ye = min((int(float(ye - yc) * math.cos(rad) + float(xe - xc) * math.sin(rad) + yc)), image_shape[0])
+        self.__init__(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb)
 
     def __str__(self) -> str:
         return f"BBox(x = {self.x_top_left} y = {self.y_top_left}, w = {self.width}, h = {self.height})"

diff --git a/dedoc/data_structures/cell_property.py b/dedoc/data_structures/cell_property.py
@@ -1,29 +1,25 @@
 from collections import OrderedDict
-from typing import List
 
 import numpy as np
 from flask_restx import Api, Model, fields
 
-from dedoc.data_structures.annotation import Annotation
 from dedoc.data_structures.serializable import Serializable
 
 
 class CellProperty(Serializable):
     """
     This class holds information about the table cell.
     """
-    def __init__(self, colspan: int, rowspan: int, invisible: bool, annotations: List[Annotation] = []) -> None:  # noqa
+    def __init__(self, colspan: int, rowspan: int, invisible: bool) -> None:
         """
         :param cell: class which should contain the following attributes: colspan, rowspan, invisible.
         """
         self.colspan = colspan
         self.rowspan = rowspan
         self.invisible = invisible
-        self.annotations = annotations
 
     def to_dict(self) -> dict:
         res = OrderedDict()
-        res["annotations"] = [annotation.to_dict() for annotation in self.annotations]
         res["colspan"] = int(np.int8(self.colspan)) if self.colspan else None
         res["rowspan"] = int(np.int8(self.rowspan)) if self.colspan else None
         res["invisible"] = self.invisible
@@ -35,6 +31,4 @@ def get_api_dict(api: Api) -> Model:
             "colspan": fields.Integer(description="attribute of union column count"),
             "rowspan": fields.Integer(description="attribute of union row count"),
             "invisible": fields.Boolean(description='flag for cell display (for example: if invisible==true then style="display: none")'),
-            "annotations": fields.List(
-                fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
         })
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -91,9 +91,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         tables = []
         for scan_table in scan_tables:
             metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle)
-            cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row]
-                               for row in scan_table.matrix_cells]
-
+            cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in scan_table.matrix_cells]
             table = Table(metadata=metadata, cells=cells_with_meta)
             tables.append(table)
 

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/metadata_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/metadata_extractor.py
@@ -127,9 +127,6 @@ def __set_indentations(self, page: PageWithBBox) -> PageWithBBox:
 
         return page
 
-    def __get_line_metadata(self, bbox: TextWithBBox, page_with_lines: PageWithBBox) -> LineMetadata:
-        return LineMetadata(page_id=page_with_lines.page_num, line_id=bbox.line_num)
-
     def __get_font_size(self, bbox: TextWithBBox, image_height: int) -> int:
         """
         determines the font size by the bbox size, return font size in typography point

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py
@@ -21,14 +21,10 @@ def __init__(self, *, config: dict) -> None:
         self.logger = config.get("logger", logging.getLogger())
 
     def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"], language: str) -> List[List[LineWithMeta]]:  # noqa
-        # try:
-        # if len(img_cells) == 0:
-        #    return []
         for node in tree_nodes:
             node.set_crop_text_box(page_image)
-        # img_cells_cropped = map(crop_image_text, img_cells)
-        # ids, images = zip(*sorted(enumerate(img_cells_cropped), key=lambda t: -t[1].shape[1]))
-        tree_nodes.sort(key=lambda t: -t.crop_text_box.width)  # TODO check
+
+        tree_nodes.sort(key=lambda t: -t.crop_text_box.width)
         originalbox_to_fastocrbox = {}
         batches = list(self.__nodes2batch(tree_nodes))
         for num_batch, nodes_batch in enumerate(batches):
@@ -66,8 +62,7 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
 
         return self.__create_lines_with_meta(tree_nodes, originalbox_to_fastocrbox, page_image)
 
-    def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> (  # noqa
-            Tuple)[OcrPage, List[BBox]]:
+    def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> Tuple[OcrPage, List[BBox]]: # noqa
         concatenated, chunk_boxes = self.__concat_images(src_image=src_image, tree_table_nodes=tree_table_nodes)
         if self.config.get("debug_mode", False):
             image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", f"stacked_batch_image_{num_batch}.png")

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py
@@ -1,17 +1,10 @@
-import concurrent.futures
-from collections import namedtuple
-from typing import Iterable, Iterator, List
+from typing import Iterable, List
 
 import numpy as np
 
-from dedoc.data_structures.bbox import BBox
 from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox
 from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
-from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_document_page, \
-    get_text_with_bbox_from_document_page_one_column
-
-BBoxLevel = namedtuple("BBoxLevel", ["text_line", "some_word"])
-bbox_level = BBoxLevel(4, 5)
+from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_document_page, get_text_with_bbox_from_document_page_one_column
 
 
 class OCRLineExtractor:
@@ -30,43 +23,6 @@ def split_image2lines(self,
         if len(filtered_bboxes) >= 0:
             new_parsed_doc = PageWithBBox(page_num=page_num, bboxes=filtered_bboxes, image=image)
             return new_parsed_doc
-    """
-    def split_imagecell2lines(self,
-                          cell_image: np.ndarray,
-                          page_num: int,
-                          page_height: int,
-                          page_width: int,
-                          language: str = "rus+eng",) -> PageWithBBox:
-        bboxes = self.__split_image2bboxes_from_cell(cell_image=cell_image, page_num=page_num, language=language, page_height=page_height,
-        page_width=page_width)
-
-        filtered_bboxes = list(self._filtered_bboxes(bboxes))
-        if len(filtered_bboxes) >= 0:
-            new_parsed_cell = PageWithBBox(page_num=page_num, bboxes=filtered_bboxes, image=cell_image)
-            return new_parsed_cell"""
-
-    def split_images2lines(self, images: Iterator[np.ndarray], language: str = "rus+eng") -> List[PageWithBBox]:
-        input_data = ((page, image, language) for page, image in enumerate(images))
-        with concurrent.futures.ProcessPoolExecutor(max_workers=self.config["n_jobs"]) as executor:
-            documents = executor.map(self._parse_one_image, input_data)
-
-        return [doc for doc in documents if doc is not None]
-
-    def _parse_one_image(self, args: List) -> PageWithBBox:
-        page_num, image, language = args
-        bboxes = self.__split_image2bboxes(image=image, page_num=page_num, language=language, is_one_column_document=True)
-        if len(bboxes) > 0:
-            new_parsed_doc = PageWithBBox(page_num=page_num, bboxes=bboxes, image=image)
-            return new_parsed_doc
-
-    @staticmethod
-    def _is_box_in(box1: BBox, box2: BBox) -> bool:
-        """
-        check if box1 is in box2
-        """
-        x_inside = (box1.x_top_left >= box2.x_top_left) and (box1.x_bottom_right <= box2.x_bottom_right)
-        y_inside = (box1.y_top_left >= box2.y_top_left) and (box1.y_bottom_right <= box2.y_bottom_right)
-        return x_inside and y_inside
 
     def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str, is_one_column_document: bool) -> List[TextWithBBox]:
         ocr_conf_threshold = self.config.get("ocr_conf_threshold", -1)
@@ -81,14 +37,6 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str,
 
         return line_boxes
 
-    """def __split_image2bboxes_from_cell(self, cell_image: np.ndarray, page_num: int, language: str, page_height: int, page_width: int) -> List[TextWithBBox]
-        output_dict = get_text_with_bbox_from_cells(cell_image, language, ocr_conf_threshold=0.0)
-        line_boxes = [
-            TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(page_width, page_height))
-            for line_num, line in enumerate(output_dict.lines)]
-
-        return line_boxes"""
-
     def _filtered_bboxes(self, bboxes: List[TextWithBBox]) -> Iterable[TextWithBBox]:
         for text_with_bbox in bboxes:
             bbox = text_with_bbox.bbox

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_utils.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_utils.py
@@ -3,22 +3,6 @@
 
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage
 
-"""
-def get_cell_text_by_ocr(img_cell: np.ndarray, language: str) -> str:
-    if img_cell.shape[0] == 0 or img_cell.shape[1] == 0:
-        return ""
-
-    text = get_text_from_table_cell(img_cell, language=language)
-
-    return text
-
-
-def get_text_from_table_cell(image: np.ndarray, language: str) -> str:
-    config = "--psm 6"
-    text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)["text"]
-    return text
-"""
-
 
 def get_text_with_bbox_from_document_page_one_column(image: np.ndarray, language: str, ocr_conf_threshold: float) -> OcrPage:
     """

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py b/dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py
@@ -1,12 +1,9 @@
 import logging
-from typing import Iterator, List
 
 import cv2
 import numpy as np
-from joblib import Parallel, delayed
 
 from dedoc.utils.image_utils import rotate_image
-from dedoc.utils.utils import get_batch
 
 
 class ScanRotator:
@@ -48,13 +45,3 @@ def auto_rotate(self, image: np.ndarray, orientation_angle: int = 0) -> (np.ndar
         if self.config.get("debug_mode"):
             self.logger.debug(f"Best angle: {best_angle}, orientation angle: {orientation_angle}")
         return rotated, best_angle + orientation_angle
-
-    def rotate(self, images: List[np.ndarray]) -> Iterator[np.ndarray]:
-        """
-        automatic rotation of list of images
-        """
-        n_jobs = self.config["n_jobs"]
-        for batch in get_batch(size=n_jobs, iterable=images):
-            rotated_ = Parallel(n_jobs=n_jobs)(delayed(self.auto_rotate)(img) for img in batch)
-            for res, _ in rotated_:
-                yield res
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -5,16 +5,14 @@
 
 import numpy as np
 
-from dedoc.data_structures import BBox
 from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
 from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor
-from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import \
-    TableAttributeExtractor
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours
 
 
@@ -61,7 +59,7 @@ def extract_onepage_tables_from_image(self,
 
         for matrix in tables:
             for location in matrix.locations:
-                location.bbox = BBox.rotate_coordinates(bbox=location.bbox, angle_rotate=-angle_rotate, image_shape=image.shape)
+                location.bbox.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape)
                 location.rotated_angle = angle_rotate
 
         tables = self.__select_attributes_matrix_tables(tables=tables)

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -3,7 +3,6 @@
 import math
 import os
 import subprocess
-from collections import namedtuple
 from typing import List, Optional, Tuple
 
 import numpy as np
@@ -36,8 +35,6 @@
 from dedoc.utils.parameter_utils import get_param_page_slice
 from dedoc.utils.utils import calculate_file_hash
 
-CellPropertyInfo = namedtuple("NamedTuple", "colspan, rowspan, invisible")
-
 
 class PdfTabbyReader(PdfBaseReader):
     """

diff --git a/dedoc/readers/reader_composition.py b/dedoc/readers/reader_composition.py
@@ -42,7 +42,6 @@ def parse_file(self, tmp_dir: str, filename: str, parameters: Dict[str, str]) ->
             if can_read:
                 unstructured_document = reader.read(path=file_path, document_type=document_type, parameters=parameters)
                 assert len(unstructured_document.lines) == 0 or isinstance(unstructured_document.lines[0], LineWithMeta)
-                assert isinstance(unstructured_document, UnstructuredDocument)  # TODO remove
                 return unstructured_document
 
         raise BadFileFormatError(

diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py
@@ -113,18 +113,20 @@ def rotate_coordinate(x: int, y: int, xc: float, yc: float, angle: float) -> Tup
         return x_rotated, y_rotated
 
     @staticmethod
-    def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType], angle: float) -> np.ndarray:
+    def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType], angle: float = 0.) -> np.ndarray:
 
         font_scale, thickness = TestWordExtraction.normalize_font_thickness(image)
+        x_c = image.shape[1] / 2
+        y_c = image.shape[0] / 2
 
         for ann in word_annotations:
             bbox = json.loads(ann.bbox)
             p1 = (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"]))
             p2 = (int((bbox["x_top_left"] + bbox["width"]) * bbox["page_width"]), int((bbox["y_top_left"] + bbox["height"]) * bbox["page_height"]))
-            x_c = image.shape[1] / 2
-            y_c = image.shape[0] / 2
-            p1 = TestWordExtraction.rotate_coordinate(p1[0], p1[1], x_c, y_c, angle)
-            p2 = TestWordExtraction.rotate_coordinate(p2[0], p2[1], x_c, y_c, angle)
+
+            if angle == 0.0:
+                p1 = TestWordExtraction.rotate_coordinate(p1[0], p1[1], x_c, y_c, angle)
+                p2 = TestWordExtraction.rotate_coordinate(p2[0], p2[1], x_c, y_c, angle)
 
             cv2.rectangle(image, p1, p2, (0, 255, 0) if ann.text_type == "typewritten" else (255, 0, 0))
             text = ",".join(ann.confs) if ann.confs != [] else "None"