ESL-137 after review

ispras · Sep 25, 2023 · 89147f0 · 89147f0
1 parent efc2820
commit 89147f0
Show file tree

Hide file tree

Showing 27 changed files with 107 additions and 195 deletions.
diff --git a/dedoc/data_structures/cell_property.py b/dedoc/data_structures/cell_property.py
diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py
@@ -13,11 +13,11 @@ class CellWithMeta:
     """
     def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
         """
-               :param lines: text lines (LineWithMeta) of the cell
-               :param colspan: The value of the rowspan attribute represents the number of columns to span. Like HTML format.
-               :param rowspan: The value of the rowspan attribute represents the number of rows to span. Like HTML format.
-               :param invisible: Display or hide cell values
-               """
+           :param lines: text lines (LineWithMeta) of the cell
+           :param colspan: The value of the colspan attribute represents the number of columns to span. Like HTML format.
+           :param rowspan: The value of the rowspan attribute represents the number of rows to span. Like HTML format.
+           :param invisible: Display or hide cell values
+        """
         self.lines = lines
         self.colspan = colspan
         self.rowspan = rowspan
@@ -45,6 +45,5 @@ def get_api_dict(api: Api) -> Model:
             "colspan": fields.Integer(description="attribute of union column count"),
             "rowspan": fields.Integer(description="attribute of union row count"),
             "invisible": fields.Boolean(description='flag for cell display (for example: if invisible==true then style="display: none")'),
-            "lines": fields.List(
-                fields.Nested(LineWithMeta.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
+            "lines": fields.List(fields.Nested(LineWithMeta.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
         })
diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
@@ -1,6 +1,6 @@
 import re
 from collections import OrderedDict
-from typing import List, Sized, Union
+from typing import List, Optional, Sized, Union
 from uuid import uuid1
 
 from flask_restx import Api, Model, fields
@@ -17,7 +17,10 @@ class LineWithMeta(Sized):
     (for example, document title and raw text of the document should not be in the same line).
     Still the logical part of the document may be represented by more than one line (for example, document title may consist of many lines).
     """
-    def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotation], uid: str = None) -> None:
+    def __init__(self, line: str,
+                 metadata: Optional[LineMetadata] = None,
+                 annotations: Optional[List[Annotation]] = None,
+                 uid: str = None) -> None:
         """
         :param line: raw text of the document line
         :param metadata: metadata (related to the entire line, as line or page number, its hierarchy level)
@@ -26,13 +29,26 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
         """
 
         self._line = line
-        assert isinstance(metadata, LineMetadata)
+        metadata = LineMetadata(page_id=0, line_id=None) if metadata is None else metadata
         self._metadata = metadata
-        self._annotations = annotations
+        self._annotations = [] if annotations is None else annotations
         self._uid = str(uuid1()) if uid is None else uid
 
     def __len__(self) -> int:
-        return len(self.line)
+        return len(self._line)
+
+    @staticmethod
+    def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta":
+        if len(lines) == 0:
+            return LineWithMeta("")
+
+        common_line = lines[0]
+
+        for next_line in lines[1:]:
+            common_line += LineWithMeta(delimiter)
+            common_line += next_line
+
+        return common_line
 
     def split(self, sep: str) -> List["LineWithMeta"]:
         """
@@ -141,6 +157,5 @@ def to_dict(self) -> dict:
     def get_api_dict(api: Api) -> Model:
         return api.model("LineWithMeta", {
             "text": fields.String(description="line's text"),
-            "annotations": fields.List(
-                fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
+            "annotations": fields.List(fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
         })
diff --git a/dedoc/data_structures/table.py b/dedoc/data_structures/table.py
@@ -18,7 +18,6 @@ def __init__(self, cells: List[List[CellWithMeta]], metadata: TableMetadata) ->
         """
         :param cells: a list of lists of cells (cell has text, colspan and rowspan attributes).
         :param metadata: some table metadata, as location, size and so on.
-        :param cells_properties: a list of lists of cells properties - each should contain attributes rowspan, colspan, invisible (for repeated cells)
         """
         self.metadata = metadata
         self.cells = cells

diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py
@@ -16,7 +16,8 @@ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserte
         :param page_id: number of the page where table starts
         :param uid: unique identifier of the table
         :param is_inserted: indicator if table was already inserted into paragraphs list
-        :param cell_properties: information about rowspan, colspan and invisibility of each cell
+        :param rotated_angle: the value of the rotation angle by which the table was rotated during recognition.
+        Extracted boxes from a table will need to be rotated by this angle.
         """
         self.page_id = page_id
         self.uid = str(uuid.uuid1()) if not uid else uid
@@ -37,5 +38,5 @@ def get_api_dict(api: Api) -> Model:
             "page_id": fields.Integer(readonly=False, description="table start page number"),
             "uid": fields.String(description="table unique id"),
             "is_inserted": fields.Boolean(description="was the table inserted into document body"),
-            "rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes?")
+            "rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes")
         })
diff --git a/dedoc/readers/csv_reader/csv_reader.py b/dedoc/readers/csv_reader/csv_reader.py
@@ -44,7 +44,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         for row in data:
             row_lines = []
             for cell in row:
-                row_lines.append(CellWithMeta(lines=[LineWithMeta(line=cell, metadata=LineMetadata(page_id=0, line_id=line_id), annotations=[])]))
+                row_lines.append(CellWithMeta(lines=[LineWithMeta(line=cell, metadata=LineMetadata(page_id=0, line_id=line_id))]))
                 line_id += 1
             cells_with_meta.append(row_lines)
 

diff --git a/dedoc/readers/docx_reader/data_structures/cell_property.py b/dedoc/readers/docx_reader/data_structures/cell_property.py
@@ -0,0 +1,13 @@
+
+
+class CellProperty:
+    """
+    This class holds information about the table cell.
+    """
+    def __init__(self, colspan: int, rowspan: int, invisible: bool) -> None:
+        """
+        :param colspan, rowspan, invisible: number of columns to span, number of rows to span, and whether the cell value is hidden.
+        """
+        self.colspan = colspan
+        self.rowspan = rowspan
+        self.invisible = invisible
diff --git a/dedoc/readers/docx_reader/data_structures/table.py b/dedoc/readers/docx_reader/data_structures/table.py
@@ -3,10 +3,10 @@
 from bs4 import Tag
 
 from dedoc.data_structures import LineMetadata, LineWithMeta
-from dedoc.data_structures.cell_property import CellProperty
 from dedoc.data_structures.cell_with_meta import CellWithMeta
 from dedoc.data_structures.table import Table
 from dedoc.data_structures.table_metadata import TableMetadata
+from dedoc.readers.docx_reader.data_structures.cell_property import CellProperty
 from dedoc.readers.docx_reader.data_structures.run import Run
 from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
 
@@ -83,7 +83,7 @@ def to_table(self) -> Table:
         for num_row, row in enumerate(result_cells):
             result_row = []
             for num_col, cell_text in enumerate(row):
-                cell = CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=0, line_id=None), annotations=[])],
+                cell = CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=0, line_id=0))],
                                     colspan=cell_property_list[num_row][num_col].colspan,
                                     rowspan=cell_property_list[num_row][num_col].rowspan,
                                     invisible=cell_property_list[num_row][num_col].invisible)

diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py
@@ -182,7 +182,7 @@ def __get_decoded(self, text: str) -> str:
 
     def __get_field(self, message: Message, key: str, line_metadata: LineMetadata) -> LineWithMeta:
         text = self.__get_decoded(message.get(key.lower(), ""))
-        return LineWithMeta(line=text, metadata=line_metadata, annotations=[])
+        return LineWithMeta(line=text, metadata=line_metadata)
 
     def __get_main_fields(self, message: Message) -> List[LineWithMeta]:
         lines = list()

diff --git a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py
@@ -58,9 +58,7 @@ def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
             row = []
             for col_id in range(n_cols):
                 value = str(sheet.cell_value(rowx=row_id, colx=col_id))
-                row.append(CellWithMeta(lines=[LineWithMeta(line=value,
-                                                            metadata=LineMetadata(page_id=sheet_id, line_id=None),
-                                                            annotations=[])]))
+                row.append(CellWithMeta(lines=[LineWithMeta(line=value, metadata=LineMetadata(page_id=sheet_id, line_id=0))]))
             res.append(row)
         metadata = TableMetadata(page_id=sheet_id)
         return Table(cells=res, metadata=metadata)
diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py
@@ -157,7 +157,7 @@ def __get_li_header(self, list_type: str, index: int) -> LineWithMeta:
         else:
             header = str(index + 1) + end
         metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel(2, 1, False, line_type=HierarchyLevel.list_item), page_id=0, line_id=0)
-        header_line = LineWithMeta(line=header, metadata=metadata, annotations=[])
+        header_line = LineWithMeta(line=header, metadata=metadata)
         return header_line
 
     def __read_list(self, lst: Tag, uid: str, path_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:

diff --git a/dedoc/readers/json_reader/json_reader.py b/dedoc/readers/json_reader/json_reader.py
@@ -119,7 +119,7 @@ def __handle_one_element(self, depth: int, value: Any, line_type: str, line_type
 
         hierarchy_level = HierarchyLevel(level_1=level1, level_2=level2, can_be_multiline=False, line_type=line_type_meta)
         metadata = LineMetadata(tag_hierarchy_level=hierarchy_level, page_id=0, line_id=None)
-        line = LineWithMeta(line=self.__get_text(value), metadata=metadata, annotations=[])
+        line = LineWithMeta(line=self.__get_text(value), metadata=metadata)
         return line
 
     def __is_flat(self, value: Any) -> bool:  # noqa

diff --git a/dedoc/readers/note_reader/note_reader.py b/dedoc/readers/note_reader/note_reader.py
@@ -4,7 +4,6 @@
 from typing import Optional
 
 from dedoc.common.exceptions.bad_file_error import BadFileFormatError
-from dedoc.data_structures.line_metadata import LineMetadata
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.readers.base_reader import BaseReader
@@ -40,7 +39,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
             text = note_dict["content"]
             if isinstance(text, bytes):
                 text = text.decode()
-            lines = [LineWithMeta(line=text, annotations=[], metadata=LineMetadata(line_id=0, page_id=0))]
+            lines = [LineWithMeta(line=text)]
             unstructured = UnstructuredDocument(tables=[], lines=lines, attachments=[])
 
             return unstructured

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -1,5 +1,4 @@
 import uuid
-from collections import OrderedDict
 from typing import List, Optional
 
 from dedoc.data_structures.annotation import Annotation
@@ -37,23 +36,21 @@ def __init__(self,
                  y_top_left: int,
                  y_bottom_right: int,
                  id_con: int = -1,
-                 lines: List[LineWithMeta] = None,
+                 lines: Optional[List[LineWithMeta]] = None,
                  is_attribute: bool = False,
                  is_attribute_required: bool = False,
                  rotated_angle: int = 0,
                  uid: str = None,
                  contour_coord: Optional[BBox] = None) -> None:
 
-        if lines is None:
-            lines = []
         assert x_top_left <= x_bottom_right
         assert y_top_left <= y_bottom_right
         self.x_top_left = x_top_left
         self.x_bottom_right = x_bottom_right
         self.y_top_left = y_top_left
         self.y_bottom_right = y_bottom_right
         self.id_con = id_con
-        self.lines = lines
+        self.lines = [] if lines is None else lines
         self.is_attribute = is_attribute
         self.is_attribute_required = is_attribute_required
         self.rotated_angle = rotated_angle
@@ -67,13 +64,10 @@ def __str__(self) -> str:
         return f"Cell((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"
 
     def get_text(self) -> str:
-        return "\n".join([line.line for line in self.lines])
+        return LineWithMeta.join(self.lines).line
 
     def get_annotations(self) -> List[Annotation]:
-        annotations = []
-        for line in self.lines:
-            annotations.extend(line.annotations)
-        return annotations
+        return LineWithMeta.join(self.lines).annotations
 
     def __repr__(self) -> str:
         return self.__str__()
@@ -85,16 +79,3 @@ def width(self) -> int:
     @property
     def height(self) -> int:
         return self.y_bottom_right - self.y_top_left
-
-    def to_dict(self) -> dict:
-        cell_dict = OrderedDict()
-        cell_dict["text"] = self.get_text()
-        cell_dict["is_attribute"] = self.is_attribute
-        cell_dict["colspan"] = self.colspan
-        cell_dict["rowspan"] = self.rowspan
-        cell_dict["invisible"] = self.invisible
-
-        return cell_dict
-
-    def set_rotated_angle(self, rotated_angle: int) -> None:
-        self.rotated_angle = rotated_angle
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -156,7 +156,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple
 
     @abstractmethod
     def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \
-            -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
+            -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
+        """
+            Parses the page image and returns a tuple of:
+            - recognized textual lines with annotations
+            - recognized tables on an image
+            - attachments (figures on images)
+            - [rotated_angle] - the angle by which the image was rotated for recognition
+        """
         pass
 
     def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]:

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py
@@ -35,8 +35,7 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
                 for i, table_tree_node in enumerate(nodes_batch):
                     cv2.imwrite(os.path.join(tmp_dir, f"image_{num_batch}_{i}.png"), BBox.crop_image_by_box(page_image, table_tree_node.cell_box))
 
-            ocr_result, chunk_boxes = self.__handle_one_batch(src_image=page_image, tree_table_nodes=nodes_batch, num_batch=num_batch,
-                                                              language=language)
+            ocr_result, chunk_boxes = self.__handle_one_batch(src_image=page_image, tree_table_nodes=nodes_batch, num_batch=num_batch, language=language)
 
             for chunk_index, _ in enumerate(chunk_boxes):
                 originalbox_to_fastocrbox[nodes_batch[chunk_index].cell_box] = []

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -70,7 +70,7 @@ def _process_one_page(self,
                           image: np.ndarray,
                           parameters: ParametersForParseDoc,
                           page_number: int,
-                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
+                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
         #  --- Step 1: correct orientation and detect column count ---
         rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
         if self.config.get("debug_mode"):
@@ -104,7 +104,7 @@ def _process_one_page(self,
 
         return lines, tables, page.attachments, [angle]
 
-    def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, int]:
+    def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, float]:
         """
         Function :
             - detects the number of page columns

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py b/dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py
@@ -22,7 +22,7 @@ def determine_score(self, arr: np.ndarray, angle: int) -> (np.ndarray, float):
         score = np.sum((histogram[1:] - histogram[:-1]) ** 2, dtype=float)
         return score
 
-    def auto_rotate(self, image: np.ndarray, orientation_angle: int = 0) -> (np.ndarray, int):
+    def auto_rotate(self, image: np.ndarray, orientation_angle: float = 0.) -> (np.ndarray, float):
         if orientation_angle:
             image = rotate_image(image, orientation_angle)
 

diff --git a/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -162,8 +162,7 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type:
                     cur_table.matrix_cells = self.splitter.split(cells=cur_table.matrix_cells)
 
                     # Эвристика 2: таблица должна иметь больше одного столбца
-                    if len(cur_table.matrix_cells[0]) > 1 or (
-                            self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
+                    if len(cur_table.matrix_cells[0]) > 1 or (self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
                         tables.append(cur_table)
 
                     if self.table_options.split_last_column in table_type: