Esl 137 added boxes into table (#333)

* ESL-137 added box extraction skeleton into scan table extraction * ESL-138 ESL-137 a lot of table changes - added CellWithMeta - change output table structure, remove CellProperties in output - change logic bbox extraction from image tables after debugging - change output in CSV, HTML, TABBY, PDF, SCAN readers - change all tests with tables - fixed styles * ESL-137 changed draw table script * ESL-148 added script of table word boxes drawing * TLDR-471 added angle rotation from PdfImageReader and Tables * ESL-137 fixed unit-tests * ESL-137 fixed after review; removing some unused functions - fixed after review - removing some unused functions * ESL-137 update docs * ESL-137 after review
ispras · Oct 11, 2023 · 4379fa7 · 4379fa7
1 parent a106293
commit 4379fa7
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 54 deletions.
diff --git a/dedoc/readers/docx_reader/data_structures/cell_property.py b/dedoc/readers/docx_reader/data_structures/cell_property.py
@@ -0,0 +1,13 @@
+
+
+class CellProperty:
+    """
+    This class holds information about the table cell.
+    """
+    def __init__(self, colspan: int, rowspan: int, invisible: bool) -> None:
+        """
+        Stores the given colspan, rowspan and invisible arguments in attributes of the same names.
+        """
+        self.colspan = colspan
+        self.rowspan = rowspan
+        self.invisible = invisible
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -3,14 +3,14 @@
 import math
 import os
 import subprocess
-from collections import namedtuple
 from typing import List, Optional, Tuple
 
 import numpy as np
 
 from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError
 from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError
 from dedoc.data_structures.bbox import BBox
+from dedoc.data_structures.cell_with_meta import CellWithMeta
 from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
 from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
 from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
@@ -35,8 +35,6 @@
 from dedoc.utils.parameter_utils import get_param_page_slice
 from dedoc.utils.utils import calculate_file_hash
 
-CellPropertyInfo = namedtuple("NamedTuple", "colspan, rowspan, invisible")
-
 
 class PdfTabbyReader(PdfBaseReader):
     """
@@ -80,7 +78,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
         parameters = {} if parameters is None else parameters
-        lines, scan_tables, tables_cell_properties = self.__extract(path=path)
+        lines, tables, tables_on_images = self.__extract(path=path)
         warnings = []
         document_metadata = None
 
@@ -95,15 +93,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
             if last_page != math.inf:
                 document_metadata["last_page"] = last_page
 
-        lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[])
-        tables = []
-        assert len(scan_tables) == len(tables_cell_properties)
-        for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties):
-            cell_properties = [[cellp for cellp in row] for row in table_cells_property]
-            metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
-            cells = [[cell for cell in row] for row in scan_table.matrix_cells]
-            table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties)
-            tables.append(table)
+        lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=[])
 
         attachments = []
         if self._can_contain_attachements(path) and self.attachment_extractor.with_attachments(parameters):
@@ -117,56 +107,56 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
 
         return self._postprocess(result)
 
-    def __extract(self, path: str, start_page: int = None, end_page: int = None) \
-            -> Tuple[List[LineWithMeta], List[ScanTable], List[List[List[CellPropertyInfo]]]]:
+    def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[Table], List[ScanTable]]:
         file_hash = calculate_file_hash(path=path)
         document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
+
         all_lines = []
         all_tables = []
-        all_cell_properties = []
+        all_tables_on_images = []
         for page in document.get("pages", []):
-            lines = self.__get_lines_with_location(page, file_hash)
-            if lines:
-                all_lines.extend(lines)
-            tables, cell_properties = self.__get_tables(page, file_hash)
-            if tables:
-                all_tables.extend(tables)
-                all_cell_properties.extend(cell_properties)
-
-        return all_lines, all_tables, all_cell_properties
-
-    def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[ScanTable], List[List[List[CellPropertyInfo]]]]:
+            page_lines = self.__get_lines_with_location(page, file_hash)
+            if page_lines:
+                all_lines.extend(page_lines)
+            page_tables, table_on_images = self.__get_tables(page, file_hash)
+            assert len(page_tables) == len(table_on_images)
+            if page_tables:
+                all_tables.extend(page_tables)
+                all_tables_on_images.extend(table_on_images)
+
+        return all_lines, all_tables, all_tables_on_images
+
+    def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[Table], List[ScanTable]]:
         tables = []
-        cell_properties = []
+        tables_on_image = []
         page_number = page["number"]
-        i = 0
-        for table in page["tables"]:
-            i += 1
+        for table_num, table in enumerate(page["tables"]):
             x_top_left = table["x_top_left"]
             y_top_left = table["y_top_left"]
             x_bottom_right = x_top_left + table["width"]
             y_bottom_right = y_top_left + table["height"]
-            order = table["order"]
+            order = table["order"]  # TODO add table order into TableMetadata
             rows = table["rows"]
-            cell_properties_json = table["cell_properties"]
-            cell_property_list = []
-
-            for cell_properties_row in cell_properties_json:
-                cell_property_row_list = []
-
-                for cell_property in cell_properties_row:
-                    cell_property_info = CellPropertyInfo(cell_property["col_span"], cell_property["row_span"], bool(cell_property["invisible"]))
-                    cell_property_row_list.append(cell_property_info)
-
-                cell_property_list.append(cell_property_row_list)
-
-            cells = [row for row in rows]
-            bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
-
-            tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order))
-            cell_properties.append(cell_property_list)
-
-        return tables, cell_properties
+            cell_properties = table["cell_properties"]
+            assert len(rows) == len(cell_properties)
+
+            result_cells = []
+            for num_row, row in enumerate(rows):
+                assert len(row) == len(cell_properties[num_row])
+                result_row = []
+                for num_col, cell_text in enumerate(row):
+                    result_row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=page_number, line_id=0))],
+                                                   colspan=cell_properties[num_row][num_col]["col_span"],
+                                                   rowspan=cell_properties[num_row][num_col]["row_span"],
+                                                   invisible=bool(cell_properties[num_row][num_col]["invisible"])))
+
+                result_cells.append(result_row)
+            table_bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))  # noqa TODO add table location into TableMetadata
+            tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, is_inserted=False)))
+            table_name = file_hash + str(page_number) + str(table_num)
+            tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order))
+
+        return tables, tables_on_image
 
     def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
         lines = []
@@ -274,6 +264,6 @@ def _process_one_page(self,
                           image: np.ndarray,
                           parameters: ParametersForParseDoc,
                           page_number: int,
-                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
+                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
 
-        return [], [], []
+        return [], [], [], []
diff --git a/dedoc/version.py b/dedoc/version.py
@@ -1 +1 @@
-__version__ = ""
+__version__ = "0.11.2"