Merge remote-tracking branch 'origin/add_tabby_cells_bbox' into add_tabby_cells_bbox

# Conflicts:
#   dedoc/data_structures/line_with_meta.py
#   dedoc/readers/pdf_reader/data_classes/tables/cell.py
#   dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
#   dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
#   dedoc/scripts/test_words_bbox_extraction.py
ispras · Oct 11, 2023 · 7922b18 · 7922b18
2 parents 4c7e5eb + f3779fe
commit 7922b18
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 73 deletions.
diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
@@ -55,9 +55,6 @@ def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta":
 
         return common_line
 
-    def __lt__(self, other: "LineWithMeta") -> bool:
-        return self.line < other.line
-
     def split(self, sep: str) -> List["LineWithMeta"]:
         """
         Split this line into a list of lines, keep annotations consistent.

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -3,7 +3,6 @@
 
 from dedocutils.data_structures import BBox
 
-from dedoc.data_structures import BBoxAnnotation
 from dedoc.data_structures.annotation import Annotation
 from dedoc.data_structures.line_with_meta import LineWithMeta
 
@@ -71,24 +70,6 @@ def get_text(self) -> str:
     def get_annotations(self) -> List[Annotation]:
         return LineWithMeta.join(self.lines, delimiter="\n").annotations
 
-    def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
-        for i_line, _ in enumerate(self.lines):
-            for i_ann, annotation in enumerate(self.lines[i_line].annotations):
-                if annotation.name != "bounding box":
-                    continue
-
-                bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
-                k_w = new_page_width / page_width
-                k_h = new_page_height / page_height
-                new_bbox = BBox(x_top_left=int(bbox.x_top_left * k_w), y_top_left=int(bbox.y_top_left * k_h),
-                                width=int(bbox.width * k_w), height=int(bbox.height * k_h))
-
-                self.lines[i_line].annotations[i_ann] = BBoxAnnotation(start=annotation.start,
-                                                                       end=annotation.end,
-                                                                       value=new_bbox,
-                                                                       page_width=new_page_width,
-                                                                       page_height=new_page_height)
-
     def __repr__(self) -> str:
         return self.__str__()
 

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -63,24 +63,12 @@ def _process_one_page(self,
         unreadable_blocks = [location.bbox for table in tables for location in table.locations]
         page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
         lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
-        self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
 
         if self.config.get("labeling_mode"):
             save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))
 
         return lines, tables, page.attachments, []
 
-    def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
-        """
-        Change table boxes's width height into pdf space like textual lines
-        """
-
-        for table in tables:
-            for row in table.matrix_cells:
-
-                for cell in row:
-                    cell.change_lines_boxes_page_width_height(new_page_width=pdf_width, new_page_height=pdf_height)
-
     def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool:
         """
         Check obj_bbox inside some unreadable blocks or not

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -107,7 +107,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithB
 
         attachments = images if len(images) < 10 else []
 
-        return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments, pdf_page_height=height, pdf_page_width=width)
+        return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments)
 
     def __extract_image(self,
                         directory: str,
@@ -186,7 +186,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl
                     # duplicated previous style
                     chars_with_style.append(chars_with_style[-1])
 
-        annotations = self.__extract_words_bbox_annotation(lobj, height, width)
+        annotations = self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)
         # 3 - extract range from chars_with_style array
         char_pointer = 0
 
@@ -197,7 +197,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl
 
         return annotations
 
-    def __extract_words_bbox_annotation(self, lobj: LTTextContainer, height: int, width: int) -> List[Annotation]:
+    def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]:
         words: List[WordObj] = []
         word: WordObj = WordObj(start=0, end=0, value=LTTextContainer())
         if isinstance(lobj, LTTextLineHorizontal):
@@ -216,7 +216,7 @@ def __extract_words_bbox_annotation(self, lobj: LTTextContainer, height: int, wi
         annotations = [
             BBoxAnnotation(start=word.start,
                            end=word.end,
-                           value=create_bbox(height=height, k_h=1.0, k_w=1.0, lobj=word.value),
+                           value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value),
                            page_width=width,
                            page_height=height) for word in words
         ]

diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py
@@ -127,36 +127,35 @@ def __draw_word_annotations(self, image: np.ndarray, word_annotations: List[Bbox
                         cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 255), thickness)
         return image
 
-    def __draw_tables_words(self, tables: List[dict], image: np.ndarray) -> np.ndarray:
-        for table in tables:
-            table_angle = table["metadata"]["rotated_angle"]
-
-            word_annotations = self.__get_words_annotation_from_cell(table)
-            image = self.__draw_word_annotations(image, word_annotations, angle=table_angle)
-        return image
-
-    def test_pdf_documents(self):
-        filename_parameters_outputdir = [
-            ["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="true"),  "pdfminer_reader"],
-            ["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="tabby"), "tabby_reader"]
-        ]
+    def test_pdfminer_document(self):
+        output_path = os.path.join(self.output_path, "pdfminer_reader")
+        os.makedirs(output_path, exist_ok=True)
+        file_name = "pdf_with_text_layer/english_doc.pdf"
+        result = self._send_request(file_name, data=dict(pdf_with_text_layer="true"))
+        structure = result["content"]["structure"]
+        word_annotations = self.__get_words_annotation(structure)
+        image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
+        image = self.__draw_word_annotations(image, word_annotations)
+        cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)
+
+    def test_tabby_document(self):
+        output_path = os.path.join(self.output_path, "tabby_reader")
+        os.makedirs(output_path, exist_ok=True)
+        file_name = "pdf_with_text_layer/english_doc.pdf"
+        result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby"))
+        structure = result["content"]["structure"]
+        image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
+        word_annotations = self.__get_words_annotation(structure)
+        ann = word_annotations[0]
+        if ann is not None:
+            bbox = json.loads(ann.bbox)
+            image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)
 
-        for file_name, parameters, outputdir in filename_parameters_outputdir:
-            output_path = os.path.join(self.output_path, outputdir)
-            os.makedirs(output_path, exist_ok=True)
-            result = self._send_request(file_name, data=parameters)
-            structure = result["content"]["structure"]
-            word_annotations = self.__get_words_annotation(structure)
-            image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
-            ann = word_annotations[0]
-            if ann is not None:
-                bbox = json.loads(ann.bbox)
-                image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)
-            image = self.__draw_word_annotations(image, word_annotations)
-            tables = result["content"]["tables"]
-            if len(tables) > 0:
-                image = self.__draw_tables_words(tables, image)
-            cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)
+        image = self.__draw_word_annotations(image, word_annotations)
+        table0 = result["content"]["tables"][0]
+        word_annotations = self.__get_words_annotation_from_cell(table0)
+        image = self.__draw_word_annotations(image, word_annotations, angle=0)
+        cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)
 
     def test_table_word_extraction(self):
         output_path = os.path.join(self.output_path, 'tables')
@@ -169,13 +168,13 @@ def test_table_word_extraction(self):
             result = self._send_request(file_name, data=dict())
             table0 = result["content"]["tables"][0]
             page_angle = result["metadata"]["other_fields"]["rotated_page_angles"][0]
+            table_angle = table0["metadata"]["rotated_angle"]
 
+            word_annotations = self.__get_words_annotation_from_cell(table0)
             image = cv2.imread(self._get_abs_path(file_name))
             image = rotate_image(image, page_angle)
-            tables = result["content"]["tables"]
-            if len(tables) > 0:
-                image = self.__draw_tables_words(tables, image)
 
+            image = self.__draw_word_annotations(image, word_annotations, angle=table_angle)
             cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image)
 
     def test_document_image_reader(self) -> None:
@@ -192,7 +191,4 @@ def test_document_image_reader(self) -> None:
             image = cv2.imread(self._get_abs_path(filename))
             image = rotate_image(image, result["metadata"]["other_fields"].get("rotated_page_angles", [0.])[0])
             image = self.__draw_word_annotations(image, word_annotations)
-            tables = result["content"]["tables"]
-            if len(tables) > 0:
-                image = self.__draw_tables_words(tables, image)
             cv2.imwrite(os.path.join(output_path, filename.split("/")[-1]), image)