ESL-156: fix pdfminer word bounding-box output

ispras · Oct 10, 2023 · 48599dd · 48599dd
1 parent 6c17385
commit 48599dd
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 3 deletions.
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -186,7 +186,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl
                     # duplicated previous style
                     chars_with_style.append(chars_with_style[-1])
 
-        annotations = self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)
+        annotations = self.__extract_words_bbox_annotation(lobj, height, width)
         # 3 - extract range from chars_with_style array
         char_pointer = 0
 
@@ -197,7 +197,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl
 
         return annotations
 
-    def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]:
+    def __extract_words_bbox_annotation(self, lobj: LTTextContainer, height: int, width: int) -> List[Annotation]:
         words: List[WordObj] = []
         word: WordObj = WordObj(start=0, end=0, value=LTTextContainer())
         if isinstance(lobj, LTTextLineHorizontal):
@@ -216,7 +216,7 @@ def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h
         annotations = [
             BBoxAnnotation(start=word.start,
                            end=word.end,
-                           value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value),
+                           value=create_bbox(height=height, k_h=1.0, k_w=1.0, lobj=word.value),
                            page_width=width,
                            page_height=height) for word in words
         ]

diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py
@@ -135,6 +135,10 @@ def test_pdfminer_document(self):
         structure = result["content"]["structure"]
         word_annotations = self.__get_words_annotation(structure)
         image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
+        ann = word_annotations[0]
+        if ann is not None:
+            bbox = json.loads(ann.bbox)
+            image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)
         image = self.__draw_word_annotations(image, word_annotations)
         cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)