diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py index 29e3c39a..3b78a259 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py @@ -186,7 +186,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl # duplicated previous style chars_with_style.append(chars_with_style[-1]) - annotations = self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width) + annotations = self.__extract_words_bbox_annotation(lobj, height, width) # 3 - extract range from chars_with_style array char_pointer = 0 @@ -197,7 +197,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl return annotations - def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]: + def __extract_words_bbox_annotation(self, lobj: LTTextContainer, height: int, width: int) -> List[Annotation]: words: List[WordObj] = [] word: WordObj = WordObj(start=0, end=0, value=LTTextContainer()) if isinstance(lobj, LTTextLineHorizontal): @@ -216,7 +216,7 @@ def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h annotations = [ BBoxAnnotation(start=word.start, end=word.end, - value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value), + value=create_bbox(height=height, k_h=1.0, k_w=1.0, lobj=word.value), page_width=width, page_height=height) for word in words ] diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py index a2a07bcc..3b65b392 100644 --- a/dedoc/scripts/test_words_bbox_extraction.py +++ b/dedoc/scripts/test_words_bbox_extraction.py @@ -135,6 +135,10 @@ def test_pdfminer_document(self): structure = result["content"]["structure"] word_annotations = self.__get_words_annotation(structure) image = np.asarray(get_page_image(self._get_abs_path(file_name), 0)) + ann = word_annotations[0] + if ann is not None: + bbox = json.loads(ann.bbox) + image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC) image = self.__draw_word_annotations(image, word_annotations) cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)