Skip to content

Commit

Permalink
ESL-156 fix pdfminer boxes output
Browse files: browse the repository at this point in the history
  • Loading branch information
oksidgy committed Oct 10, 2023
1 parent 6c17385 commit 48599dd
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl
# duplicated previous style
chars_with_style.append(chars_with_style[-1])

- annotations = self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)
+ annotations = self.__extract_words_bbox_annotation(lobj, height, width)
# 3 - extract range from chars_with_style array
char_pointer = 0

Expand All @@ -197,7 +197,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl

return annotations

- def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]:
+ def __extract_words_bbox_annotation(self, lobj: LTTextContainer, height: int, width: int) -> List[Annotation]:
words: List[WordObj] = []
word: WordObj = WordObj(start=0, end=0, value=LTTextContainer())
if isinstance(lobj, LTTextLineHorizontal):
Expand All @@ -216,7 +216,7 @@ def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h
annotations = [
BBoxAnnotation(start=word.start,
end=word.end,
- value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value),
+ value=create_bbox(height=height, k_h=1.0, k_w=1.0, lobj=word.value),
page_width=width,
page_height=height) for word in words
]
Expand Down
4 changes: 4 additions & 0 deletions dedoc/scripts/test_words_bbox_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ def test_pdfminer_document(self):
structure = result["content"]["structure"]
word_annotations = self.__get_words_annotation(structure)
image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
+ ann = word_annotations[0]
+ if ann is not None:
+ bbox = json.loads(ann.bbox)
+ image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)
image = self.__draw_word_annotations(image, word_annotations)
cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)

Expand Down

0 comments on commit 48599dd

Please sign in to comment.