Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/add_tabby_cells_bbox' into add_tabby_cells_bbox
Browse files: browse the repository at this point in the history

# Conflicts:
#	dedoc/data_structures/line_with_meta.py
#	dedoc/readers/pdf_reader/data_classes/tables/cell.py
#	dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
#	dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
#	dedoc/scripts/test_words_bbox_extraction.py
  • Loading branch information
sunveil committed Oct 11, 2023
2 parents 4c7e5eb + f3779fe commit 7922b18
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 73 deletions.
3 changes: 0 additions & 3 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta":

return common_line

def __lt__(self, other: "LineWithMeta") -> bool:
return self.line < other.line

def split(self, sep: str) -> List["LineWithMeta"]:
"""
Split this line into a list of lines, keep annotations consistent.
Expand Down
19 changes: 0 additions & 19 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from dedocutils.data_structures import BBox

from dedoc.data_structures import BBoxAnnotation
from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.line_with_meta import LineWithMeta

Expand Down Expand Up @@ -71,24 +70,6 @@ def get_text(self) -> str:
def get_annotations(self) -> List[Annotation]:
return LineWithMeta.join(self.lines, delimiter="\n").annotations

def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
for i_line, _ in enumerate(self.lines):
for i_ann, annotation in enumerate(self.lines[i_line].annotations):
if annotation.name != "bounding box":
continue

bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
k_w = new_page_width / page_width
k_h = new_page_height / page_height
new_bbox = BBox(x_top_left=int(bbox.x_top_left * k_w), y_top_left=int(bbox.y_top_left * k_h),
width=int(bbox.width * k_w), height=int(bbox.height * k_h))

self.lines[i_line].annotations[i_ann] = BBoxAnnotation(start=annotation.start,
end=annotation.end,
value=new_bbox,
page_width=new_page_width,
page_height=new_page_height)

def __repr__(self) -> str:
return self.__str__()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,24 +63,12 @@ def _process_one_page(self,
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)

if self.config.get("labeling_mode"):
save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))

return lines, tables, page.attachments, []

def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
"""
Change table boxes's width height into pdf space like textual lines
"""

for table in tables:
for row in table.matrix_cells:

for cell in row:
cell.change_lines_boxes_page_width_height(new_page_width=pdf_width, new_page_height=pdf_height)

def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool:
"""
Check obj_bbox inside some unreadable blocks or not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithB

attachments = images if len(images) < 10 else []

return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments, pdf_page_height=height, pdf_page_width=width)
return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments)

def __extract_image(self,
directory: str,
Expand Down Expand Up @@ -186,7 +186,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl
# duplicated previous style
chars_with_style.append(chars_with_style[-1])

annotations = self.__extract_words_bbox_annotation(lobj, height, width)
annotations = self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)
# 3 - extract range from chars_with_style array
char_pointer = 0

Expand All @@ -197,7 +197,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl

return annotations

def __extract_words_bbox_annotation(self, lobj: LTTextContainer, height: int, width: int) -> List[Annotation]:
def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]:
words: List[WordObj] = []
word: WordObj = WordObj(start=0, end=0, value=LTTextContainer())
if isinstance(lobj, LTTextLineHorizontal):
Expand All @@ -216,7 +216,7 @@ def __extract_words_bbox_annotation(self, lobj: LTTextContainer, height: int, wi
annotations = [
BBoxAnnotation(start=word.start,
end=word.end,
value=create_bbox(height=height, k_h=1.0, k_w=1.0, lobj=word.value),
value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value),
page_width=width,
page_height=height) for word in words
]
Expand Down
66 changes: 31 additions & 35 deletions dedoc/scripts/test_words_bbox_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,36 +127,35 @@ def __draw_word_annotations(self, image: np.ndarray, word_annotations: List[Bbox
cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 255), thickness)
return image

def __draw_tables_words(self, tables: List[dict], image: np.ndarray) -> np.ndarray:
for table in tables:
table_angle = table["metadata"]["rotated_angle"]

word_annotations = self.__get_words_annotation_from_cell(table)
image = self.__draw_word_annotations(image, word_annotations, angle=table_angle)
return image

def test_pdf_documents(self):
filename_parameters_outputdir = [
["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="true"), "pdfminer_reader"],
["pdf_with_text_layer/english_doc.pdf", dict(pdf_with_text_layer="tabby"), "tabby_reader"]
]
def test_pdfminer_document(self):
output_path = os.path.join(self.output_path, "pdfminer_reader")
os.makedirs(output_path, exist_ok=True)
file_name = "pdf_with_text_layer/english_doc.pdf"
result = self._send_request(file_name, data=dict(pdf_with_text_layer="true"))
structure = result["content"]["structure"]
word_annotations = self.__get_words_annotation(structure)
image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
image = self.__draw_word_annotations(image, word_annotations)
cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)

def test_tabby_document(self):
output_path = os.path.join(self.output_path, "tabby_reader")
os.makedirs(output_path, exist_ok=True)
file_name = "pdf_with_text_layer/english_doc.pdf"
result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby"))
structure = result["content"]["structure"]
image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
word_annotations = self.__get_words_annotation(structure)
ann = word_annotations[0]
if ann is not None:
bbox = json.loads(ann.bbox)
image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)

for file_name, parameters, outputdir in filename_parameters_outputdir:
output_path = os.path.join(self.output_path, outputdir)
os.makedirs(output_path, exist_ok=True)
result = self._send_request(file_name, data=parameters)
structure = result["content"]["structure"]
word_annotations = self.__get_words_annotation(structure)
image = np.asarray(get_page_image(self._get_abs_path(file_name), 0))
ann = word_annotations[0]
if ann is not None:
bbox = json.loads(ann.bbox)
image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)
image = self.__draw_word_annotations(image, word_annotations)
tables = result["content"]["tables"]
if len(tables) > 0:
image = self.__draw_tables_words(tables, image)
cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)
image = self.__draw_word_annotations(image, word_annotations)
table0 = result["content"]["tables"][0]
word_annotations = self.__get_words_annotation_from_cell(table0)
image = self.__draw_word_annotations(image, word_annotations, angle=0)
cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)

def test_table_word_extraction(self):
output_path = os.path.join(self.output_path, 'tables')
Expand All @@ -169,13 +168,13 @@ def test_table_word_extraction(self):
result = self._send_request(file_name, data=dict())
table0 = result["content"]["tables"][0]
page_angle = result["metadata"]["other_fields"]["rotated_page_angles"][0]
table_angle = table0["metadata"]["rotated_angle"]

word_annotations = self.__get_words_annotation_from_cell(table0)
image = cv2.imread(self._get_abs_path(file_name))
image = rotate_image(image, page_angle)
tables = result["content"]["tables"]
if len(tables) > 0:
image = self.__draw_tables_words(tables, image)

image = self.__draw_word_annotations(image, word_annotations, angle=table_angle)
cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image)

def test_document_image_reader(self) -> None:
Expand All @@ -192,7 +191,4 @@ def test_document_image_reader(self) -> None:
image = cv2.imread(self._get_abs_path(filename))
image = rotate_image(image, result["metadata"]["other_fields"].get("rotated_page_angles", [0.])[0])
image = self.__draw_word_annotations(image, word_annotations)
tables = result["content"]["tables"]
if len(tables) > 0:
image = self.__draw_tables_words(tables, image)
cv2.imwrite(os.path.join(output_path, filename.split("/")[-1]), image)

0 comments on commit 7922b18

Please sign in to comment.