Skip to content

Commit

Permalink
ESL-155 Add table bbox annotations to tabby reader (#354)
Browse files: browse the repository at this point in the history
* Add table bbox annotations to tabby reader

* Fix tests

* Review fixes

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
  • Loading branch information
sunveil and NastyBoget authored Oct 12, 2023
1 parent ff26829 commit 3d7f22a
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 45 deletions.
72 changes: 32 additions & 40 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -131,11 +131,11 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
tables = []
tables_on_image = []
page_number = page["number"]
page_width = int(page["width"])
page_height = int(page["height"])

for table in page["tables"]:
x_top_left = table["x_top_left"]
y_top_left = table["y_top_left"]
x_bottom_right = x_top_left + table["width"]
y_bottom_right = y_top_left + table["height"]
table_bbox = BBox(x_top_left=table["x_top_left"], y_top_left=table["y_top_left"], width=table["width"], height=table["height"])
order = table["order"] # TODO add table order into TableMetadata
rows = table["rows"]
cell_properties = table["cell_properties"]
Expand All @@ -144,15 +144,24 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
result_cells = []
for num_row, row in enumerate(rows):
assert len(row) == len(cell_properties[num_row])
result_row = []
for num_col, cell_text in enumerate(row):
result_row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=page_number, line_id=0))],
colspan=cell_properties[num_row][num_col]["col_span"],
rowspan=cell_properties[num_row][num_col]["row_span"],
invisible=bool(cell_properties[num_row][num_col]["invisible"])))

result_row = []
for num_col, cell in enumerate(row):
annotations = []
cell_blocks = cell["cell_blocks"]

for c in cell_blocks:
cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"]))
annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height))

result_row.append(CellWithMeta(
lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)],
colspan=cell_properties[num_row][num_col]["col_span"],
rowspan=cell_properties[num_row][num_col]["row_span"],
invisible=bool(cell_properties[num_row][num_col]["invisible"])
))
result_cells.append(result_row)
table_bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) # noqa TODO add table location into TableMetadata

table_name = str(uuid.uuid4())
tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, uid=table_name)))
tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order))
Expand All @@ -161,55 +170,38 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:

def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
page_number = page["number"]
page_width = int(page["width"])
page_height = int(page["height"])
page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
prev_line = None

for block in page["blocks"]:
annotations = []
order = block["order"]
block_text = block["text"]
bx_top_left = int(block["x_top_left"])
by_top_left = int(block["y_top_left"])
bx_bottom_right = bx_top_left + int(block["width"])
by_bottom_right = by_top_left + int(block["height"])
indent = block["indent"]
spacing = block["spacing"]
len_block = len(block_text)
annotations.append(IndentationAnnotation(0, len_block, str(indent)))
annotations.append(SpacingAnnotation(0, len_block, str(spacing)))
annotations.append(IndentationAnnotation(0, len_block, str(block["indent"])))
annotations.append(SpacingAnnotation(0, len_block, str(block["spacing"])))

for annotation in block["annotations"]:
is_bold = annotation["is_bold"]
is_italic = annotation["is_italic"]
font_name = annotation["font_name"]
font_size = annotation["font_size"]
link = annotation["metadata"]
url = annotation["url"]
start = annotation["start"]
end = annotation["end"]
x_top_left = int(annotation["x_top_left"])
y_top_left = int(annotation["y_top_left"])
x_bottom_right = bx_top_left + int(annotation["width"])
y_bottom_right = by_top_left + int(annotation["height"])
box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
box = BBox(x_top_left=int(annotation["x_top_left"]), y_top_left=int(annotation["y_top_left"]),
width=int(annotation["width"]), height=int(annotation["height"]))
annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))
annotations.append(SizeAnnotation(start, end, str(font_size)))
annotations.append(StyleAnnotation(start, end, font_name))
annotations.append(SizeAnnotation(start, end, str(annotation["font_size"])))
annotations.append(StyleAnnotation(start, end, annotation["font_name"]))

if is_bold:
if annotation["is_bold"]:
annotations.append(BoldAnnotation(start, end, "True"))

if is_italic:
if annotation["is_italic"]:
annotations.append(ItalicAnnotation(start, end, "True"))

if link == "LINK":
annotations.append(LinkedTextAnnotation(start, end, url))
if annotation["metadata"] == "LINK":
annotations.append(LinkedTextAnnotation(start, end, annotation["url"]))

meta = block["metadata"].lower()
uid = f"txt_{file_hash}_{order}"
bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right))
bbox = BBox(x_top_left=int(block["x_top_left"]), y_top_left=int(block["y_top_left"]), width=int(block["width"]), height=int(block["height"]))
annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))

metadata = LineMetadata(page_id=page_number, line_id=order)
Expand Down
Binary file not shown.
1 change: 0 additions & 1 deletion dedoc/scripts/test_words_bbox_extraction.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -167,7 +167,6 @@ def test_table_word_extraction(self):

for file_name in file_names:
result = self._send_request(file_name, data=dict())
table0 = result["content"]["tables"][0]
page_angle = result["metadata"]["other_fields"]["rotated_page_angles"][0]

image = cv2.imread(self._get_abs_path(file_name))
Expand Down
8 changes: 4 additions & 4 deletions tests/api_tests/test_api_format_pdf_tabby_reader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -180,9 +180,9 @@ def test_pdf_with_tables(self) -> None:

table = tables[3]["cells"]
self.assertListEqual(["", "2016", "2017", "2018", "2019"], self._get_text_of_row(table[0]))
self.assertListEqual(["", "Прогноз", "Прогноз бюджета", "Прогноз бюджета", "Прогноз бюджета"], self._get_text_of_row(table[1]))
self.assertListEqual(["Ненефтегазов\nые доходы", "10,4", "9,6", "9,6", "9,6"], self._get_text_of_row(table[19]))
self.assertListEqual(["Сальдо\nбюджета", "-3,7", "-3,2", "-2,2", "-1,2"], self._get_text_of_row(table[20]))
self.assertListEqual(["", "Прогноз", "Прогноз бюджета"], self._get_text_of_row(table[1]))
self.assertListEqual(["Ненефтегазов\nые доходы", "10,4", "9,6", "9,6", "9,6"], self._get_text_of_row(table[21]))
self.assertListEqual(["Сальдо\nбюджета", "-3,7", "-3,2", "-2,2", "-1,2"], self._get_text_of_row(table[22]))

tree = content["structure"]
self._check_tree_sanity(tree)
Expand Down Expand Up @@ -225,7 +225,7 @@ def test_tables_with_merged_cells(self) -> None:
result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby"))
table = result["content"]["tables"][0]["cells"]

hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 5), 5]]
hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 1), 5]]

for (i, j), k in hidden_cells_big_table_with_colspan:
self.assertFalse(table[i][j]["invisible"])
Expand Down

0 comments on commit 3d7f22a

Please sign in to comment.