Skip to content

Commit

Permalink
TLDR-471 added angle rotation from PdfImageReader and Tables
Browse files Browse the repository at this point in the history
  • Loading branch information
oksidgy committed Sep 22, 2023
1 parent b5253b2 commit 459c065
Show file tree
Hide file tree
Showing 10 changed files with 84 additions and 50 deletions.
19 changes: 19 additions & 0 deletions dedoc/data_structures/bbox.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import math
from collections import OrderedDict
from typing import Dict, Tuple

Expand Down Expand Up @@ -51,6 +52,24 @@ def y_bottom_right(self) -> int:
def crop_image_by_box(image: np.ndarray, bbox: "BBox") -> np.ndarray:
    """
    Cut the region described by the bounding box out of the image.

    :param image: image array indexed as [row (y), column (x)]
    :param bbox: box whose top-left and bottom-right corners delimit the region
    :return: the part of the image between the two corners of the box
    """
    rows = slice(bbox.y_top_left, bbox.y_bottom_right)
    columns = slice(bbox.x_top_left, bbox.x_bottom_right)
    return image[rows, columns]

@staticmethod
def rotate_coordinates(bbox: "BBox", angle_rotate: float, image_shape: Tuple[int, ...]) -> "BBox":
    """
    Rotate the top-left and bottom-right corners of a bounding box around the image center.

    :param bbox: bounding box whose corners are rotated
    :param angle_rotate: rotation angle in degrees
    :param image_shape: shape of the page image, index 0 is read as the height and index 1 as the width
    :return: new BBox built from the two rotated corners, every coordinate is limited to the range [0, image size]
    """
    rad = angle_rotate * math.pi / 180
    cos_a, sin_a = math.cos(rad), math.sin(rad)

    height, width = image_shape[0], image_shape[1]
    xc = width // 2
    yc = height // 2

    def rotate_point(x: int, y: int) -> Tuple[int, int]:
        x_new = int(float(x - xc) * cos_a - float(y - yc) * sin_a) + xc
        y_new = int(float(y - yc) * cos_a + float(x - xc) * sin_a) + yc
        # pixel coordinates have to stay inside the image on both sides: a corner rotated past the left/top edge
        # would otherwise become negative
        return max(0, min(x_new, width)), max(0, min(y_new, height))

    x_begin, y_begin = rotate_point(bbox.x_top_left, bbox.y_top_left)
    x_end, y_end = rotate_point(bbox.x_bottom_right, bbox.y_bottom_right)

    # NOTE(review): only two corners are rotated, so for large angles the resulting width/height may be negative;
    # presumably the method is meant for small skew angles - confirm against the callers
    return BBox(x_begin, y_begin, x_end - x_begin, y_end - y_begin)

def __str__(self) -> str:
    """
    Build a short human-readable description of the box: top-left corner (x, y), width (w) and height (h).
    """
    return f"BBox(x = {self.x_top_left} y = {self.y_top_left}, w = {self.width}, h = {self.height})"

Expand Down
7 changes: 5 additions & 2 deletions dedoc/data_structures/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class TableMetadata(Serializable):
"""
This class holds the information about the table location in the document and information about cell properties.
"""
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False) -> None:
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False, rotated_angle: float = 0.0) -> None:
"""
:param page_id: number of the page where table starts
:param uid: unique identifier of the table
Expand All @@ -21,18 +21,21 @@ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserte
self.page_id = page_id
self.uid = str(uuid.uuid1()) if not uid else uid
self.is_inserted = is_inserted
self.rotated_angle = rotated_angle

def to_dict(self) -> dict:
    """
    Serialize the table metadata into an ordered dictionary.

    :return: OrderedDict with the keys "uid", "page_id", "is_inserted" and "rotated_angle" (in this order)
    """
    field_names = ("uid", "page_id", "is_inserted", "rotated_angle")
    return OrderedDict((field_name, getattr(self, field_name)) for field_name in field_names)

@staticmethod
def get_api_dict(api: Api) -> Model:
return api.model("TableMetadata", {
"page_id": fields.Integer(readonly=False, description="table start page number"),
"uid": fields.String(description="table unique id"),
"is_inserted": fields.Boolean(description="was the table inserted into document body")
"is_inserted": fields.Boolean(description="was the table inserted into document body"),
"rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes?")
})
21 changes: 4 additions & 17 deletions dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,24 @@
import math
from collections import OrderedDict
from functools import total_ordering
from typing import Any, Dict, Tuple
from typing import Any, Dict

from dedoc.data_structures.bbox import BBox


@total_ordering
class Location:
def __init__(self, page_number: int, bbox: BBox, name: str = "") -> None:
def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0) -> None:
self.page_number = page_number
self.bbox = bbox
self.name = name

def rotate_coordinates(self, angle_rotate: float, image_shape: Tuple[int]) -> None:
xb, yb = self.bbox.x_top_left, self.bbox.y_top_left
# TODO check!!! was xe, ye = self.bbox.x_begin + self.bbox.width, self.bbox.x_begin + self.bbox.height
xe, ye = self.bbox.x_bottom_right, self.bbox.y_bottom_right # self.bbox.x_top_left + self.bbox.height
rad = angle_rotate * math.pi / 180

bbox_xb = min((int(float(xb) * math.cos(rad) - float(yb) * math.sin(rad))), image_shape[1])
bbox_yb = min((int(float(yb) * math.cos(rad) + float(xb) * math.sin(rad))), image_shape[0])
bbox_xe = min((int(float(xe) * math.cos(rad) - float(ye) * math.sin(rad))), image_shape[1])
bbox_ye = min((int(float(ye) * math.cos(rad) + float(xe) * math.sin(rad))), image_shape[0])
bbox_new = BBox(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb)

self.bbox = bbox_new
self.rotated_angle = rotated_angle

def to_dict(self) -> Dict[str, Any]:
    """
    Serialize the location into an ordered dictionary.

    :return: OrderedDict with the keys "page_number", "bbox", "name" and "rotated_angle" (in this order)
    """
    return OrderedDict([
        ("page_number", self.page_number),
        ("bbox", self.bbox.to_dict()),  # serialized by the bbox itself
        ("name", self.name),
        ("rotated_angle", self.rotated_angle),
    ])

def __eq__(self, other: "Location") -> bool:
Expand Down
14 changes: 8 additions & 6 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse)
tables = []
for scan_table in scan_tables:
metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle)
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row]
for row in scan_table.matrix_cells]

Expand Down Expand Up @@ -133,14 +133,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple
metadata["last_page"] = last_page
else:
warnings = []
metadata = None
metadata = {}

if len(result) == 0:
all_lines, unref_tables, attachments = [], [], []
all_lines, unref_tables, attachments, page_angles = [], [], [], []
else:
all_lines, unref_tables, attachments = map(list, map(flatten, zip(*result)))
all_lines, unref_tables, attachments, page_angles = map(list, map(flatten, zip(*result)))
if parameters.need_header_footers_analysis:
lines = [lines for lines, _, _ in result]
lines = [lines for lines, _, _, _ in result]
lines, headers, footers = footer_header_analysis(lines)
all_lines = list(flatten(lines))
mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
Expand All @@ -152,11 +152,13 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple
prev_line = line

all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
metadata["rotated_page_angles"] = page_angles
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

@abstractmethod
def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \
-> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
-> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
pass

def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]:
Expand Down
14 changes: 8 additions & 6 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,11 @@ def _process_one_page(self,
image: np.ndarray,
parameters: ParametersForParseDoc,
page_number: int,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
# --- Step 1: correct orientation and detect column count ---
rotated_image, is_one_column_document = self._detect_column_count_and_orientation(image, parameters)
rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
if self.config.get("debug_mode"):
self.logger.info(f"Angle page rotation = {angle}")

# --- Step 2: do binarization ---
if parameters.need_binarization:
Expand Down Expand Up @@ -100,9 +102,9 @@ def _process_one_page(self,
if self.config.get("labeling_mode"):
save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))

return lines, tables, page.attachments
return lines, tables, page.attachments, [angle]

def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool]:
def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, int]:
"""
Function :
- detects the number of page columns
Expand All @@ -120,10 +122,10 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa
angle = angle if parameters.document_orientation is None else 0
self.logger.info(f"Final orientation angle = {angle}, is_one_column_document = {is_one_column_document}")

rotated_image, _ = self.scan_rotator.auto_rotate(image, angle)
rotated_image, result_angle = self.scan_rotator.auto_rotate(image, angle)
if self.config.get("debug_mode"):
img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
self.logger.info(f"Save image to {img_path}")
cv2.imwrite(img_path, rotated_image)

return rotated_image, is_one_column_document
return rotated_image, is_one_column_document, result_angle
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
union_cell[col_id].y_bottom_right = y_bottom_split

cell_image = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right])
result_row[col_id].lines = __get_ocr_lines(cell_image, language)
result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image)

col_id -= 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@

import numpy as np

from dedoc.data_structures import BBox
from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import \
TableAttributeExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours


Expand All @@ -33,7 +35,7 @@ def extract_onepage_tables_from_image(self,
page_number: int,
language: str,
orient_analysis_cells: bool,
orient_cell_angle: int,
orient_cell_angle: int, # TODO remove
table_type: str) -> List[ScanTable]:
"""
extracts tables from input image
Expand All @@ -59,17 +61,14 @@ def extract_onepage_tables_from_image(self,

for matrix in tables:
for location in matrix.locations:
location.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape)
location.bbox = BBox.rotate_coordinates(bbox=location.bbox, angle_rotate=-angle_rotate, image_shape=image.shape)
location.rotated_angle = angle_rotate

tables = self.__select_attributes_matrix_tables(tables=tables)

"""
TODO: fix in the future
if orient_analysis_cells:
tables = self.__analyze_header_cell_with_diff_orient(tables, language, orient_cell_angle)"""
return tables

""" TODO fix in the future
""" TODO fix in the future (REMOVE)
def __detect_diff_orient(self, cell_text: str) -> bool:
# 1 - разбиваем на строки длины которых состоят хотя бы из одного символа
parts = cell_text.split("\n")
Expand Down Expand Up @@ -165,7 +164,8 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type:
cur_table.matrix_cells = self.splitter.split(cells=cur_table.matrix_cells)

# Эвристика 2: таблица должна иметь больше одного столбца
if len(cur_table.matrix_cells[0]) > 1 or (self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
if len(cur_table.matrix_cells[0]) > 1 or (
self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
tables.append(cur_table)

if self.table_options.split_last_column in table_type:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def __extract(self, path: str, start_page: int = None, end_page: int = None) ->

return all_lines, all_tables, all_tables_on_images

def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[Table], List[ScanTable]]:
tables = []
tables_on_image = []
page_number = page["number"]
Expand Down Expand Up @@ -272,6 +272,6 @@ def _process_one_page(self,
image: np.ndarray,
parameters: ParametersForParseDoc,
page_number: int,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:

return [], [], []
return [], [], [], []
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def _process_one_page(self,
image: np.ndarray,
parameters: ParametersForParseDoc,
page_number: int,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
if parameters.need_pdf_table_analysis:
gray_image = self._convert_to_gray(image)
cleaned_image, tables = self.table_recognizer.recognize_tables_from_image(
Expand All @@ -59,15 +59,15 @@ def _process_one_page(self,

page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number)
if page is None:
return [], [], []
return [], [], [], []
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)

if self.config.get("labeling_mode"):
save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))

return lines, tables, page.attachments
return lines, tables, page.attachments, []

def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool:
"""
Expand Down
27 changes: 24 additions & 3 deletions dedoc/scripts/test_words_bbox_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np

from dedoc.api.dedoc_api import config
from dedoc.utils.image_utils import rotate_image
from dedoc.utils.pdf_utils import get_page_image
from tests.api_tests.abstract_api_test import AbstractTestApiDocReader

Expand Down Expand Up @@ -104,14 +105,27 @@ def normalize_font_thickness(image: np.ndarray) -> Tuple[float, int]:
return font_scale, thickness

@staticmethod
def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType]) -> np.ndarray:
def rotate_coordinate(x: int, y: int, xc: float, yc: float, angle: float) -> Tuple[int, int]:
    """
    Rotate the point (x, y) around the center (xc, yc).

    :param x: x coordinate of the point
    :param y: y coordinate of the point
    :param xc: x coordinate of the rotation center
    :param yc: y coordinate of the rotation center
    :param angle: rotation angle in degrees
    :return: rotated (x, y) coordinates truncated to integers
    """
    theta = angle * math.pi / 180
    cos_t = math.cos(theta)
    sin_t = math.sin(theta)

    # offsets of the point relative to the rotation center
    dx = float(x - xc)
    dy = float(y - yc)

    rotated = (dx * cos_t - dy * sin_t + xc, dy * cos_t + dx * sin_t + yc)
    return int(rotated[0]), int(rotated[1])

@staticmethod
def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType], angle: float) -> np.ndarray:

font_scale, thickness = TestWordExtraction.normalize_font_thickness(image)

for ann in word_annotations:
bbox = json.loads(ann.bbox)
p1 = (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"]))
p2 = (int((bbox["x_top_left"] + bbox["width"]) * bbox["page_width"]), int((bbox["y_top_left"] + bbox["height"]) * bbox["page_height"]))
x_c = image.shape[1] / 2
y_c = image.shape[0] / 2
p1 = TestWordExtraction.rotate_coordinate(p1[0], p1[1], x_c, y_c, angle)
p2 = TestWordExtraction.rotate_coordinate(p2[0], p2[1], x_c, y_c, angle)

cv2.rectangle(image, p1, p2, (0, 255, 0) if ann.text_type == "typewritten" else (255, 0, 0))
text = ",".join(ann.confs) if ann.confs != [] else "None"
cv2.putText(image, text, (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])),
Expand Down Expand Up @@ -148,12 +162,19 @@ def test_tabby_document(self):
def test_table_word_extraction(self):
    """
    Draw the word boxes of the first recognized table of every sample image and save the pictures for visual inspection.

    For each file the image is first rotated by the page angle reported in the document metadata,
    then the word boxes are drawn using the rotation angle stored in the table metadata.
    """
    output_path = self.output_path
    os.makedirs(output_path, exist_ok=True)
    # one entry per line with a trailing comma: adjacent string literals without a comma are silently
    # concatenated into a single (non-existent) file name
    file_names = [
        "tables/example_with_table5.png",
        "tables/example_with_table3.png",
        "tables/example_with_table4.jpg",
        "tables/example_with_table6.png",
        "tables/example_with_table_horizontal_union.jpg",
    ]
    for file_name in file_names:
        result = self._send_request(file_name, data=dict())
        table0 = result["content"]["tables"][0]
        page_angle = result["metadata"]["other_fields"]["rotated_page_angles"][0]
        table_angle = table0["metadata"]["rotated_angle"]

        word_annotations = TestWordExtraction.get_words_annotation_from_cell(table0)
        image = cv2.imread(self._get_abs_path(file_name))
        image = rotate_image(image, page_angle)

        image = TestWordExtraction.draw_word_annotations(image, word_annotations, angle=table_angle)
        cv2.imwrite(os.path.join(output_path, file_name), image)

0 comments on commit 459c065

Please sign in to comment.