Skip to content

Commit

Permalink
ESL-137 fixed after review; removing some unused functions
Browse files Browse the repository at this point in the history
- fixed after review
- removing some unused functions
  • Loading branch information
oksidgy committed Sep 22, 2023
1 parent fc71c10 commit 156586a
Show file tree
Hide file tree
Showing 13 changed files with 29 additions and 134 deletions.
6 changes: 3 additions & 3 deletions dedoc/data_structures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .annotation import Annotation
from .attached_file import AttachedFile
from .bbox import BBox
from .cell_with_meta import CellWithMeta
from .concrete_annotations import *
from .document_content import DocumentContent
from .document_metadata import DocumentMetadata
Expand All @@ -16,6 +17,5 @@
from .tree_node import TreeNode
from .unstructured_document import UnstructuredDocument

__all__ = (['Annotation', 'AttachedFile', 'BBox', 'CellProperty', 'DocumentContent', 'DocumentMetadata', 'HierarchyLevel', 'LineMetadata',
'LineWithMeta', 'ParsedDocument', 'Serializable', 'Table', 'TableMetadata', 'TreeNode', 'UnstructuredDocument']
+ annotations.__all__)
__all__ = ['Annotation', 'AttachedFile', 'BBox', 'DocumentContent', 'DocumentMetadata', 'HierarchyLevel', 'LineMetadata',
'LineWithMeta', 'ParsedDocument', 'Serializable', 'Table', 'TableMetadata', 'CellWithMeta', 'TreeNode', 'UnstructuredDocument', *annotations.__all__]
24 changes: 10 additions & 14 deletions dedoc/data_structures/bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,19 @@ def y_bottom_right(self) -> int:
def crop_image_by_box(image: np.ndarray, bbox: "BBox") -> np.ndarray:
return image[bbox.y_top_left:bbox.y_bottom_right, bbox.x_top_left:bbox.x_bottom_right]

@staticmethod
def rotate_coordinates(bbox: "BBox", angle_rotate: float, image_shape: Tuple[int]) -> "BBox":
xb, yb = bbox.x_top_left, bbox.y_top_left
# TODO check!!! was xe, ye = self.bbox.x_begin + self.bbox.width, self.bbox.x_begin + self.bbox.height
xe, ye = bbox.x_bottom_right, bbox.y_bottom_right # self.bbox.x_top_left + self.bbox.height
def rotate_coordinates(self, angle_rotate: float, image_shape: Tuple[int]) -> None:
xb, yb = self.x_top_left, self.y_top_left
xe, ye = self.x_bottom_right, self.y_bottom_right
rad = angle_rotate * math.pi / 180

xc = image_shape[1] // 2
yc = image_shape[0] // 2

bbox_xb = min((int(float(xb - xc) * math.cos(rad) - float(yb - yc) * math.sin(rad)) + xc), image_shape[1])
bbox_yb = min((int(float(yb - yc) * math.cos(rad) + float(xb - xc) * math.sin(rad)) + yc), image_shape[0])
bbox_xe = min((int(float(xe - xc) * math.cos(rad) - float(ye - yc) * math.sin(rad)) + xc), image_shape[1])
bbox_ye = min((int(float(ye - yc) * math.cos(rad) + float(xe - xc) * math.sin(rad)) + yc), image_shape[0])
bbox_new = BBox(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb)
xc = image_shape[1] / 2
yc = image_shape[0] / 2

return bbox_new
bbox_xb = min((int(float(xb - xc) * math.cos(rad) - float(yb - yc) * math.sin(rad) + xc)), image_shape[1])
bbox_yb = min((int(float(yb - yc) * math.cos(rad) + float(xb - xc) * math.sin(rad) + yc)), image_shape[0])
bbox_xe = min((int(float(xe - xc) * math.cos(rad) - float(ye - yc) * math.sin(rad) + xc)), image_shape[1])
bbox_ye = min((int(float(ye - yc) * math.cos(rad) + float(xe - xc) * math.sin(rad) + yc)), image_shape[0])
self.__init__(bbox_xb, bbox_yb, bbox_xe - bbox_xb, bbox_ye - bbox_yb)

def __str__(self) -> str:
return f"BBox(x = {self.x_top_left} y = {self.y_top_left}, w = {self.width}, h = {self.height})"
Expand Down
8 changes: 1 addition & 7 deletions dedoc/data_structures/cell_property.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,25 @@
from collections import OrderedDict
from typing import List

import numpy as np
from flask_restx import Api, Model, fields

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.serializable import Serializable


class CellProperty(Serializable):
"""
This class holds information about the table cell.
"""
def __init__(self, colspan: int, rowspan: int, invisible: bool, annotations: List[Annotation] = []) -> None: # noqa
def __init__(self, colspan: int, rowspan: int, invisible: bool) -> None:
"""
:param colspan: number of columns united by the cell
:param rowspan: number of rows united by the cell
:param invisible: flag for cell display (if True, the cell should not be displayed)
"""
self.colspan = colspan
self.rowspan = rowspan
self.invisible = invisible
self.annotations = annotations

def to_dict(self) -> dict:
res = OrderedDict()
res["annotations"] = [annotation.to_dict() for annotation in self.annotations]
res["colspan"] = int(np.int8(self.colspan)) if self.colspan else None
res["rowspan"] = int(np.int8(self.rowspan)) if self.colspan else None
res["invisible"] = self.invisible
Expand All @@ -35,6 +31,4 @@ def get_api_dict(api: Api) -> Model:
"colspan": fields.Integer(description="attribute of union column count"),
"rowspan": fields.Integer(description="attribute of union row count"),
"invisible": fields.Boolean(description='flag for cell display (for example: if invisible==true then style="display: none")'),
"annotations": fields.List(
fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
})
4 changes: 1 addition & 3 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
tables = []
for scan_table in scan_tables:
metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name, rotated_angle=scan_table.location.rotated_angle)
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row]
for row in scan_table.matrix_cells]

cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in scan_table.matrix_cells]
table = Table(metadata=metadata, cells=cells_with_meta)
tables.append(table)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,6 @@ def __set_indentations(self, page: PageWithBBox) -> PageWithBBox:

return page

def __get_line_metadata(self, bbox: TextWithBBox, page_with_lines: PageWithBBox) -> LineMetadata:
return LineMetadata(page_id=page_with_lines.page_num, line_id=bbox.line_num)

def __get_font_size(self, bbox: TextWithBBox, image_height: int) -> int:
"""
Determines the font size from the bbox size; returns the font size in typographic points
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,10 @@ def __init__(self, *, config: dict) -> None:
self.logger = config.get("logger", logging.getLogger())

def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"], language: str) -> List[List[LineWithMeta]]: # noqa
# try:
# if len(img_cells) == 0:
# return []
for node in tree_nodes:
node.set_crop_text_box(page_image)
# img_cells_cropped = map(crop_image_text, img_cells)
# ids, images = zip(*sorted(enumerate(img_cells_cropped), key=lambda t: -t[1].shape[1]))
tree_nodes.sort(key=lambda t: -t.crop_text_box.width) # TODO check

tree_nodes.sort(key=lambda t: -t.crop_text_box.width)
originalbox_to_fastocrbox = {}
batches = list(self.__nodes2batch(tree_nodes))
for num_batch, nodes_batch in enumerate(batches):
Expand Down Expand Up @@ -66,8 +62,7 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],

return self.__create_lines_with_meta(tree_nodes, originalbox_to_fastocrbox, page_image)

def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> ( # noqa
Tuple)[OcrPage, List[BBox]]:
def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> Tuple[OcrPage, List[BBox]]: # noqa
concatenated, chunk_boxes = self.__concat_images(src_image=src_image, tree_table_nodes=tree_table_nodes)
if self.config.get("debug_mode", False):
image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", f"stacked_batch_image_{num_batch}.png")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
import concurrent.futures
from collections import namedtuple
from typing import Iterable, Iterator, List
from typing import Iterable, List

import numpy as np

from dedoc.data_structures.bbox import BBox
from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox
from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_document_page, \
get_text_with_bbox_from_document_page_one_column

BBoxLevel = namedtuple("BBoxLevel", ["text_line", "some_word"])
bbox_level = BBoxLevel(4, 5)
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_document_page, get_text_with_bbox_from_document_page_one_column


class OCRLineExtractor:
Expand All @@ -30,43 +23,6 @@ def split_image2lines(self,
if len(filtered_bboxes) >= 0:
new_parsed_doc = PageWithBBox(page_num=page_num, bboxes=filtered_bboxes, image=image)
return new_parsed_doc
"""
def split_imagecell2lines(self,
cell_image: np.ndarray,
page_num: int,
page_height: int,
page_width: int,
language: str = "rus+eng",) -> PageWithBBox:
bboxes = self.__split_image2bboxes_from_cell(cell_image=cell_image, page_num=page_num, language=language, page_height=page_height,
page_width=page_width)
filtered_bboxes = list(self._filtered_bboxes(bboxes))
if len(filtered_bboxes) >= 0:
new_parsed_cell = PageWithBBox(page_num=page_num, bboxes=filtered_bboxes, image=cell_image)
return new_parsed_cell"""

def split_images2lines(self, images: Iterator[np.ndarray], language: str = "rus+eng") -> List[PageWithBBox]:
input_data = ((page, image, language) for page, image in enumerate(images))
with concurrent.futures.ProcessPoolExecutor(max_workers=self.config["n_jobs"]) as executor:
documents = executor.map(self._parse_one_image, input_data)

return [doc for doc in documents if doc is not None]

def _parse_one_image(self, args: List) -> PageWithBBox:
page_num, image, language = args
bboxes = self.__split_image2bboxes(image=image, page_num=page_num, language=language, is_one_column_document=True)
if len(bboxes) > 0:
new_parsed_doc = PageWithBBox(page_num=page_num, bboxes=bboxes, image=image)
return new_parsed_doc

@staticmethod
def _is_box_in(box1: BBox, box2: BBox) -> bool:
"""
check if box1 is in box2
"""
x_inside = (box1.x_top_left >= box2.x_top_left) and (box1.x_bottom_right <= box2.x_bottom_right)
y_inside = (box1.y_top_left >= box2.y_top_left) and (box1.y_bottom_right <= box2.y_bottom_right)
return x_inside and y_inside

def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str, is_one_column_document: bool) -> List[TextWithBBox]:
ocr_conf_threshold = self.config.get("ocr_conf_threshold", -1)
Expand All @@ -81,14 +37,6 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str,

return line_boxes

"""def __split_image2bboxes_from_cell(self, cell_image: np.ndarray, page_num: int, language: str, page_height: int, page_width: int) -> List[TextWithBBox]
output_dict = get_text_with_bbox_from_cells(cell_image, language, ocr_conf_threshold=0.0)
line_boxes = [
TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(page_width, page_height))
for line_num, line in enumerate(output_dict.lines)]
return line_boxes"""

def _filtered_bboxes(self, bboxes: List[TextWithBBox]) -> Iterable[TextWithBBox]:
for text_with_bbox in bboxes:
bbox = text_with_bbox.bbox
Expand Down
16 changes: 0 additions & 16 deletions dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,6 @@

from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage

"""
def get_cell_text_by_ocr(img_cell: np.ndarray, language: str) -> str:
if img_cell.shape[0] == 0 or img_cell.shape[1] == 0:
return ""
text = get_text_from_table_cell(img_cell, language=language)
return text
def get_text_from_table_cell(image: np.ndarray, language: str) -> str:
config = "--psm 6"
text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)["text"]
return text
"""


def get_text_with_bbox_from_document_page_one_column(image: np.ndarray, language: str, ocr_conf_threshold: float) -> OcrPage:
"""
Expand Down
13 changes: 0 additions & 13 deletions dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import logging
from typing import Iterator, List

import cv2
import numpy as np
from joblib import Parallel, delayed

from dedoc.utils.image_utils import rotate_image
from dedoc.utils.utils import get_batch


class ScanRotator:
Expand Down Expand Up @@ -48,13 +45,3 @@ def auto_rotate(self, image: np.ndarray, orientation_angle: int = 0) -> (np.ndar
if self.config.get("debug_mode"):
self.logger.debug(f"Best angle: {best_angle}, orientation angle: {orientation_angle}")
return rotated, best_angle + orientation_angle

def rotate(self, images: List[np.ndarray]) -> Iterator[np.ndarray]:
"""
automatic rotation of list of images
"""
n_jobs = self.config["n_jobs"]
for batch in get_batch(size=n_jobs, iterable=images):
rotated_ = Parallel(n_jobs=n_jobs)(delayed(self.auto_rotate)(img) for img in batch)
for res, _ in rotated_:
yield res
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,14 @@

import numpy as np

from dedoc.data_structures import BBox
from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import \
TableAttributeExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours


Expand Down Expand Up @@ -61,7 +59,7 @@ def extract_onepage_tables_from_image(self,

for matrix in tables:
for location in matrix.locations:
location.bbox = BBox.rotate_coordinates(bbox=location.bbox, angle_rotate=-angle_rotate, image_shape=image.shape)
location.bbox.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape)
location.rotated_angle = angle_rotate

tables = self.__select_attributes_matrix_tables(tables=tables)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import math
import os
import subprocess
from collections import namedtuple
from typing import List, Optional, Tuple

import numpy as np
Expand Down Expand Up @@ -36,8 +35,6 @@
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.utils import calculate_file_hash

CellPropertyInfo = namedtuple("NamedTuple", "colspan, rowspan, invisible")


class PdfTabbyReader(PdfBaseReader):
"""
Expand Down
1 change: 0 additions & 1 deletion dedoc/readers/reader_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def parse_file(self, tmp_dir: str, filename: str, parameters: Dict[str, str]) ->
if can_read:
unstructured_document = reader.read(path=file_path, document_type=document_type, parameters=parameters)
assert len(unstructured_document.lines) == 0 or isinstance(unstructured_document.lines[0], LineWithMeta)
assert isinstance(unstructured_document, UnstructuredDocument) # TODO remove
return unstructured_document

raise BadFileFormatError(
Expand Down
12 changes: 7 additions & 5 deletions dedoc/scripts/test_words_bbox_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,18 +113,20 @@ def rotate_coordinate(x: int, y: int, xc: float, yc: float, angle: float) -> Tup
return x_rotated, y_rotated

@staticmethod
def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType], angle: float) -> np.ndarray:
def draw_word_annotations(image: np.ndarray, word_annotations: List[BboxWithConfsType], angle: float = 0.) -> np.ndarray:

font_scale, thickness = TestWordExtraction.normalize_font_thickness(image)
x_c = image.shape[1] / 2
y_c = image.shape[0] / 2

for ann in word_annotations:
bbox = json.loads(ann.bbox)
p1 = (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"]))
p2 = (int((bbox["x_top_left"] + bbox["width"]) * bbox["page_width"]), int((bbox["y_top_left"] + bbox["height"]) * bbox["page_height"]))
x_c = image.shape[1] / 2
y_c = image.shape[0] / 2
p1 = TestWordExtraction.rotate_coordinate(p1[0], p1[1], x_c, y_c, angle)
p2 = TestWordExtraction.rotate_coordinate(p2[0], p2[1], x_c, y_c, angle)

if angle == 0.0:
p1 = TestWordExtraction.rotate_coordinate(p1[0], p1[1], x_c, y_c, angle)
p2 = TestWordExtraction.rotate_coordinate(p2[0], p2[1], x_c, y_c, angle)

cv2.rectangle(image, p1, p2, (0, 255, 0) if ann.text_type == "typewritten" else (255, 0, 0))
text = ",".join(ann.confs) if ann.confs != [] else "None"
Expand Down

0 comments on commit 156586a

Please sign in to comment.