Skip to content

Commit

Permalink
ESL-137 after review
Browse files Browse the repository at this point in the history
  • Loading branch information
oksidgy committed Sep 25, 2023
1 parent efc2820 commit 89147f0
Show file tree
Hide file tree
Showing 27 changed files with 107 additions and 195 deletions.
34 changes: 0 additions & 34 deletions dedoc/data_structures/cell_property.py

This file was deleted.

13 changes: 6 additions & 7 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ class CellWithMeta:
"""
def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
"""
:param lines: text lines (LineWithMeta) of the cell
:param colspan: The value of the rowspan attribute represents the number of columns to span. Like HTML format.
:param rowspan: The value of the rowspan attribute represents the number of rows to span. Like HTML format.
:param invisible: Display or hide cell values
"""
:param lines: text lines (LineWithMeta) of the cell
:param colspan: The value of the colspan attribute represents the number of columns to span. Like HTML format.
:param rowspan: The value of the rowspan attribute represents the number of rows to span. Like HTML format.
:param invisible: Display or hide cell values
"""
self.lines = lines
self.colspan = colspan
self.rowspan = rowspan
Expand Down Expand Up @@ -45,6 +45,5 @@ def get_api_dict(api: Api) -> Model:
"colspan": fields.Integer(description="attribute of union column count"),
"rowspan": fields.Integer(description="attribute of union row count"),
"invisible": fields.Boolean(description='flag for cell display (for example: if invisible==true then style="display: none")'),
"lines": fields.List(
fields.Nested(LineWithMeta.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
"lines": fields.List(fields.Nested(LineWithMeta.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
})
29 changes: 22 additions & 7 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from collections import OrderedDict
from typing import List, Sized, Union
from typing import List, Optional, Sized, Union
from uuid import uuid1

from flask_restx import Api, Model, fields
Expand All @@ -17,7 +17,10 @@ class LineWithMeta(Sized):
(for example, document title and raw text of the document should not be in the same line).
Still the logical part of the document may be represented by more than one line (for example, document title may consist of many lines).
"""
def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotation], uid: str = None) -> None:
def __init__(self, line: str,
metadata: Optional[LineMetadata] = None,
annotations: Optional[List[Annotation]] = None,
uid: str = None) -> None:
"""
:param line: raw text of the document line
:param metadata: metadata (related to the entire line, as line or page number, its hierarchy level)
Expand All @@ -26,13 +29,26 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
"""

self._line = line
assert isinstance(metadata, LineMetadata)
metadata = LineMetadata(page_id=0, line_id=None) if metadata is None else metadata
self._metadata = metadata
self._annotations = annotations
self._annotations = [] if annotations is None else annotations
self._uid = str(uuid1()) if uid is None else uid

def __len__(self) -> int:
return len(self.line)
return len(self._line)

@staticmethod
def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta":
    """
    Combine several lines into a single line, placing the delimiter between neighbouring lines.

    :param lines: lines to combine, in the order they should appear in the result
    :param delimiter: text that is wrapped into a LineWithMeta and added between each pair of neighbouring lines
    :return: LineWithMeta("") for an empty list, the first line itself if it is the only one,
        otherwise the accumulated sum of the lines and delimiters
    """
    if not lines:
        return LineWithMeta("")

    first, *rest = lines
    merged = first

    # NOTE(review): the result of "+=" is defined by LineWithMeta's addition operators, which are
    # not visible here - presumably text and annotations are concatenated; confirm against the class.
    for following in rest:
        merged += LineWithMeta(delimiter)
        merged += following

    return merged

def split(self, sep: str) -> List["LineWithMeta"]:
"""
Expand Down Expand Up @@ -141,6 +157,5 @@ def to_dict(self) -> dict:
def get_api_dict(api: Api) -> Model:
return api.model("LineWithMeta", {
"text": fields.String(description="line's text"),
"annotations": fields.List(
fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
"annotations": fields.List(fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")),
})
1 change: 0 additions & 1 deletion dedoc/data_structures/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def __init__(self, cells: List[List[CellWithMeta]], metadata: TableMetadata) ->
"""
:param cells: a list of lists of cells (cell has text, colspan and rowspan attributes).
:param metadata: some table metadata, as location, size and so on.
:param cells_properties: a list of lists of cells properties - each should contain attributes rowspan, colspan, invisible (for repeated cells)
"""
self.metadata = metadata
self.cells = cells
Expand Down
5 changes: 3 additions & 2 deletions dedoc/data_structures/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserte
:param page_id: number of the page where table starts
:param uid: unique identifier of the table
:param is_inserted: indicator if table was already inserted into paragraphs list
:param cell_properties: information about rowspan, colspan and invisibility of each cell
:param rotated_angle: the value of the rotation angle by which the table was rotated during recognition.
Extracted boxes from a table will need to be rotated by this angle.
"""
self.page_id = page_id
self.uid = str(uuid.uuid1()) if not uid else uid
Expand All @@ -37,5 +38,5 @@ def get_api_dict(api: Api) -> Model:
"page_id": fields.Integer(readonly=False, description="table start page number"),
"uid": fields.String(description="table unique id"),
"is_inserted": fields.Boolean(description="was the table inserted into document body"),
"rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes?")
"rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes")
})
2 changes: 1 addition & 1 deletion dedoc/readers/csv_reader/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
for row in data:
row_lines = []
for cell in row:
row_lines.append(CellWithMeta(lines=[LineWithMeta(line=cell, metadata=LineMetadata(page_id=0, line_id=line_id), annotations=[])]))
row_lines.append(CellWithMeta(lines=[LineWithMeta(line=cell, metadata=LineMetadata(page_id=0, line_id=line_id))]))
line_id += 1
cells_with_meta.append(row_lines)

Expand Down
13 changes: 13 additions & 0 deletions dedoc/readers/docx_reader/data_structures/cell_property.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@


class CellProperty:
    """
    This class holds information about the table cell.
    """
    def __init__(self, colspan: int, rowspan: int, invisible: bool) -> None:
        """
        :param colspan: number of columns the cell spans
        :param rowspan: number of rows the cell spans
        :param invisible: flag for hiding the cell value instead of displaying it
        """
        self.colspan = colspan
        self.rowspan = rowspan
        self.invisible = invisible
4 changes: 2 additions & 2 deletions dedoc/readers/docx_reader/data_structures/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from bs4 import Tag

from dedoc.data_structures import LineMetadata, LineWithMeta
from dedoc.data_structures.cell_property import CellProperty
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.table_metadata import TableMetadata
from dedoc.readers.docx_reader.data_structures.cell_property import CellProperty
from dedoc.readers.docx_reader.data_structures.run import Run
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor

Expand Down Expand Up @@ -83,7 +83,7 @@ def to_table(self) -> Table:
for num_row, row in enumerate(result_cells):
result_row = []
for num_col, cell_text in enumerate(row):
cell = CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=0, line_id=None), annotations=[])],
cell = CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=0, line_id=0))],
colspan=cell_property_list[num_row][num_col].colspan,
rowspan=cell_property_list[num_row][num_col].rowspan,
invisible=cell_property_list[num_row][num_col].invisible)
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/email_reader/email_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def __get_decoded(self, text: str) -> str:

def __get_field(self, message: Message, key: str, line_metadata: LineMetadata) -> LineWithMeta:
text = self.__get_decoded(message.get(key.lower(), ""))
return LineWithMeta(line=text, metadata=line_metadata, annotations=[])
return LineWithMeta(line=text, metadata=line_metadata)

def __get_main_fields(self, message: Message) -> List[LineWithMeta]:
lines = list()
Expand Down
4 changes: 1 addition & 3 deletions dedoc/readers/excel_reader/excel_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,7 @@ def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
row = []
for col_id in range(n_cols):
value = str(sheet.cell_value(rowx=row_id, colx=col_id))
row.append(CellWithMeta(lines=[LineWithMeta(line=value,
metadata=LineMetadata(page_id=sheet_id, line_id=None),
annotations=[])]))
row.append(CellWithMeta(lines=[LineWithMeta(line=value, metadata=LineMetadata(page_id=sheet_id, line_id=0))]))
res.append(row)
metadata = TableMetadata(page_id=sheet_id)
return Table(cells=res, metadata=metadata)
2 changes: 1 addition & 1 deletion dedoc/readers/html_reader/html_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def __get_li_header(self, list_type: str, index: int) -> LineWithMeta:
else:
header = str(index + 1) + end
metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel(2, 1, False, line_type=HierarchyLevel.list_item), page_id=0, line_id=0)
header_line = LineWithMeta(line=header, metadata=metadata, annotations=[])
header_line = LineWithMeta(line=header, metadata=metadata)
return header_line

def __read_list(self, lst: Tag, uid: str, path_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/json_reader/json_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def __handle_one_element(self, depth: int, value: Any, line_type: str, line_type

hierarchy_level = HierarchyLevel(level_1=level1, level_2=level2, can_be_multiline=False, line_type=line_type_meta)
metadata = LineMetadata(tag_hierarchy_level=hierarchy_level, page_id=0, line_id=None)
line = LineWithMeta(line=self.__get_text(value), metadata=metadata, annotations=[])
line = LineWithMeta(line=self.__get_text(value), metadata=metadata)
return line

def __is_flat(self, value: Any) -> bool: # noqa
Expand Down
3 changes: 1 addition & 2 deletions dedoc/readers/note_reader/note_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import Optional

from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader
Expand Down Expand Up @@ -40,7 +39,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
text = note_dict["content"]
if isinstance(text, bytes):
text = text.decode()
lines = [LineWithMeta(line=text, annotations=[], metadata=LineMetadata(line_id=0, page_id=0))]
lines = [LineWithMeta(line=text)]
unstructured = UnstructuredDocument(tables=[], lines=lines, attachments=[])

return unstructured
Expand Down
27 changes: 4 additions & 23 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import uuid
from collections import OrderedDict
from typing import List, Optional

from dedoc.data_structures.annotation import Annotation
Expand Down Expand Up @@ -37,23 +36,21 @@ def __init__(self,
y_top_left: int,
y_bottom_right: int,
id_con: int = -1,
lines: List[LineWithMeta] = None,
lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False,
is_attribute_required: bool = False,
rotated_angle: int = 0,
uid: str = None,
contour_coord: Optional[BBox] = None) -> None:

if lines is None:
lines = []
assert x_top_left <= x_bottom_right
assert y_top_left <= y_bottom_right
self.x_top_left = x_top_left
self.x_bottom_right = x_bottom_right
self.y_top_left = y_top_left
self.y_bottom_right = y_bottom_right
self.id_con = id_con
self.lines = lines
self.lines = [] if lines is None else lines
self.is_attribute = is_attribute
self.is_attribute_required = is_attribute_required
self.rotated_angle = rotated_angle
Expand All @@ -67,13 +64,10 @@ def __str__(self) -> str:
return f"Cell((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"

def get_text(self) -> str:
return "\n".join([line.line for line in self.lines])
return LineWithMeta.join(self.lines).line

def get_annotations(self) -> List[Annotation]:
annotations = []
for line in self.lines:
annotations.extend(line.annotations)
return annotations
return LineWithMeta.join(self.lines).annotations

def __repr__(self) -> str:
return self.__str__()
Expand All @@ -85,16 +79,3 @@ def width(self) -> int:
@property
def height(self) -> int:
return self.y_bottom_right - self.y_top_left

def to_dict(self) -> dict:
cell_dict = OrderedDict()
cell_dict["text"] = self.get_text()
cell_dict["is_attribute"] = self.is_attribute
cell_dict["colspan"] = self.colspan
cell_dict["rowspan"] = self.rowspan
cell_dict["invisible"] = self.invisible

return cell_dict

def set_rotated_angle(self, rotated_angle: int) -> None:
self.rotated_angle = rotated_angle
9 changes: 8 additions & 1 deletion dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple

@abstractmethod
def _process_one_page(self, image: np.ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \
-> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
-> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
"""
function parses image and returns:
- recognized textual lines with annotations
- recognized tables on an image
- attachments (figures on images)
- [rotated_angle] - the angle by which the image was rotated for recognition
"""
pass

def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
for i, table_tree_node in enumerate(nodes_batch):
cv2.imwrite(os.path.join(tmp_dir, f"image_{num_batch}_{i}.png"), BBox.crop_image_by_box(page_image, table_tree_node.cell_box))

ocr_result, chunk_boxes = self.__handle_one_batch(src_image=page_image, tree_table_nodes=nodes_batch, num_batch=num_batch,
language=language)
ocr_result, chunk_boxes = self.__handle_one_batch(src_image=page_image, tree_table_nodes=nodes_batch, num_batch=num_batch, language=language)

for chunk_index, _ in enumerate(chunk_boxes):
originalbox_to_fastocrbox[nodes_batch[chunk_index].cell_box] = []
Expand Down
4 changes: 2 additions & 2 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def _process_one_page(self,
image: np.ndarray,
parameters: ParametersForParseDoc,
page_number: int,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[int]]:
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
# --- Step 1: correct orientation and detect column count ---
rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
if self.config.get("debug_mode"):
Expand Down Expand Up @@ -104,7 +104,7 @@ def _process_one_page(self,

return lines, tables, page.attachments, [angle]

def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, int]:
def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, float]:
"""
Function :
- detects the number of page columns
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def determine_score(self, arr: np.ndarray, angle: int) -> (np.ndarray, float):
score = np.sum((histogram[1:] - histogram[:-1]) ** 2, dtype=float)
return score

def auto_rotate(self, image: np.ndarray, orientation_angle: int = 0) -> (np.ndarray, int):
def auto_rotate(self, image: np.ndarray, orientation_angle: float = 0.) -> (np.ndarray, float):
if orientation_angle:
image = rotate_image(image, orientation_angle)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,7 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type:
cur_table.matrix_cells = self.splitter.split(cells=cur_table.matrix_cells)

# Эвристика 2: таблица должна иметь больше одного столбца
if len(cur_table.matrix_cells[0]) > 1 or (
self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
if len(cur_table.matrix_cells[0]) > 1 or (self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
tables.append(cur_table)

if self.table_options.split_last_column in table_type:
Expand Down
Loading

0 comments on commit 89147f0

Please sign in to comment.