Skip to content

Commit

Permalink
TLDR-748 add patterns for structure extraction, delete regexps from r…
Browse files Browse the repository at this point in the history
…eaders
  • Loading branch information
NastyBoget committed Aug 13, 2024
1 parent 2c6cca9 commit 11f1a3a
Show file tree
Hide file tree
Showing 27 changed files with 309 additions and 88 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ exclude =
*__init__.py,
resources,
venv,
.venv,
build,
dedoc.egg-info,
docs/_build,
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
args:
- "--config=.flake8"
additional_dependencies: [
Expand Down
6 changes: 2 additions & 4 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]:
import math
from joblib import Parallel, delayed
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import flatten

Expand Down Expand Up @@ -129,10 +129,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)

prev_line = None
for line in all_lines_with_links:
line.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
prev_line = line
line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()

all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,6 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith

lines = []
page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
prev_line = None
labeling_mode = self.config.get("labeling_mode", False)

for block in page["blocks"]:
Expand Down Expand Up @@ -261,15 +260,13 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
uid=uid,
location=Location(bbox=bbox, page_number=page_number),
order=order)
line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, prev_line, meta)
prev_line = line_with_location
line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, meta)

lines.append(line_with_location)

return lines

def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_type: str) -> HierarchyLevel:
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel:
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth

if line_type == HierarchyLevel.header:
Expand All @@ -278,7 +275,7 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_
return HierarchyLevel(1, header_level, False, line_type)

if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines
return DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
return HierarchyLevel(None, None, False, HierarchyLevel.list_item)

return HierarchyLevel(None, None, True, line_type)

Expand Down
11 changes: 3 additions & 8 deletions dedoc/readers/txt_reader/raw_text_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,14 @@ def __get_encoding(self, path: str, parameters: dict) -> str:
def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
import time
from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.utils.utils import calculate_file_hash

lines = []
file_hash = calculate_file_hash(path=path)
number_of_empty_lines = 0
previous_log_time = time.time()
prev_line = None

for line_id, line in self.__get_lines(path=path, encoding=encoding):
if time.time() - previous_log_time > 5:
Expand All @@ -76,14 +75,10 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
indent_annotation = self.__get_indent_annotation(line)

line_with_meta = LineWithMeta(line=line, metadata=metadata, annotations=[spacing_annotation, indent_annotation], uid=uid)
line_with_meta.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line_with_meta, prev_line)
prev_line = line_with_meta
line_with_meta.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()
lines.append(line_with_meta)

if line.isspace():
number_of_empty_lines += 1
else:
number_of_empty_lines = 0
number_of_empty_lines = number_of_empty_lines + 1 if line.isspace() else 0

return lines

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import List, Optional

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern


class DefaultStructureExtractor(AbstractStructureExtractor):
Expand All @@ -12,82 +12,57 @@ class DefaultStructureExtractor(AbstractStructureExtractor):
You can find the description of this type of structure in the section :ref:`other_structure`.
"""
from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix
from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix

document_type = "other"

prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, AnyLetterPrefix, BulletPrefix]

def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
Extract basic structure from the given document and add additional information to the lines' metadata.
To get the information about the method's parameters look at the documentation of the class \
:class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
"""
previous_line = None
parameters = {} if parameters is None else parameters
patterns = self.__get_patterns(parameters)

for line in document.lines:
if line.metadata.tag_hierarchy_level is None:
line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()

if line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.unknown:
line.metadata.hierarchy_level = self.get_hl_list_using_regexp(line, previous_line)
else:
line.metadata.hierarchy_level = self.__get_hl_with_tag(line)
line_pattern = None
for pattern in patterns:
if pattern.match(line):
line_pattern = pattern
break

line.metadata.hierarchy_level = line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text()
assert line.metadata.hierarchy_level is not None
if line.metadata.hierarchy_level.line_type != HierarchyLevel.raw_text:
previous_line = line

return document

def __get_hl_with_tag(self, line: LineWithMeta) -> HierarchyLevel:
    """
    Build the line's hierarchy level from the tag hierarchy level stored in its metadata.

    The tag level is returned unchanged when either of its levels is missing or when its type is neither
    a header nor a list item; otherwise a fresh single-line level of the same type is created
    (headers always get ``level_1=1``).
    """
    tag_level = line.metadata.tag_hierarchy_level
    assert tag_level is not None

    level_1 = tag_level.level_1
    level_2 = tag_level.level_2

    if level_1 is not None and level_2 is not None:
        if tag_level.line_type == HierarchyLevel.header:
            return HierarchyLevel(level_1=1, level_2=level_2, can_be_multiline=False, line_type=HierarchyLevel.header)

        if tag_level.line_type == HierarchyLevel.list_item:
            return HierarchyLevel(level_1=level_1, level_2=level_2, can_be_multiline=False, line_type=HierarchyLevel.list_item)

    return tag_level

@staticmethod
def get_hl_list_using_regexp(line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> HierarchyLevel:
    """
    Derive a hierarchy level for the line from its list prefix (dotted, bracket, letter or bullet).

    :param line: line to classify
    :param previous_line: preceding line (may be None), used only to disambiguate a bracket item numbered 6
    :return: a list-item level for a recognised prefix, otherwise a raw-text level
    """
    from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_prefix
    from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
    from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
    from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
    from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix

    prefix = get_prefix(DefaultStructureExtractor.prefix_list, line)
    prefix_name = prefix.name

    # TODO dotted list without space after numbering, like "1.Some text"
    if prefix_name == DottedPrefix.name:  # list like 1.1.1
        depth = len(prefix.numbers)
        # too deep numbering or too large numbers are not treated as list items
        if depth > 9 or any(n > 1900 for n in prefix.numbers):
            return HierarchyLevel.create_raw_text()
        return HierarchyLevel(2, depth, False, line_type=HierarchyLevel.list_item)

    if prefix_name == BracketPrefix.name:  # list like 1)
        # tesseract may recognise the Russian letter "б" as the digit 6, so "6)" right after "a)" is a letter item
        follows_letter_a = previous_line is not None and previous_line.line.lower().strip().startswith(("a)", "а)"))  # Latin and Cyrillic letters
        if prefix.prefix_num == 6 and follows_letter_a:
            return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item)
        return HierarchyLevel(3, 1, False, line_type=HierarchyLevel.list_item)

    # list like a) -> level 4, bullet list -> level 5  (TODO make bullet list)
    for candidate_name, level_1 in ((AnyLetterPrefix.name, 4), (BulletPrefix.name, 5)):
        if prefix_name == candidate_name:
            return HierarchyLevel(level_1, 1, False, line_type=HierarchyLevel.list_item)

    # no match for any list has been found
    can_be_multiline = line.metadata.tag_hierarchy_level.can_be_multiline
    return HierarchyLevel(None, None, can_be_multiline, HierarchyLevel.raw_text)
def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
    """
    Return the patterns used for structure extraction.

    If ``parameters`` has a ``"patterns"`` key, its value is used: a string is parsed as JSON, and a list whose
    first element is a dict is converted element-wise with ``get_pattern``.
    Otherwise the default list of tag, dotted, bracket, letter and bullet patterns is built.

    :param parameters: parameters passed to the structure extractor
    :return: non-empty list of patterns (the first element is asserted to be an ``AbstractPattern``)
    """
    if "patterns" in parameters:
        import json
        from dedoc.structure_extractors.patterns.utils import get_pattern

        user_patterns = parameters["patterns"]
        if isinstance(user_patterns, str):
            user_patterns = json.loads(user_patterns)

        assert isinstance(user_patterns, list)
        assert len(user_patterns) > 0

        # only the first element decides whether the list holds dict descriptions or ready pattern objects
        if isinstance(user_patterns[0], dict):
            user_patterns = [get_pattern(description) for description in user_patterns]

        assert isinstance(user_patterns[0], AbstractPattern)
        return user_patterns

    from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern
    from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern
    from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern
    from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern
    from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
    from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern

    header, list_item = HierarchyLevel.header, HierarchyLevel.list_item
    default_patterns = [
        TagHeaderPattern(line_type=header, level_1=1),
        TagListPattern(line_type=list_item, level_1=2),
        DottedListPattern(line_type=list_item, level_1=2),
        BracketListPattern(line_type=list_item, level_1=3, level_2=1),
        LetterListPattern(line_type=list_item, level_1=4, level_2=1),
        BulletListPattern(line_type=list_item, level_1=5, level_2=1),
    ]

    assert isinstance(default_patterns[0], AbstractPattern)
    return default_patterns
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
from dedoc.structure_extractors.hierarchy_level_builders.abstract_hierarchy_level_builder import AbstractHierarchyLevelBuilder
from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.abstract_body_hierarchy_level_builder import \
AbstractBodyHierarchyLevelBuilder
from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots
from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagHeaderPattern, TagListPattern


class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder):
Expand All @@ -17,6 +17,14 @@ class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder):
def __init__(self) -> None:
    """
    Initialise the builder: keep the digits-with-dots regexp and the list of patterns used for raw text lines.

    The method was previously spelled ``__int__``, so it was never invoked on construction and the attributes
    below were never created.
    """
    super().__init__()
    self.digits_with_dots_regexp = regexps_digits_with_dots
    # the patterns are tried in this order, the first one that matches a line defines its hierarchy level
    self.patterns = [
        TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1),
        TagListPattern(line_type=HierarchyLevel.list_item, level_1=2),
        DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2),
        BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1),
        LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1),
        BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1),
    ]

def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]:
if len(lines_with_labels) > 0:
Expand All @@ -27,7 +35,6 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s
result = [body_line]
else:
result = [AbstractBodyHierarchyLevelBuilder.get_body_line(init_hl_depth=init_hl_depth)]
previous_raw_text_line = None
previous_named_item_line = None

for line, prediction in lines_with_labels:
Expand All @@ -44,8 +51,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s
elif prediction == "raw_text":
line = self.__postprocess_raw_text(line, init_hl_depth)
if not (line.metadata.hierarchy_level is not None and line.metadata.hierarchy_level.line_type == "named_item"):
line.metadata.hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, previous_raw_text_line)
previous_raw_text_line = line
line.metadata.hierarchy_level = self.__get_level_by_patterns(line)
else:
line.metadata.hierarchy_level = HierarchyLevel.create_raw_text()
line.metadata.hierarchy_level.line_type = prediction
Expand All @@ -69,6 +75,15 @@ def __handle_named_item(self, init_hl_depth: int, line: LineWithMeta, prediction
line.metadata.hierarchy_level = hierarchy_level
return line

def __get_level_by_patterns(self, line: LineWithMeta) -> HierarchyLevel:
    """
    Return the hierarchy level given by the first pattern of ``self.patterns`` that matches the line.

    :param line: line to classify
    :return: level computed by the matching pattern, or a raw-text level if no pattern matches
    """
    first_match = next((candidate for candidate in self.patterns if candidate.match(line)), None)
    if not first_match:
        return HierarchyLevel.create_raw_text()
    return first_match.get_hierarchy_level(line)

def __postprocess_raw_text(self, line: LineWithMeta, init_hl_depth: int) -> LineWithMeta:
text = line.line.strip().lower()
if not text.startswith(self.named_item_keywords):
Expand Down
8 changes: 8 additions & 0 deletions dedoc/structure_extractors/patterns/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern
from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern
from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern
from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern

__all__ = ["BracketListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "TagHeaderPattern", "TagListPattern"]
27 changes: 27 additions & 0 deletions dedoc/structure_extractors/patterns/abstract_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from abc import ABC, abstractmethod
from typing import Optional

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta


class AbstractPattern(ABC):
    """
    Base class for patterns that decide whether a line matches and which hierarchy level it gets.

    Subclasses declare their identifier as a class attribute ``__name`` and implement
    :meth:`match` and :meth:`get_hierarchy_level`.
    """
    __name = ""

    def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
        """
        :param line_type: type of the line stored for the matched lines
        :param level_1: first hierarchy level value
        :param level_2: second hierarchy level value, a missing (or zero) value falls back to 1
        :param can_be_multiline: multiline flag stored for the matched lines
        """
        self._line_type = line_type
        self._level_1 = level_1
        self._level_2 = level_2 if level_2 else 1
        self._can_be_multiline = can_be_multiline

    @classmethod
    def name(cls: "AbstractPattern") -> str:
        """
        Return the pattern name declared by the class or by its closest ancestor.

        A class-private attribute ``__name`` is stored under a name mangled with the declaring class
        (``_ClassName__name``), so a plain ``cls.__name`` here would always read the empty value of this base class.
        The mangled attribute is therefore looked up explicitly along the MRO.
        """
        for klass in cls.__mro__:
            mangled = f"_{klass.__name__.lstrip('_')}__name"
            if mangled in vars(klass):
                return vars(klass)[mangled]
        return ""

    @abstractmethod
    def match(self, line: LineWithMeta) -> bool:
        """
        Check whether the pattern is applicable to the given line.
        """
        pass

    @abstractmethod
    def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
        """
        Build the hierarchy level for a line matched by the pattern.
        """
        pass
11 changes: 11 additions & 0 deletions dedoc/structure_extractors/patterns/bracket_list_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import Optional

from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern


class BracketListPattern(RegexpPattern):
    """
    Regexp pattern that uses the regular expression of ``BracketPrefix``.
    """
    __name = "bracket_list"

    def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
        # all arguments are forwarded unchanged, only the regexp is fixed by this class
        super().__init__(
            regexp=BracketPrefix.regexp,
            line_type=line_type,
            level_1=level_1,
            level_2=level_2,
            can_be_multiline=can_be_multiline,
        )
11 changes: 11 additions & 0 deletions dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import Optional

from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern


class BracketRomanListPattern(RegexpPattern):
    """
    Regexp pattern that uses the regular expression of ``BracketRomanPrefix``.
    """
    __name = "bracket_roman_list"

    def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
        # all arguments are forwarded unchanged, only the regexp is fixed by this class
        super().__init__(
            regexp=BracketRomanPrefix.regexp,
            line_type=line_type,
            level_1=level_1,
            level_2=level_2,
            can_be_multiline=can_be_multiline,
        )
Loading

0 comments on commit 11f1a3a

Please sign in to comment.