Skip to content

Commit

Permalink
TLDR-748 fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Aug 14, 2024
1 parent 11f1a3a commit 142aa81
Show file tree
Hide file tree
Showing 28 changed files with 146 additions and 87 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel:
if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines
return HierarchyLevel(None, None, False, HierarchyLevel.list_item)

return HierarchyLevel(None, None, True, line_type)
return HierarchyLevel.create_unknown()

def __jar_path(self) -> str:
import os
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/utils/line_object_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i
@return:
"""
if len(lines) == 0:
metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_raw_text(), page_id=0, line_id=0)
metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_unknown(), page_id=0, line_id=0)
lines = [LineWithLocation(line="", metadata=metadata, annotations=[], location=Location(page_number=0, bbox=BBox(0, 0, 1, 1)))]
last_page_line = self._get_last_page_line(lines)
all_objects = list(lines + tables + images)
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pptx_reader/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties
def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
text = ""
paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
hierarchy_level = HierarchyLevel.create_raw_text()
hierarchy_level = HierarchyLevel.create_unknown()

if is_title or paragraph_properties.title:
hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
Expand Down
8 changes: 1 addition & 7 deletions dedoc/readers/txt_reader/raw_text_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,9 @@ def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int:
return space_this.end() - space_this.start()

def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool:
from dedoc.data_structures.hierarchy_level import HierarchyLevel

if not line.metadata.tag_hierarchy_level.can_be_multiline and \
line.metadata.tag_hierarchy_level.line_type not in (HierarchyLevel.raw_text, HierarchyLevel.unknown):
return True
space_this = self.__get_starting_spacing(line)
space_prev = self.__get_starting_spacing(previous_line)
return line.metadata.tag_hierarchy_level.line_type in (HierarchyLevel.raw_text, HierarchyLevel.unknown) \
and not line.line.isspace() and space_this - space_prev >= 2
return not line.line.isspace() and space_this - space_prev >= 2

def _postprocess(self, document: UnstructuredDocument) -> UnstructuredDocument:
previous_line = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,18 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern
from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern

patterns = [
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1),
TagListPattern(line_type=HierarchyLevel.list_item, level_1=2),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1),
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False),
TagListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
TagTypePattern(),
TagPattern(line_type=HierarchyLevel.raw_text)
]
else:
import json
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,22 @@
from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.abstract_body_hierarchy_level_builder import \
AbstractBodyHierarchyLevelBuilder
from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots
from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagHeaderPattern, TagListPattern
from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagListPattern, TagPattern


class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder):
named_item_keywords = ("введение", "заключение", "библиографический список", "список литературы", "глава", "приложение", "приложения")

def __int__(self) -> None:
def __init__(self) -> None:
super().__init__()
self.digits_with_dots_regexp = regexps_digits_with_dots
self.patterns = [
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1),
TagListPattern(line_type=HierarchyLevel.list_item, level_1=2),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1),
TagListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
TagPattern(line_type=HierarchyLevel.raw_text)
]

def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]:
Expand Down Expand Up @@ -66,10 +66,10 @@ def __handle_named_item(self, init_hl_depth: int, line: LineWithMeta, prediction
if text.startswith(self.named_item_keywords):
hierarchy_level = HierarchyLevel(init_hl_depth, 0, True, prediction)
elif item_depth == -1:
if previous_named_item_line and previous_named_item_line.metadata.hierarchy_level.line_type == "named_item":
if previous_named_item_line:
hierarchy_level = previous_named_item_line.metadata.hierarchy_level
else:
hierarchy_level = HierarchyLevel(init_hl_depth + 1, 0, True, prediction)
hierarchy_level = HierarchyLevel(init_hl_depth, 0, True, prediction)
else:
hierarchy_level = HierarchyLevel(init_hl_depth, item_depth - 1, True, prediction)
line.metadata.hierarchy_level = hierarchy_level
Expand Down
9 changes: 8 additions & 1 deletion dedoc/structure_extractors/patterns/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern
from dedoc.structure_extractors.patterns.bracket_roman_list_pattern import BracketRomanListPattern
from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern
from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern
from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern
from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern
from dedoc.structure_extractors.patterns.start_word_pattern import StartWordPattern
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern
from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern

__all__ = ["BracketListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "TagHeaderPattern", "TagListPattern"]
__all__ = ["BracketListPattern", "BracketRomanListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "RegexpPattern", "RomanListPattern",
"StartWordPattern", "TagHeaderPattern", "TagListPattern", "TagPattern", "TagTypePattern"]
43 changes: 39 additions & 4 deletions dedoc/structure_extractors/patterns/abstract_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@
class AbstractPattern(ABC):
__name = ""

def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
self._line_type = line_type
self._level_1 = level_1
self._level_2 = level_2 if level_2 else 1
self._level_2 = level_2
self._can_be_multiline = can_be_multiline

@classmethod
Expand All @@ -22,6 +26,37 @@ def name(cls: "AbstractPattern") -> str:
def match(self, line: LineWithMeta) -> bool:
pass

@abstractmethod
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
pass
return HierarchyLevel(
line_type=self._get_line_type(line),
level_1=self._get_level_1(line),
level_2=self._get_level_2(line),
can_be_multiline=self._get_can_be_multiline(line)
)

def _get_line_type(self, line: LineWithMeta) -> str:
    """
    Resolve the line type of the hierarchy level produced by this pattern.

    The line type configured in the constructor takes precedence; when it is not configured,
    the line type is copied from the tag hierarchy level stored in the line metadata.

    :param line: line the hierarchy level is built for
    :return: the configured line type, or the line type of ``line.metadata.tag_hierarchy_level``
    :raises ValueError: if no line type is configured and the line has no tag hierarchy level
    """
    if self._line_type is not None:
        return self._line_type

    tag_level = line.metadata.tag_hierarchy_level
    if tag_level is None:
        # A double-underscore attribute is name-mangled to the class where the code is written,
        # so it can never expose the concrete pattern; report the runtime class name instead.
        raise ValueError(f"Cannot resolve line type: tag_hierarchy_level is missing and {type(self).__name__} line_type isn't configured")

    return tag_level.line_type

def _get_level_1(self, line: LineWithMeta) -> Optional[int]:
    """
    Resolve the first-level importance of the hierarchy level produced by this pattern.

    :param line: line the hierarchy level is built for
    :return: the configured ``level_1`` if it is not None, otherwise ``level_1`` of the line's
        tag hierarchy level, or None when the line has no (truthy) tag hierarchy level
    """
    configured = self._level_1
    if configured is None:
        # nothing configured: fall back to what the reader stored in the tag
        tag_level = line.metadata.tag_hierarchy_level
        return tag_level.level_1 if tag_level else None
    return configured

def _get_level_2(self, line: LineWithMeta) -> Optional[int]:
    """
    Resolve the second-level importance of the hierarchy level produced by this pattern.

    :param line: line the hierarchy level is built for
    :return: the configured ``level_2`` if it is not None, otherwise ``level_2`` of the line's
        tag hierarchy level, or None when the line has no (truthy) tag hierarchy level
    """
    configured = self._level_2
    if configured is None:
        # nothing configured: fall back to what the reader stored in the tag
        tag_level = line.metadata.tag_hierarchy_level
        return tag_level.level_2 if tag_level else None
    return configured

def _get_can_be_multiline(self, line: LineWithMeta) -> bool:
    """
    Resolve the ``can_be_multiline`` flag of the hierarchy level produced by this pattern.

    :param line: line the hierarchy level is built for
    :return: the configured flag if it is not None, otherwise the flag of the line's tag hierarchy level,
        or True when the line has no (truthy) tag hierarchy level
    """
    configured = self._can_be_multiline
    if configured is None:
        # nothing configured: fall back to the tag, defaulting to True when there is no tag
        tag_level = line.metadata.tag_hierarchy_level
        return tag_level.can_be_multiline if tag_level else True
    return configured
6 changes: 5 additions & 1 deletion dedoc/structure_extractors/patterns/bracket_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,9 @@
class BracketListPattern(RegexpPattern):
__name = "bracket_list"

def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,9 @@
class BracketRomanListPattern(RegexpPattern):
__name = "bracket_roman_list"

def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
6 changes: 5 additions & 1 deletion dedoc/structure_extractors/patterns/bullet_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,9 @@
class BulletListPattern(RegexpPattern):
__name = "bullet_list"

def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
14 changes: 11 additions & 3 deletions dedoc/structure_extractors/patterns/dotted_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,20 @@
class DottedListPattern(RegexpPattern):
__name = "dotted_list"

def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
level_2 = self.__get_list_depth(line=line)
return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=level_2, can_be_multiline=self._can_be_multiline)
return HierarchyLevel(
line_type=self._get_line_type(line),
level_1=self._get_level_1(line),
level_2=self.__get_list_depth(line=line),
can_be_multiline=self._get_can_be_multiline(line)
)

def __get_list_depth(self, line: LineWithMeta) -> int:
text = line.line.strip().lower()
Expand Down
6 changes: 5 additions & 1 deletion dedoc/structure_extractors/patterns/letter_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,9 @@
class LetterListPattern(RegexpPattern):
__name = "letter_list"

def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
11 changes: 6 additions & 5 deletions dedoc/structure_extractors/patterns/regexp_pattern.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
import re
from typing import Optional, Union

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern


class RegexpPattern(AbstractPattern):
__name = "regexp"

def __init__(self, regexp: str or re.Pattern, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
             regexp: Union[str, re.Pattern],
             line_type: Optional[str] = None,
             level_1: Optional[int] = None,
             level_2: Optional[int] = None,
             can_be_multiline: Optional[bool] = None) -> None:
    """
    Initialize the pattern with a regular expression and optional hierarchy level settings.

    :param regexp: regular expression as a string (compiled here) or an already compiled pattern (stored as is)
    :param line_type: forwarded unchanged to the base class constructor
    :param level_1: forwarded unchanged to the base class constructor
    :param level_2: forwarded unchanged to the base class constructor
    :param can_be_multiline: forwarded unchanged to the base class constructor
    """
    super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
    # compile once here so that matching does not have to deal with two input kinds
    self._regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

def match(self, line: LineWithMeta) -> bool:
text = line.line.strip().lower()
match = self._regexp.match(text)
return match is not None

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline)
6 changes: 5 additions & 1 deletion dedoc/structure_extractors/patterns/roman_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,9 @@
class RomanListPattern(RegexpPattern):
__name = "roman_list"

def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
super().__init__(regexp=RomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
11 changes: 6 additions & 5 deletions dedoc/structure_extractors/patterns/start_word_pattern.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from typing import Optional

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern


class StartWordPattern(AbstractPattern):
__name = "start_word"

def __init__(self, start_word: str, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
def __init__(self,
start_word: str,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
self.__start_word = start_word.strip().lower()

def match(self, line: LineWithMeta) -> bool:
text = line.line.strip().lower()
return text.startswith(self.__start_word)

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline)
4 changes: 0 additions & 4 deletions dedoc/structure_extractors/patterns/tag_header_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,3 @@ def match(self, line: LineWithMeta) -> bool:

level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2
return level_1 is not None and level_2 is not None

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
level_2 = line.metadata.tag_hierarchy_level.level_2
return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=level_2, can_be_multiline=self._can_be_multiline)
4 changes: 0 additions & 4 deletions dedoc/structure_extractors/patterns/tag_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,3 @@ def match(self, line: LineWithMeta) -> bool:

level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2
return level_1 is not None and level_2 is not None

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2
return HierarchyLevel(line_type=self._line_type, level_1=level_1, level_2=level_2, can_be_multiline=self._can_be_multiline)
9 changes: 9 additions & 0 deletions dedoc/structure_extractors/patterns/tag_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern


class TagPattern(AbstractPattern):
    """
    Pattern that matches any line whose metadata contains a tag hierarchy level.
    """
    __name = "tag"

    def match(self, line: LineWithMeta) -> bool:
        """
        Check if the line has a tag hierarchy level.

        :param line: line to check
        :return: True if ``line.metadata.tag_hierarchy_level`` is not None, False otherwise
        """
        tag_level = line.metadata.tag_hierarchy_level
        return tag_level is not None
Loading

0 comments on commit 142aa81

Please sign in to comment.