-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
TLDR-748 add patterns for structure extraction, delete regexps from readers
- Loading branch information
1 parent
2c6cca9
commit 11f1a3a
Showing
27 changed files
with
309 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ exclude = | |
*__init__.py, | ||
resources, | ||
venv, | ||
.venv, | ||
build, | ||
dedoc.egg-info, | ||
docs/_build, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern | ||
from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern | ||
from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern | ||
from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern | ||
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern | ||
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern | ||
|
||
__all__ = ["BracketListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "TagHeaderPattern", "TagListPattern"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import Optional | ||
|
||
from dedoc.data_structures.hierarchy_level import HierarchyLevel | ||
from dedoc.data_structures.line_with_meta import LineWithMeta | ||
|
||
|
||
class AbstractPattern(ABC):
    """
    Abstract base for line patterns of the ``dedoc.structure_extractors.patterns`` package.

    A concrete pattern decides whether a line matches (:meth:`match`) and which
    hierarchy level the line receives (:meth:`get_hierarchy_level`).
    Subclasses identify themselves through a class attribute called ``__name``.
    """

    # Double-underscore class attributes are name-mangled per class: this one is stored as
    # `_AbstractPattern__name`, while a subclass `Foo` declaring `__name` stores `_Foo__name`.
    # `name()` resolves the mangled attribute explicitly, so subclass names are not lost.
    __name = ""

    def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
        """
        :param line_type: value stored as ``_line_type``
        :param level_1: value stored as ``_level_1``
        :param level_2: value stored as ``_level_2``; any falsy value (``None`` or ``0``) is replaced by 1
        :param can_be_multiline: flag stored as ``_can_be_multiline``
        """
        self._line_type = line_type
        self._level_1 = level_1
        # NOTE(review): an explicit level_2=0 is also turned into 1 here - confirm this is intended.
        self._level_2 = level_2 if level_2 else 1
        self._can_be_multiline = can_be_multiline

    @classmethod
    def name(cls: "AbstractPattern") -> str:
        """
        Return the pattern name declared by the class or by its nearest ancestor.

        Plain ``cls.__name`` would always be compiled to ``cls._AbstractPattern__name`` and
        therefore return the empty base-class value for every subclass.
        Instead, the MRO is walked and the ``__name`` attribute is looked up under
        the mangled name of each class that may have declared it.

        :return: the declared name, or an empty string if no class in the MRO declares one
        """
        for klass in cls.__mro__:
            # Python mangles `__name` to `_<ClassName without leading underscores>__name`;
            # a class name made only of underscores is not mangled at all.
            stripped = klass.__name__.lstrip("_")
            attribute = f"_{stripped}__name" if stripped else "__name"
            declared = vars(klass)
            if attribute in declared:
                return declared[attribute]
        return ""

    @abstractmethod
    def match(self, line: LineWithMeta) -> bool:
        """
        Check whether the given line corresponds to this pattern.

        :param line: line to check
        :return: True if the line matches the pattern
        """

    @abstractmethod
    def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
        """
        Build the hierarchy level for the given line.

        :param line: line to compute the hierarchy level for
        :return: hierarchy level of the line
        """
11 changes: 11 additions & 0 deletions
11
dedoc/structure_extractors/patterns/bracket_list_pattern.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from typing import Optional | ||
|
||
from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix | ||
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern | ||
|
||
|
||
class BracketListPattern(RegexpPattern):
    """
    Regexp-based pattern whose regular expression is ``BracketPrefix.regexp``.

    All constructor arguments are forwarded unchanged to ``RegexpPattern``.
    """

    # NOTE(review): this attribute is name-mangled to `_BracketListPattern__name`;
    # confirm that the base-class `name()` accessor is able to resolve it.
    __name = "bracket_list"

    def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
        """
        :param line_type: passed through to ``RegexpPattern``
        :param level_1: passed through to ``RegexpPattern``
        :param level_2: passed through to ``RegexpPattern``
        :param can_be_multiline: passed through to ``RegexpPattern``
        """
        super().__init__(
            regexp=BracketPrefix.regexp,
            line_type=line_type,
            level_1=level_1,
            level_2=level_2,
            can_be_multiline=can_be_multiline,
        )
11 changes: 11 additions & 0 deletions
11
dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from typing import Optional | ||
|
||
from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix | ||
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern | ||
|
||
|
||
class BracketRomanListPattern(RegexpPattern):
    """
    Regexp-based pattern whose regular expression is ``BracketRomanPrefix.regexp``.

    All constructor arguments are forwarded unchanged to ``RegexpPattern``.
    """

    # NOTE(review): this attribute is name-mangled to `_BracketRomanListPattern__name`;
    # confirm that the base-class `name()` accessor is able to resolve it.
    __name = "bracket_roman_list"

    def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
        """
        :param line_type: passed through to ``RegexpPattern``
        :param level_1: passed through to ``RegexpPattern``
        :param level_2: passed through to ``RegexpPattern``
        :param can_be_multiline: passed through to ``RegexpPattern``
        """
        super().__init__(
            regexp=BracketRomanPrefix.regexp,
            line_type=line_type,
            level_1=level_1,
            level_2=level_2,
            can_be_multiline=can_be_multiline,
        )
Oops, something went wrong.