From f0be0dbefd7edac7f8e4c91912653cb1075a47f8 Mon Sep 17 00:00:00 2001 From: Andrew Perminov Date: Wed, 27 Sep 2023 11:12:58 +0300 Subject: [PATCH] TLDR-472 add flake8-fill-one-line and flake8-multiline-containers and fix lint (#336) * add flake8-fill-one-line and flake8-multiline-containers and fix lint * update precommit hook --- .pre-commit-config.yaml | 2 + .../api_collect_train_dataset.py | 5 +- .../train_dataset/async_archive_handler.py | 14 +---- dedoc/attachments_extractors/utils.py | 12 +++-- dedoc/config.py | 4 +- dedoc/data_structures/annotation.py | 6 ++- dedoc/data_structures/line_with_meta.py | 6 +-- dedoc/data_structures/parsed_document.py | 3 +- dedoc/data_structures/tree_node.py | 5 +- dedoc/download_models.py | 10 +--- dedoc/extensions.py | 17 ++++--- .../image_metadata_extractor.py | 3 +- .../docx_reader/numbering_extractor.py | 7 +-- dedoc/readers/html_reader/html_reader.py | 24 +++------ dedoc/readers/html_reader/html_tags.py | 47 ++--------------- dedoc/readers/mhtml_reader/mhtml_reader.py | 5 +- .../pdf_auto_reader/pdf_auto_reader.py | 5 +- dedoc/readers/pdf_reader/pdf_base_reader.py | 38 +++++++------- .../dataset_executor.py | 3 +- .../transforms.py | 3 +- .../ocr/ocr_cell_extractor.py | 5 +- .../ocr/ocr_line_extractor.py | 12 ++--- .../table_recognizer/cell_splitter.py | 5 +- .../split_last_hor_union_cells.py | 12 ++--- .../table_utils/img_processing.py | 3 +- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 3 +- .../pdfminer_reader/pdfminer_extractor.py | 21 +++----- .../pdf_reader/utils/line_object_linker.py | 5 +- dedoc/readers/pptx_reader/pptx_reader.py | 5 +- dedoc/structure_constructors/table_patcher.py | 5 +- .../classifying_law_structure_extractor.py | 26 +++++----- .../default_structure_extractor.py | 5 +- .../diploma_structure_extractor.py | 5 +- .../foiv_law_structure_extractor.py | 10 ++-- .../law_structure_excractor.py | 10 ++-- .../feature_extractors/abstract_extractor.py | 17 ++++--- .../feature_extractors/law_text_features.py | 3 +- .../list_features/list_features_extractor.py | 8 +-- .../toc_feature_extractor.py | 11 ++-- .../tz_feature_extractor.py | 3 +- .../header_hierarchy_level_builder.py | 6 +-- ...act_application_hierarchy_level_builder.py | 6 +-- .../abstract_body_hierarchy_level_builder.py | 6 +-- .../structure_unit/abstract_structure_unit.py | 5 +- .../structure_unit/foiv_structure_unit.py | 5 +- .../structure_unit/law_structure_unit.py | 5 +- .../stub_hierarchy_level_builder.py | 6 +-- .../diploma_classifier.py | 3 +- .../line_type_classifiers/law_classifier.py | 3 +- .../line_type_classifiers/tz_classifier.py | 3 +- .../filtered_line_label_tasker.py | 6 +-- .../concrete_creators/txt_images_creator.py | 3 +- .../trainer/base_sklearn_line_classifier.py | 3 +- dedoc/train_dataset/trainer/errors_saver.py | 14 ++--- .../trainer/line_lstm_classifier_trainer.py | 12 ++--- pyproject.toml | 2 + tests/api_tests/test_api_format_csv.py | 11 ++-- tests/api_tests/test_api_format_html.py | 5 +- .../test_api_format_pdf_page_limit.py | 20 ++++---- .../test_api_misc_multipage_table.py | 6 ++- .../test_api_misc_with_attachments.py | 5 +- .../test_doctype_law_structure_extractor.py | 23 +++++---- .../test_doctype_tz_feature_extractor.py | 3 +- tests/unit_tests/test_format_docx_reader.py | 6 ++- tests/unit_tests/test_misc_annotations.py | 51 ++++++++++++++----- tests/unit_tests/test_misc_prefix.py | 13 +++-- tests/unit_tests/test_misc_tasker.py | 20 ++++---- tests/unit_tests/test_misc_tree_node.py | 16 +++--- tests/unit_tests/test_module_builders.py | 18 +++---- tests/unit_tests/test_module_cell_splitter.py | 27 +++++++--- 70 files changed, 327 insertions(+), 378 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 330ca49d..012ba03a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,9 @@ repos: flake8-annotations==2.9.1, flake8-bugbear==23.3.12, flake8-builtins==2.1.0, + flake8-fill-one-line>=0.4.0, flake8-import-order==0.18.2, + flake8-multiline-containers==0.0.19, flake8-print==5.0.0, flake8-quotes==3.3.2, flake8-use-fstring==1.4, diff --git a/dedoc/api/train_dataset/api_collect_train_dataset.py b/dedoc/api/train_dataset/api_collect_train_dataset.py index 593cb005..047edfe7 100644 --- a/dedoc/api/train_dataset/api_collect_train_dataset.py +++ b/dedoc/api/train_dataset/api_collect_train_dataset.py @@ -163,10 +163,7 @@ def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParam clear() parameters = query_params.dict(by_alias=True) uid = handler.handle(file=file, parameters=parameters) - return HTMLResponse( - f'Successfully handle file. UID=

get_result_archive/?uid={uid}

', - status_code=201 - ) + return HTMLResponse(f'Successfully handle file. UID=

get_result_archive/?uid={uid}

', status_code=201) @app.get("/get_result_archive") diff --git a/dedoc/api/train_dataset/async_archive_handler.py b/dedoc/api/train_dataset/async_archive_handler.py index 09b6d39d..abf4d761 100644 --- a/dedoc/api/train_dataset/async_archive_handler.py +++ b/dedoc/api/train_dataset/async_archive_handler.py @@ -45,11 +45,7 @@ def _handle_archive(self, uid: str, path: str, parameters: dict) -> str: self.__handle_one_file(archive, file, parameters) self.progress[uid] = f"files done\t= {i + 1} \n files_in_progress\t= {0}\n total\t= {len(archive.namelist())}" - task, _ = self.tasker.create_tasks( - type_of_task=parameters["type_of_task"], - task_size=int(parameters["task_size"]), - task_uid=uid - ) + task, _ = self.tasker.create_tasks(type_of_task=parameters["type_of_task"], task_size=int(parameters["task_size"]), task_uid=uid) return task except Exception as e: self.progress[uid] = f"Fail with\n{e}" @@ -79,13 +75,7 @@ def __init__(self, tasker: Tasker, manager: DedocManager, *, config: dict) -> No self.queue = Queue() self.__results = {} self._progress = tasker.progress_bar - self._handler = _ArchiveHandler( - queue=self.queue, - progress=self._progress, - manager=manager, - tasker=tasker, - config=config, - results=self.__results) + self._handler = _ArchiveHandler(queue=self.queue, progress=self._progress, manager=manager, tasker=tasker, config=config, results=self.__results) self._handler.start() self.tmp_dir = TemporaryDirectory() diff --git a/dedoc/attachments_extractors/utils.py b/dedoc/attachments_extractors/utils.py index 679677e9..7c99e9cf 100644 --- a/dedoc/attachments_extractors/utils.py +++ b/dedoc/attachments_extractors/utils.py @@ -5,11 +5,13 @@ def create_note(content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]: filename = get_unique_name("note.json") - note_dict = {"content": content, - "modified_time": modified_time, - "created_time": created_time, - "size": size if size else len(content), - "author": author} + note_dict = { + "content": content, + "modified_time": modified_time, + "created_time": created_time, + "size": size if size else len(content), + "author": author + } encode_data = json.dumps(note_dict).encode("utf-8") return filename, encode_data diff --git a/dedoc/config.py b/dedoc/config.py index 34f51297..95533213 100644 --- a/dedoc/config.py +++ b/dedoc/config.py @@ -4,9 +4,7 @@ import sys from typing import Any, Optional -logging.basicConfig(stream=sys.stdout, - level=logging.INFO, - format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s") +logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s") DEBUG_MODE = False RESOURCES_PATH = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources")) diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py index 11bffc01..23c27937 100644 --- a/dedoc/data_structures/annotation.py +++ b/dedoc/data_structures/annotation.py @@ -50,8 +50,10 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - names = ["style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table", - "attachment", "spacing", "strike", "subscript", "superscript"] + names = [ + "style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table", + "attachment", "spacing", "strike", "subscript", "superscript" + ] return api.model("Annotation", { "start": fields.Integer(description="annotation start index", required=True, example=0), "end": fields.Integer(description="annotation end index", required=True, example=4), diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index 51260250..18e2d5dd 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -17,11 +17,7 @@ class LineWithMeta(Sized): (for example, document title and raw text of the document should not be in the same line). Still the logical part of the document may be represented by more than one line (for example, document title may consist of many lines). """ - def __init__(self, - line: str, - metadata: Optional[LineMetadata] = None, - annotations: Optional[List[Annotation]] = None, - uid: str = None) -> None: + def __init__(self, line: str, metadata: Optional[LineMetadata] = None, annotations: Optional[List[Annotation]] = None, uid: str = None) -> None: """ :param line: raw text of the document line :param metadata: metadata (related to the entire line, as line or page number, its hierarchy level) diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index 9483ecb1..f4a05710 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -58,4 +58,5 @@ def get_api_dict(api: Api, depth: int = 0, name: str = "ParsedDocument") -> Mode if depth == 10 # TODO delete this else fields.List(fields.Nested(ParsedDocument.get_api_dict(api, depth=depth + 1, name="refParsedDocument" + str(depth)), description="Attachment structure", - required=False))}) + required=False)) + }) diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py index 454e5059..595bd89a 100644 --- a/dedoc/data_structures/tree_node.py +++ b/dedoc/data_structures/tree_node.py @@ -124,10 +124,7 @@ def add_text(self, line: LineWithMeta) -> None: def __shift_annotations(line: LineWithMeta, text_length: int) -> List[Annotation]: new_annotations = [] for annotation in line.annotations: - new_annotation = Annotation(start=annotation.start + text_length, - end=annotation.end + text_length, - name=annotation.name, - value=annotation.value) + new_annotation = Annotation(start=annotation.start + text_length, end=annotation.end + text_length, name=annotation.name, value=annotation.value) new_annotations.append(new_annotation) return new_annotations diff --git a/dedoc/download_models.py b/dedoc/download_models.py index e6ebcc12..643cf30e 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -26,20 +26,14 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str def download(resources_path: str) -> None: - download_from_hub(out_dir=resources_path, - out_name="txtlayer_classifier.pkl.gz", - repo_name="txtlayer_classifier", - hub_name="model.pkl.gz") + download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz") download_from_hub(out_dir=resources_path, out_name="scan_orientation_efficient_net_b0.pth", repo_name="scan_orientation_efficient_net_b0", hub_name="model.pth") - download_from_hub(out_dir=resources_path, - out_name="paragraph_classifier.pkl.gz", - repo_name="paragraph_classifier", - hub_name="model.pkl.gz") + download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz") line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers") for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"): diff --git a/dedoc/extensions.py b/dedoc/extensions.py index 3e8d326a..665102d0 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -27,13 +27,18 @@ converted_mimes = Extensions( excel_like_format=["application/vnd.oasis.opendocument.spreadsheet", "application/vnd.ms-excel"], docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text"], - pptx_like_format=["application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.ms-powerpoint", "application/vnd.oasis.opendocument.presentation"], + pptx_like_format=[ + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.ms-powerpoint", + "application/vnd.oasis.opendocument.presentation" + ], archive_like_format=[], - image_like_format=["image/gif", - "image/x-portable-pixmap", "image/x-portable-anymap", "image/x-portable-graymap", - "image/x-portable-bitmap", "image/x-pcx", "image/x-pict", - "application/postscript", "image/x-cmu-raster"], + image_like_format=[ + "image/gif", + "image/x-portable-pixmap", "image/x-portable-anymap", "image/x-portable-graymap", + "image/x-portable-bitmap", "image/x-pcx", "image/x-pict", + "application/postscript", "image/x-cmu-raster" + ], pdf_like_format=["image/vnd.djvu"], csv_like_format=[], txt_like_format=["application/xml", "text/xml"] diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index 5d37ad61..ac573d02 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -124,8 +124,7 @@ def _get_exif(self, path: str) -> dict: image = Image.open(path) exif_dict = piexif.load(image.info["exif"]).get("Exif", {}) if "exif" in image.info else {} exif = {ExifTags.TAGS[k]: v for k, v in exif_dict.items() if k in ExifTags.TAGS} - encoded_dict = {key_renamed: encode_function(exif.get(key)) - for key, (key_renamed, encode_function) in self.keys.items() if key in exif} + encoded_dict = {key_renamed: encode_function(exif.get(key)) for key, (key_renamed, encode_function) in self.keys.items() if key in exif} encoded_dict = {k: v for k, v in encoded_dict.items() if k is not None if v is not None} image.close() return encoded_dict diff --git a/dedoc/readers/docx_reader/numbering_extractor.py b/dedoc/readers/docx_reader/numbering_extractor.py index 37fe8591..4378adab 100644 --- a/dedoc/readers/docx_reader/numbering_extractor.py +++ b/dedoc/readers/docx_reader/numbering_extractor.py @@ -329,12 +329,7 @@ def parse(self, lvl_list: List[Tag]) -> None: class Num(AbstractNum): - - def __init__(self, - num_id: str, - abstract_num_dict: Dict[str, Tag], - num_dict: Dict[str, Tag], - styles_extractor: StylesExtractor) -> None: + def __init__(self, num_id: str, abstract_num_dict: Dict[str, Tag], num_dict: Dict[str, Tag], styles_extractor: StylesExtractor) -> None: """ :param num_id: numId for num element :param abstract_num_dict: dictionary with abstractNum BeautifulSoup trees diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py index 1738614b..fe97614a 100644 --- a/dedoc/readers/html_reader/html_reader.py +++ b/dedoc/readers/html_reader/html_reader.py @@ -54,8 +54,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true" path_hash = calculate_file_hash(path=path) lines = self.__read_blocks(soup, path_hash=path_hash, handle_invisible_table=handle_invisible_table) - tables = [self._read_table(table, path_hash) for table in soup.find_all("table") - if self._visible_table(table, handle_invisible_table=handle_invisible_table)] + tables = [ + self._read_table(table, path_hash) for table in soup.find_all("table") if self._visible_table(table, handle_invisible_table=handle_invisible_table) + ] document = UnstructuredDocument(tables=tables, lines=lines, attachments=[]) document_postprocess = self.postprocessor.postprocess(document) return document_postprocess @@ -102,10 +103,7 @@ def __handle_single_tag(self, tag: Tag, uid: str) -> List[LineWithMeta]: line.metadata.extend_other_fields({"html_tag": tag.name}) return [line] - def __read_blocks(self, - block: Tag, - path_hash: str = "", - handle_invisible_table: bool = False) -> List[LineWithMeta]: + def __read_blocks(self, block: Tag, path_hash: str = "", handle_invisible_table: bool = False) -> List[LineWithMeta]: uid = hashlib.md5((path_hash + str(block.name)).encode()).hexdigest() if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table): return [] @@ -125,12 +123,7 @@ def _handle_text_line(self, block: str, path_hash: str, ignore_space: bool = Tru line = self.__make_line(block, HierarchyLevel.unknown, 0, uid=uid, path_hash=path_hash) return [line] - def __make_line(self, line: str, - line_type: str, - header_level: int = 0, - uid: str = None, - path_hash: str = None, - annotations: List = None) -> LineWithMeta: + def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str = None, path_hash: str = None, annotations: List = None) -> LineWithMeta: if annotations is None: annotations = [] @@ -176,12 +169,7 @@ def __read_list(self, lst: Tag, uid: str, path_hash: str, handle_invisible_table lines.extend(item_lines) return lines - def __handle_list_item(self, - item: Tag, - item_index: int, - list_type: str, - path_hash: str, - handle_invisible_table: bool) -> List[LineWithMeta]: + def __handle_list_item(self, item: Tag, item_index: int, list_type: str, path_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]: lines = [] header_line = self.__get_li_header(list_type=list_type, index=item_index) block_lines = self.__handle_block(item, uid=path_hash, handle_invisible_table=handle_invisible_table) diff --git a/dedoc/readers/html_reader/html_tags.py b/dedoc/readers/html_reader/html_tags.py index a0dc032a..2c90681c 100644 --- a/dedoc/readers/html_reader/html_tags.py +++ b/dedoc/readers/html_reader/html_tags.py @@ -1,20 +1,8 @@ class HtmlTags: - service_tags = ["script", "style"] list_items = ["li", "dd", "dt"] - block_tags = ["aside", - "article", - "body", - "div", - "footer", - "header", - "html", - "main", - "nav", - "section", - "form" - ] + list_items + block_tags = ["aside", "article", "body", "div", "footer", "header", "html", "main", "nav", "section", "form", *list_items] unordered_list = ["ul", "dl", "dir"] ordered_list = ["ol"] list_tags = unordered_list + ordered_list @@ -31,35 +19,10 @@ class HtmlTags: paragraphs = ["p"] + block_tags + list_items + header_tags styled_tag = bold_tags + italic_tags + underlined_tags + strike_tags + superscript_tags + subscript_tags - simple_text_tags = ["a", - "abbr", - "acronym", - "applet", - "area", - "article", - "aside", - "bdi", - "bdo", - "big", - "blockquote", - "canvas", - "caption", - "center", - "cite", - "code", - "data", - "font", - "kbd", - "mark", - "output", - "p", - "pre", - "q", - "samp", - "small", - "span", - "tt", - "wbr"] + simple_text_tags = [ + "a", "abbr", "acronym", "applet", "area", "article", "aside", "bdi", "bdo", "big", "blockquote", "canvas", "caption", "center", "cite", "code", "data", + "font", "kbd", "mark", "output", "p", "pre", "q", "samp", "small", "span", "tt", "wbr" + ] text_tags = simple_text_tags + styled_tag table_tags = ["table"] diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py index 3ce1d83c..84374afd 100644 --- a/dedoc/readers/mhtml_reader/mhtml_reader.py +++ b/dedoc/readers/mhtml_reader/mhtml_reader.py @@ -60,8 +60,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio tables.extend(result.tables) need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true" - attachments_names = [os.path.join(os.path.basename(os.path.dirname(file_name)), os.path.basename(file_name)) - for file_name in names_list if file_name not in names_html] + attachments_names = [ + os.path.join(os.path.basename(os.path.dirname(file_name)), os.path.basename(file_name)) for file_name in names_list if file_name not in names_html + ] attachments = self.__get_attachments(save_dir=save_dir, names_list=attachments_names, need_content_analysis=need_content_analysis) return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments) diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index a370c33c..6ed650ef 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -145,8 +145,9 @@ def __merge_documents(self, first: UnstructuredDocument, second: UnstructuredDoc for line in chain(first.lines, second.lines): line.metadata.line_id = line_id line_id += 1 - annotations = [annotation for annotation in line.annotations - if not (isinstance(annotation, TableAnnotation) and annotation.value in dropped_tables)] + annotations = [ + annotation for annotation in line.annotations if not (isinstance(annotation, TableAnnotation) and annotation.value in dropped_tables) + ] new_line = LineWithMeta(line=line.line, metadata=line.metadata, annotations=annotations, uid=line.uid) lines.append(new_line) return UnstructuredDocument(tables=tables, lines=lines, attachments=first.attachments + second.attachments, metadata=second.metadata) diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index d5474f04..6812c749 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -34,18 +34,20 @@ from dedoc.utils.utils import flatten from dedoc.utils.utils import get_file_mime_type, splitext_ -ParametersForParseDoc = namedtuple("ParametersForParseDoc", ["orient_analysis_cells", - "orient_cell_angle", - "is_one_column_document", - "document_orientation", - "document_type", - "language", - "need_header_footers_analysis", - "need_pdf_table_analysis", - "first_page", - "last_page", - "need_binarization", - "table_type"]) +ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ + "orient_analysis_cells", + "orient_cell_angle", + "is_one_column_document", + "document_orientation", + "document_type", + "language", + "need_header_footers_analysis", + "need_pdf_table_analysis", + "first_page", + "last_page", + "need_binarization", + "table_type" +]) class PdfBaseReader(BaseReader): @@ -110,17 +112,15 @@ def _can_contain_attachements(self, path: str) -> bool: can_contain_attachments = True return can_contain_attachments - def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> Tuple[List[LineWithMeta], - List[ScanTable], - List[PdfImageAttachment], - List[str], - Optional[dict]]: + def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( + Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]: first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page last_page = math.inf if parameters.last_page is None else parameters.last_page images = self._get_images(path, first_page, last_page) - result = Parallel(n_jobs=self.config["n_jobs"])(delayed(self._process_one_page)(image, parameters, page_number, path) - for page_number, image in enumerate(images, start=first_page)) + result = Parallel(n_jobs=self.config["n_jobs"])( + delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page) + ) page_count = get_pdf_page_count(path) page_count = math.inf if page_count is None else page_count diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py index 80442aee..b3b0790b 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py @@ -33,8 +33,7 @@ def __getitem__(self, idx: torch.Tensor) -> Dict[str, str]: if torch.is_tensor(idx): idx = idx.tolist() - img_name = os.path.join(self.root_dir, - self.label_loader.iloc[idx, 0]) + img_name = os.path.join(self.root_dir, self.label_loader.iloc[idx, 0]) image = io.imread(img_name) label = self.label_loader.iloc[idx, 1:] orientation = label["orientation"] diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/transforms.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/transforms.py index 14b88b30..4aae4823 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/transforms.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/transforms.py @@ -4,8 +4,7 @@ from PIL import Image from torchvision import transforms -from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import \ - ColumnsOrientationClassifier +from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier class ImageTransform(object): diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py index 924bf329..4cf80fc7 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py @@ -129,8 +129,9 @@ def __create_lines_with_meta(self, tree_nodes: List["TableTree"], original_box_t text_line += OCRCellExtractor.get_line_with_meta(" ", bbox=word.bbox, image=original_image) # add confidence value text_line += OCRCellExtractor.get_line_with_meta(text=word.text, bbox=word.bbox, image=original_image, - confidences=[ConfidenceAnnotation(start=0, end=len(word.text), - value=word.confidence / 100.)]) + confidences=[ + ConfidenceAnnotation(start=0, end=len(word.text), value=word.confidence / 100.) + ]) if len(text_line) > 0: # add new line cell_lines.append(text_line) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py index b48d7fd6..162a7b31 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py @@ -12,11 +12,7 @@ class OCRLineExtractor: def __init__(self, *, config: dict) -> None: self.config = config - def split_image2lines(self, - image: np.ndarray, - page_num: int, - language: str = "rus+eng", - is_one_column_document: bool = True) -> PageWithBBox: + def split_image2lines(self, image: np.ndarray, page_num: int, language: str = "rus+eng", is_one_column_document: bool = True) -> PageWithBBox: bboxes = self.__split_image2bboxes(image=image, page_num=page_num, language=language, is_one_column_document=is_one_column_document) filtered_bboxes = list(self._filtered_bboxes(bboxes)) @@ -32,8 +28,10 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str, output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_threshold) height, width = image.shape[:2] - line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height)) - for line_num, line in enumerate(output_dict.lines)] + line_boxes = [ + TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height)) + for line_num, line in enumerate(output_dict.lines) + ] return line_boxes diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py index a6593496..0e72128c 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py @@ -62,10 +62,7 @@ def split(self, cells: List[List[Cell]]) -> List[List[Cell]]: return result_matrix @staticmethod - def __split_one_cell(cell: Cell, - horizontal_borders: np.ndarray, - vertical_borders: np.ndarray, - result_matrix: List[List[Cell]]) -> None: + def __split_one_cell(cell: Cell, horizontal_borders: np.ndarray, vertical_borders: np.ndarray, result_matrix: List[List[Cell]]) -> None: left_id, right_id = np.searchsorted(vertical_borders, [cell.x_top_left, cell.x_bottom_right]) top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.y_top_left, cell.y_bottom_right]) colspan = right_id - left_id diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index 4729c2ca..c964c9a6 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -53,8 +53,7 @@ def split_last_column(matrix_table: List[List[Cell]], language: str, image: np.a union_cells = [prev_cell] continue - if row_id == len(last_column) - 1 and len(union_cells) > 1 or \ - cell.id_con != prev_cell.id_con and len(union_cells) > 1: + if row_id == len(last_column) - 1 and len(union_cells) > 1 or cell.id_con != prev_cell.id_con and len(union_cells) > 1: result_matrix[start_union_cell:start_union_cell + len(union_cells)] = \ _split_each_row(union_cells, matrix_table[start_union_cell:start_union_cell + len(union_cells)], language=language, image=image) union_cells = [cell] @@ -162,11 +161,10 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra if len(text_line) != 0: text_line += OCRCellExtractor.get_line_with_meta(" ", bbox=word.bbox, image=page_image) # add confidence value - text_line += OCRCellExtractor.get_line_with_meta(text=word.text, bbox=word.bbox, image=page_image, - confidences=[ - ConfidenceAnnotation(start=0, - end=len(word.text), - value=0. if word.confidence < 0 else word.confidence / 100.)]) + text_line += OCRCellExtractor.get_line_with_meta( + text=word.text, bbox=word.bbox, image=page_image, + confidences=[ConfidenceAnnotation(start=0, end=len(word.text), value=0. if word.confidence < 0 else word.confidence / 100.)] + ) if len(text_line) > 0: # add new line cell_lines.append(text_line) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py index 08adefbd..9f85bc5d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py @@ -165,8 +165,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np # ----- search height, width table ----- # # ----- detect gap for houph ------- # contours, hierarchy = cv2.findContours(image, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS) - contours_table = [cv2.boundingRect(c) for ind, c in enumerate(contours) - if hierarchy[0][ind][3] == 0 and hierarchy[0][ind][2] != -1] # List[[x,y,w,h]] + contours_table = [cv2.boundingRect(c) for ind, c in enumerate(contours) if hierarchy[0][ind][3] == 0 and hierarchy[0][ind][2] != -1] # List[[x,y,w,h]] if len(contours_table) > 0: gap_avg = min(np.mean([c[3] for c in contours_table]) // 35, 100) gap_avg = min(np.mean([c[2] for c in contours_table]) // 45, gap_avg) diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index e708dd5a..3692d19e 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -239,8 +239,7 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_ def __jar_path(self) -> str: return os.environ.get("TABBY_JAR", self.default_config["JAR_PATH"]) - def __run(self, path: str = None, encoding: str = "utf-8", - start_page: int = None, end_page: int = None) -> bytes: + def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes: args = ["java"] + ["-jar", self.__jar_path(), "-i", path] if start_page is not None and end_page is not None: args += ["-sp", str(start_page), "-ep", str(end_page)] diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py index f467f063..29e3c39a 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py @@ -147,14 +147,7 @@ def __get_interpreter(self) -> Tuple[PDFPageAggregator, PDFPageInterpreter]: interpreter = PDFPageInterpreter(rsrcmgr, device) return device, interpreter - def get_info_layout_object(self, - lobj: LTContainer, - page_num: int, - line_num: int, - k_w: float, - k_h: float, - height: int, - width: int) -> TextWithBBox: + def get_info_layout_object(self, lobj: LTContainer, page_num: int, line_num: int, k_w: float, k_h: float, height: int, width: int) -> TextWithBBox: # 1 - converting coordinate from pdf format into image bbox = create_bbox(height, k_h, k_w, lobj) # 2 - extract text and text annotations from current object @@ -220,11 +213,13 @@ def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h words.append(word) word = WordObj(start=item + 1, end=item + 1, value=LTTextContainer()) - annotations = [BBoxAnnotation(start=word.start, - end=word.end, - value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value), - page_width=width, - page_height=height) for word in words] + annotations = [ + BBoxAnnotation(start=word.start, + end=word.end, + value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value), + page_width=width, + page_height=height) for word in words + ] return annotations def _get_new_weight(self) -> str: diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py index 8d9a9125..a0661ff8 100644 --- a/dedoc/readers/pdf_reader/utils/line_object_linker.py +++ b/dedoc/readers/pdf_reader/utils/line_object_linker.py @@ -59,10 +59,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i best_line.annotations.append(annotation) # noqa return lines - def _add_lines(self, - all_objects: List[Union[LineWithLocation, ScanTable, PdfImageAttachment]], - lines_key: str, - objects_with_line_candidate: dict) -> None: + def _add_lines(self, all_objects: List[Union[LineWithLocation, ScanTable, PdfImageAttachment]], lines_key: str, objects_with_line_candidate: dict) -> None: lines_deque = deque(maxlen=self.n_lines) for page_object in all_objects: if isinstance(page_object, LineWithLocation): diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py index 2fc66976..0428ae56 100644 --- a/dedoc/readers/pptx_reader/pptx_reader.py +++ b/dedoc/readers/pptx_reader/pptx_reader.py @@ -46,8 +46,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio lines.append(LineWithMeta(line=shape.text, metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) if shape.has_table: - cells = [[CellWithMeta(lines=[ - LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells] for row in shape.table.rows + cells = [ + [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells] + for row in shape.table.rows ] tables.append(Table(cells=cells, metadata=TableMetadata(page_id=page_id))) diff --git a/dedoc/structure_constructors/table_patcher.py b/dedoc/structure_constructors/table_patcher.py index bba5c8bc..34d3e56e 100644 --- a/dedoc/structure_constructors/table_patcher.py +++ b/dedoc/structure_constructors/table_patcher.py @@ -21,8 +21,9 @@ def insert_table(self, document: UnstructuredDocument) -> UnstructuredDocument: """ tables_dict = {table.metadata.uid: table for table in document.tables if not table.metadata.is_inserted} paragraphs = [] - hierarchy_level = max((line.metadata.hierarchy_level.level_1 for line in document.lines - if line.metadata.hierarchy_level.level_1 is not None), default=0) + hierarchy_level = max( + (line.metadata.hierarchy_level.level_1 for line in document.lines if line.metadata.hierarchy_level.level_1 is not None), default=0 + ) for line in document.lines: if line.metadata.hierarchy_level.is_raw_text(): hierarchy_level_raw_text = HierarchyLevel(level_1=hierarchy_level + 1, diff --git a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py index cf2e97ad..8a9dc4f8 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py @@ -29,18 +29,20 @@ class LawDocType(Enum): @staticmethod def doc_types() -> List[str]: # order is important - return [LawDocType.definition, - LawDocType.order, - LawDocType.bylaw, - LawDocType.code, - LawDocType.federal_law, - LawDocType.edict, - LawDocType.law, - LawDocType.decree, - LawDocType.directive, - LawDocType.constitution, - LawDocType.state, - LawDocType.instruction] + return [ + LawDocType.definition, + LawDocType.order, + LawDocType.bylaw, + LawDocType.code, + LawDocType.federal_law, + LawDocType.edict, + LawDocType.law, + LawDocType.decree, + LawDocType.directive, + LawDocType.constitution, + LawDocType.state, + LawDocType.instruction + ] @staticmethod def foiv_types() -> List["LawDocType"]: diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index 4ac9b075..ed65170a 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -72,9 +72,8 @@ def get_list_hl_with_regexp(line: LineWithMeta, previous_line: Optional[LineWith if prefix.name == BracketPrefix.name: # list like 1) # check if tesseract recognize russian б as 6 (bi as six) - if prefix.prefix_num == 6 and previous_line is not None and \ - previous_line.line.lower().strip().startswith(("a)", "а)")): # here is russian and english letters - return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item) + if prefix.prefix_num == 6 and previous_line is not None and previous_line.line.lower().strip().startswith(("a)", "а)")): + return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item) # here is russian and english letters return HierarchyLevel(3, 1, False, line_type=HierarchyLevel.list_item) if prefix.name == LetterPrefix.name: # list like a) diff --git a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py index 09c2c9eb..d1ce8818 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py @@ -53,8 +53,9 @@ def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> toc_lines = [(line, "toc") for line in toc_lines] header_lines = [(line, "title") for line in lines if line.metadata.tag_hierarchy_level.line_type == "title"] - body_lines = [(line, line.metadata.tag_hierarchy_level.line_type) for line in lines if - line.metadata.tag_hierarchy_level.line_type not in ("title", "toc")] + body_lines = [ + (line, line.metadata.tag_hierarchy_level.line_type) for line in lines if line.metadata.tag_hierarchy_level.line_type not in ("title", "toc") + ] header_lines = self.header_builder.get_lines_with_hierarchy(lines_with_labels=header_lines, init_hl_depth=0) toc_lines = self.toc_builder.get_lines_with_hierarchy(lines_with_labels=toc_lines, init_hl_depth=1) diff --git a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py index e5008a50..47c1bb2c 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py @@ -20,10 +20,12 @@ class FoivLawStructureExtractor(AbstractLawStructureExtractor): def __init__(self, *, config: dict) -> None: super().__init__(config=config) - self.hierarchy_level_builders = [HeaderHierarchyLevelBuilder(), - BodyFoivHierarchyLevelBuilder(), - CellarHierarchyLevelBuilder(), - ApplicationFoivHierarchyLevelBuilder()] + self.hierarchy_level_builders = [ + HeaderHierarchyLevelBuilder(), + BodyFoivHierarchyLevelBuilder(), + CellarHierarchyLevelBuilder(), + ApplicationFoivHierarchyLevelBuilder() + ] self.regexps_subitem_with_number = BodyFoivHierarchyLevelBuilder.regexps_subitem_with_number self.regexps_subitem_with_char = BodyFoivHierarchyLevelBuilder.regexps_subitem_with_char self.regexps_ends_of_number = regexps_ends_of_number diff --git a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py index b46f8c7d..f2bed5eb 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py @@ -21,10 +21,12 @@ class LawStructureExtractor(AbstractLawStructureExtractor): def __init__(self, *, config: dict) -> None: super().__init__(config=config) - self.hierarchy_level_builders = [HeaderHierarchyLevelBuilder(), - BodyLawHierarchyLevelBuilder(), - CellarHierarchyLevelBuilder(), - ApplicationLawHierarchyLevelBuilder()] + self.hierarchy_level_builders = [ + HeaderHierarchyLevelBuilder(), + BodyLawHierarchyLevelBuilder(), + CellarHierarchyLevelBuilder(), + ApplicationLawHierarchyLevelBuilder() + ] self.regexps_item = re.compile(r"^\s*(\d*\.)*\d+[\)|\}]") self.regexps_part = regexps_number self.regexps_subitem = re.compile(r"^\s*[а-яё]\)") diff --git a/dedoc/structure_extractors/feature_extractors/abstract_extractor.py b/dedoc/structure_extractors/feature_extractors/abstract_extractor.py index 161ed9d9..49d11bc4 100644 --- a/dedoc/structure_extractors/feature_extractors/abstract_extractor.py +++ b/dedoc/structure_extractors/feature_extractors/abstract_extractor.py @@ -67,10 +67,14 @@ def prev_next_line_features(self, matrix: pd.DataFrame, n_prev: int, n_next: int add previous and next features with their names """ feature_names = matrix.columns - prev_line_features = [pd.DataFrame(data=self._prev_line_features(matrix.values, i), columns=self._create_features_name(feature_names, "prev", i)) - for i in range(1, n_prev + 1)] - next_line_features = [pd.DataFrame(data=self._next_line_features(matrix.values, i), columns=self._create_features_name(feature_names, "next", i)) - for i in range(1, n_next + 1)] + prev_line_features = [ + pd.DataFrame(data=self._prev_line_features(matrix.values, i), columns=self._create_features_name(feature_names, "prev", i)) + for i in range(1, n_prev + 1) + ] + next_line_features = [ + pd.DataFrame(data=self._next_line_features(matrix.values, i), columns=self._create_features_name(feature_names, "next", i)) + for i in range(1, n_next + 1) + ] matrices = [matrix] + prev_line_features + next_line_features result_matrix = pd.concat(matrices, axis=1) @@ -107,8 +111,9 @@ def _get_bold(line: LineWithMeta) -> float: @staticmethod def _get_bold_percent(line: LineWithMeta) -> float: - bold_character_number = sum([annotation.end - annotation.start for annotation in line.annotations - if annotation.name == BoldAnnotation.name and annotation.value == "True"]) + bold_character_number = sum([ + annotation.end - annotation.start for annotation in line.annotations if annotation.name == BoldAnnotation.name and annotation.value == "True" + ]) if len(line.line) == 0: return 0 return bold_character_number / len(line.line) diff --git a/dedoc/structure_extractors/feature_extractors/law_text_features.py b/dedoc/structure_extractors/feature_extractors/law_text_features.py index 2acbb3ae..333af89a 100644 --- a/dedoc/structure_extractors/feature_extractors/law_text_features.py +++ b/dedoc/structure_extractors/feature_extractors/law_text_features.py @@ -17,8 +17,7 @@ class LawTextFeatures(AbstractFeatureExtractor): named_regexp = [ re.compile(r"^(Статья|(Г|г)лава|ГЛАВА|ЧАСТЬ|Часть|Раздел|РАЗДЕЛ|\$|§)\s*((\d+\.*)+|[IVXХxхviУП]{1,3}\.?)\s*") ] - roman_regexp = re.compile(r"\s*(I|Г|T|Т|II|П|III|Ш|ТУ|TУ|IV|V|У|VI|УТ|УT|VII|УТТ|VIII|I[XХ]|[XХ]|[XХ]I|[XХ]II)\.\s+" - ) + roman_regexp = re.compile(r"\s*(I|Г|T|Т|II|П|III|Ш|ТУ|TУ|IV|V|У|VI|УТ|УT|VII|УТТ|VIII|I[XХ]|[XХ]|[XХ]I|[XХ]II)\.\s+") regexp_application_begin = re.compile( r"^(\'|\")?(((П|п)риложение)|((У|у)твержден)[оаы]?){1}(( )*([№nN]?( )*(\d){1,3})?( )*)" r"((к распоряжению)|(к постановлению)|(к приказу))?\s*$" diff --git a/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py b/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py index 8c392ef0..7a048bb1 100644 --- a/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py +++ b/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py @@ -70,9 +70,11 @@ def _one_line_features(self, line: LineWithMeta, prefix: LinePrefix, line_id: in same_indent /= window_size predecessor_num_same_indent /= window_size predecessor_num /= window_size - return {f"same_indent_{self.window_size}": same_indent, - f"predecessor_num_same_indent_{self.window_size}": predecessor_num_same_indent, - f"predecessor_num_{self.window_size}": predecessor_num} + return { + f"same_indent_{self.window_size}": same_indent, + f"predecessor_num_same_indent_{self.window_size}": predecessor_num_same_indent, + f"predecessor_num_{self.window_size}": predecessor_num + } def _same_indent(self, this_indent: float, other_indent: float, std: float) -> bool: eps = 1 diff --git a/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py index 6db29463..28fab042 100644 --- a/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py +++ b/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py @@ -10,11 +10,12 @@ class TOCFeatureExtractor: end_with_num = re.compile(r"(.*[^\s.…])?[….\s]+(\d{1,3})(-\d{1,3})?$") window_size = 5 - titles = ("tableofcontents", "contents", "tableofcontentspage", # english - "содержание", "оглавление", # russian - "tabledesmatières", "tabledesmatieres", "sommaire", # french - "indice", "índice", "contenidos", "tabladecontenido" # spanish - ) + titles = ( + "tableofcontents", "contents", "tableofcontentspage", # english + "содержание", "оглавление", # russian + "tabledesmatières", "tabledesmatieres", "sommaire", # french + "indice", "índice", "contenidos", "tabladecontenido" # spanish + ) def get_toc(self, document: List[LineWithMeta]) -> List[dict]: """ diff --git a/dedoc/structure_extractors/feature_extractors/tz_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/tz_feature_extractor.py index d46ff663..fce1cef1 100644 --- a/dedoc/structure_extractors/feature_extractors/tz_feature_extractor.py +++ b/dedoc/structure_extractors/feature_extractors/tz_feature_extractor.py @@ -55,7 +55,8 @@ def __process_document(self, lines: List[LineWithMeta]) -> pd.DataFrame: features_df = pd.DataFrame({ "toc": self._before_special_line(lines, self.__find_toc), "tz": self._before_special_line(lines, self.__find_tz), - "list_item": self._list_features(lines)}) + "list_item": self._list_features(lines) + }) page_ids = [line.metadata.page_id for line in lines] if page_ids: diff --git a/dedoc/structure_extractors/hierarchy_level_builders/header_builder/header_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/header_builder/header_hierarchy_level_builder.py index f1f6b236..746c383e 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/header_builder/header_hierarchy_level_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/header_builder/header_hierarchy_level_builder.py @@ -10,11 +10,7 @@ class HeaderHierarchyLevelBuilder(AbstractHierarchyLevelBuilder): document_types = ["foiv", "law"] starting_line_types = ["header"] - def _line_2level(self, - text: str, - label: str, - init_hl_depth: int, - previous_hl: HierarchyLevel = None) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: + def _line_2level(self, text: str, label: str, init_hl_depth: int, previous_hl: HierarchyLevel = None) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: hl = HierarchyLevel.create_root() return hl, hl diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py index 9f856cb5..1343e7d6 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py @@ -55,11 +55,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s return result - def _line_2level(self, - text: str, - label: str, - init_hl_depth: int, - previous_hl: HierarchyLevel = None) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: + def _line_2level(self, text: str, label: str, init_hl_depth: int, previous_hl: HierarchyLevel = None) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: text = text.strip() if len(text) == 0: label = HierarchyLevel.raw_text diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py index ec326538..95fb9b05 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py @@ -64,11 +64,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s result.append(self.get_body_line(init_hl_depth=init_hl_depth)) return result - def _line_2level(self, - text: str, - label: str, - init_hl_depth: int, - previous_hl: HierarchyLevel = None) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: + def _line_2level(self, text: str, label: str, init_hl_depth: int, previous_hl: HierarchyLevel = None) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: text = text.strip() if label == "header": label = "raw_text" diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/abstract_structure_unit.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/abstract_structure_unit.py index bf42c504..35dbb5f1 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/abstract_structure_unit.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/abstract_structure_unit.py @@ -7,8 +7,5 @@ class AbstractStructureUnit(abc.ABC): @abc.abstractmethod - def structure_unit(self, - text: str, - init_hl_depth: int, - previous_hl: Optional[HierarchyLevel]) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: + def structure_unit(self, text: str, init_hl_depth: int, previous_hl: Optional[HierarchyLevel]) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: pass diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py index e8102e8b..53666e89 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py @@ -12,10 +12,7 @@ class FoivStructureUnitBuilder(AbstractStructureUnit): regexps_subitem_with_char = regexps_subitem regexps_subitem_with_number = regexps_item_with_bracket - def structure_unit(self, - text: str, - init_hl_depth: int, - previous_hl: Optional[HierarchyLevel]) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: + def structure_unit(self, text: str, init_hl_depth: int, previous_hl: Optional[HierarchyLevel]) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: if text.lower().startswith("глава") or LawTextFeatures.roman_regexp.match(text): hl = HierarchyLevel(init_hl_depth + 4, 0, True, "chapter") return hl, hl diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py index 8be73e92..313fde91 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py @@ -14,10 +14,7 @@ class LawStructureUnitBuilder(AbstractStructureUnit): ends_of_number = AbstractFeatureExtractor.ends_of_number regexps_subitem = regexps_subitem - def structure_unit(self, - text: str, - init_hl_depth: int, - previous_hl: Optional[HierarchyLevel]) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: + def structure_unit(self, text: str, init_hl_depth: int, previous_hl: Optional[HierarchyLevel]) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: if text.lower().startswith("часть"): hl = HierarchyLevel(init_hl_depth + 1, 0, True, "part") # 3 return hl, hl diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/stub_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/stub_hierarchy_level_builder.py index cffa168e..7dcb8b09 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/stub_hierarchy_level_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/stub_hierarchy_level_builder.py @@ -8,11 +8,7 @@ class StubHierarchyLevelBuilder(AbstractHierarchyLevelBuilder): starting_line_types = ["no_tag"] - def _line_2level(self, - text: str, - label: str, - init_hl_depth: int, - previous_hl: HierarchyLevel = None) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: + def _line_2level(self, text: str, label: str, init_hl_depth: int, previous_hl: HierarchyLevel = None) -> Tuple[HierarchyLevel, Optional[HierarchyLevel]]: pass def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]: diff --git a/dedoc/structure_extractors/line_type_classifiers/diploma_classifier.py b/dedoc/structure_extractors/line_type_classifiers/diploma_classifier.py index b4d2828c..95ec562a 100644 --- a/dedoc/structure_extractors/line_type_classifiers/diploma_classifier.py +++ b/dedoc/structure_extractors/line_type_classifiers/diploma_classifier.py @@ -2,8 +2,7 @@ from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.feature_extractors.diploma_feature_extractor import DiplomaFeatureExtractor -from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import \ - AbstractPickledLineTypeClassifier +from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import AbstractPickledLineTypeClassifier class DiplomaLineTypeClassifier(AbstractPickledLineTypeClassifier): diff --git a/dedoc/structure_extractors/line_type_classifiers/law_classifier.py b/dedoc/structure_extractors/line_type_classifiers/law_classifier.py index a8677189..b79e6be7 100644 --- a/dedoc/structure_extractors/line_type_classifiers/law_classifier.py +++ b/dedoc/structure_extractors/line_type_classifiers/law_classifier.py @@ -5,8 +5,7 @@ from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures -from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import \ - AbstractPickledLineTypeClassifier +from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import AbstractPickledLineTypeClassifier class LawLineTypeClassifier(AbstractPickledLineTypeClassifier): diff --git a/dedoc/structure_extractors/line_type_classifiers/tz_classifier.py b/dedoc/structure_extractors/line_type_classifiers/tz_classifier.py index 47ea4b0c..e050d2ac 100644 --- a/dedoc/structure_extractors/line_type_classifiers/tz_classifier.py +++ b/dedoc/structure_extractors/line_type_classifiers/tz_classifier.py @@ -5,8 +5,7 @@ from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.feature_extractors.tz_feature_extractor import TzTextFeatures -from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import \ - AbstractPickledLineTypeClassifier +from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import AbstractPickledLineTypeClassifier class TzLineTypeClassifier(AbstractPickledLineTypeClassifier): diff --git a/dedoc/train_dataset/taskers/concrete_taskers/filtered_line_label_tasker.py b/dedoc/train_dataset/taskers/concrete_taskers/filtered_line_label_tasker.py index 190921da..56d3269d 100644 --- a/dedoc/train_dataset/taskers/concrete_taskers/filtered_line_label_tasker.py +++ b/dedoc/train_dataset/taskers/concrete_taskers/filtered_line_label_tasker.py @@ -22,8 +22,7 @@ def __init__(self, item2label: Callable = None, *, config: dict) -> None: - super().__init__(path2bboxes, path2lines, path2docs, manifest_path, config_path, tmp_dir, progress_bar, - item2label, config=config) + super().__init__(path2bboxes, path2lines, path2docs, manifest_path, config_path, tmp_dir, progress_bar, item2label, config=config) # we can use page numbers only in pdf self.images_creators = ImageCreatorComposition(creators=[ ScannedImagesCreator(path2docs=self.path2docs), @@ -39,8 +38,7 @@ def _get_pages(self) -> List[List[dict]]: doc_lines = [self._make_line_with_meta(line) for line in page] doc_toc = self.toc_extractor.get_toc(doc_lines) # acual_page_num - 1 = page_id - page_numbers = [0, 1] + [int(item["page"]) - 1 for item in doc_toc] + [int(item["page"]) for item in - doc_toc] + page_numbers = [0, 1] + [int(item["page"]) - 1 for item in doc_toc] + [int(item["page"]) for item in doc_toc] doc_line_dicts = [line for line in page if int(line["_metadata"]["page_id"]) in page_numbers] filtered_pages.append(doc_line_dicts) return filtered_pages diff --git a/dedoc/train_dataset/taskers/images_creators/concrete_creators/txt_images_creator.py b/dedoc/train_dataset/taskers/images_creators/concrete_creators/txt_images_creator.py index 2feb3a84..d8e9c082 100644 --- a/dedoc/train_dataset/taskers/images_creators/concrete_creators/txt_images_creator.py +++ b/dedoc/train_dataset/taskers/images_creators/concrete_creators/txt_images_creator.py @@ -31,8 +31,7 @@ def __init__(self, path2docs: str, *, config: dict) -> None: self.background_color = (255, 255, 255) self.border_color = (0, 0, 0) - font_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "resources", "Arial_Narrow.ttf")) + font_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "resources", "Arial_Narrow.ttf")) self.font = ImageFont.truetype(font_path, self.font_size) self.txt_reader = RawTextReader(config=config) diff --git a/dedoc/train_dataset/trainer/base_sklearn_line_classifier.py b/dedoc/train_dataset/trainer/base_sklearn_line_classifier.py index a7298adf..c2c6b28c 100644 --- a/dedoc/train_dataset/trainer/base_sklearn_line_classifier.py +++ b/dedoc/train_dataset/trainer/base_sklearn_line_classifier.py @@ -179,8 +179,7 @@ def _cross_val(self, data: List[List[LineWithLabel]], save_errors_images: bool) scores_dict["mean"] = mean(scores) scores_dict["scores"] = scores csv_path = self.__save(data=data.tolist(), path=self.dataset_dir, csv_only=True) - self.errors_saver.save_errors(error_cnt=error_cnt, errors_uids=list(set(errors_uids)), - save_errors_images=save_errors_images, csv_path=csv_path) + self.errors_saver.save_errors(error_cnt=error_cnt, errors_uids=list(set(errors_uids)), save_errors_images=save_errors_images, csv_path=csv_path) return scores_dict def __get_labels(self, data: List[List[LineWithLabel]]) -> List[str]: diff --git a/dedoc/train_dataset/trainer/errors_saver.py b/dedoc/train_dataset/trainer/errors_saver.py index af90ba52..ae7fd26e 100644 --- a/dedoc/train_dataset/trainer/errors_saver.py +++ b/dedoc/train_dataset/trainer/errors_saver.py @@ -76,9 +76,11 @@ def __save_images(self, errors_uids: List[str], csv_path: str) -> None: with zipfile.ZipFile(self.dataset_path, "r") as dataset_archive: dataset_archive.extractall(documents_tmp_dir) path2docs = os.path.join(documents_tmp_dir, "original_documents") - images_creators = [ScannedImagesCreator(path2docs=path2docs), - # DocxImagesCreator(path2docs=path2docs, config=_config), - TxtImagesCreator(path2docs=path2docs, config=self.config)] + images_creators = [ + ScannedImagesCreator(path2docs=path2docs), + # DocxImagesCreator(path2docs=path2docs, config=_config), + TxtImagesCreator(path2docs=path2docs, config=self.config) + ] self.__group_data(os.path.join(documents_tmp_dir, "labeled.json")) dataset = pd.read_csv(csv_dataset_path) @@ -86,11 +88,9 @@ def __save_images(self, errors_uids: List[str], csv_path: str) -> None: ready_documents, ready_images = self.__prepare_files() - with zipfile.ZipFile(self.images_archive, "a") as images_archive, \ - zipfile.ZipFile(self.errors_images_archive, "w") as errors_images_archive: + with zipfile.ZipFile(self.images_archive, "a") as images_archive, zipfile.ZipFile(self.errors_images_archive, "w") as errors_images_archive: for uid in tqdm(errors_uids): - self.__process_uid(errors_images_archive, filtered_dataset, images_archive, images_creators, - ready_documents, ready_images, uid) + self.__process_uid(errors_images_archive, filtered_dataset, images_archive, images_creators, ready_documents, ready_images, uid) def __process_uid(self, errors_images_archive: zipfile.ZipFile, diff --git a/dedoc/train_dataset/trainer/line_lstm_classifier_trainer.py b/dedoc/train_dataset/trainer/line_lstm_classifier_trainer.py index a0f3bd38..e34b8194 100644 --- a/dedoc/train_dataset/trainer/line_lstm_classifier_trainer.py +++ b/dedoc/train_dataset/trainer/line_lstm_classifier_trainer.py @@ -92,8 +92,8 @@ def __init__(self, input_dim: int, hidden_dim: int, hidden_dim_2: int, num_class self.with_attention = with_attention def init_hidden(self, batch_size: int, device: torch.device) -> [torch.Tensor, torch.Tensor]: - h, c = (Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.lstm_units)).to(device), - Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.lstm_units)).to(device)) + h = Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.lstm_units)).to(device) + c = Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.lstm_units)).to(device) return h, c def forward(self, input_tensor: List[torch.Tensor], batch_lengths: torch.Tensor, device: torch.device) -> torch.Tensor: @@ -327,9 +327,7 @@ def train(self, model: nn.Module, iterator: Iterator, cnt_data: int, optimizer: batch_fatures, batch_lens, labels = self._get_batch_data(curr_batch_size, iterator) if len(labels) == 0: continue - predictions = model(torch.tensor(batch_fatures, dtype=torch.float32).to(self.device), - torch.tensor(batch_lens).to(self.device), - self.device) + predictions = model(torch.tensor(batch_fatures, dtype=torch.float32).to(self.device), torch.tensor(batch_lens).to(self.device), self.device) loss = criteria(predictions, torch.tensor(labels).to(self.device)) accuracy = self.accuracy(predictions, labels) @@ -356,9 +354,7 @@ def evaluate(self, model: nn.Module, iterator: Iterator, cnt_data: int, criteria curr_batch_size = rest_batch if batch_num == cnt_batch - 1 and rest_batch > 0 else batch_size batch_fatures, batch_lens, labels = self._get_batch_data(curr_batch_size, iterator) - predictions = model(torch.tensor(batch_fatures, dtype=torch.float32).to(self.device), - torch.tensor(batch_lens).to(self.device), - self.device) + predictions = model(torch.tensor(batch_fatures, dtype=torch.float32).to(self.device), torch.tensor(batch_lens).to(self.device), self.device) loss = criteria(predictions, torch.tensor(labels).to(self.device)) accuracy = self.accuracy(predictions, labels) diff --git a/pyproject.toml b/pyproject.toml index 1e0cfb14..ca2b0344 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,9 @@ lint = [ "flake8-annotations==2.9.1", "flake8-bugbear==23.3.12", "flake8-builtins==2.1.0", + "flake8-fill-one-line>=0.4.0", "flake8-import-order==0.18.2", + "flake8-multiline-containers==0.0.19", "flake8-print==5.0.0", "flake8-quotes==3.3.2", "flake8-use-fstring==1.4", diff --git a/tests/api_tests/test_api_format_csv.py b/tests/api_tests/test_api_format_csv.py index c9fe4324..0bb7f584 100644 --- a/tests/api_tests/test_api_format_csv.py +++ b/tests/api_tests/test_api_format_csv.py @@ -40,8 +40,7 @@ def test_csv_books(self) -> None: row0 = self._get_text_of_row(table[0]) row3 = self._get_text_of_row(table[3]) self.assertListEqual(["id", "cat", "name", "price", "inStock", "author", "series_t", "sequence_i", "genre_s"], row0) - self.assertListEqual(["055357342X", "book", "A Storm of Swords", "7.99", "true", "George R.R. Martin", "A Song of Ice and Fire", "3", "fantasy"], - row3) + self.assertListEqual(["055357342X", "book", "A Storm of Swords", "7.99", "true", "George R.R. Martin", "A Song of Ice and Fire", "3", "fantasy"], row3) def test_csv_books2(self) -> None: file_name = "books_2.csv" @@ -51,10 +50,10 @@ def test_csv_books2(self) -> None: table = tables[0]["cells"] row1 = self._get_text_of_row(table[1]) row2 = self._get_text_of_row(table[2]) - self.assertListEqual(["0553573403", "book", "A Game of Throne, kings and other stuff", "7.99", "True", "George R.R. Martin", "A Song of Ice and Fire", - "1", "fantasy"], row1) - self.assertListEqual(["0553579908", "book", 'A Clash of "Kings"', "7.99", "True", "George R.R. Martin", "A Song of Ice and Fire", "2", "fantasy"], - row2) + self.assertListEqual([ + "0553573403", "book", "A Game of Throne, kings and other stuff", "7.99", "True", "George R.R. Martin", "A Song of Ice and Fire", "1", "fantasy" + ], row1) + self.assertListEqual(["0553579908", "book", 'A Clash of "Kings"', "7.99", "True", "George R.R. Martin", "A Song of Ice and Fire", "2", "fantasy"], row2) def __check_content(self, tables: List[dict]) -> None: self.assertEqual(1, len(tables)) diff --git a/tests/api_tests/test_api_format_html.py b/tests/api_tests/test_api_format_html.py index d3358fea..8a28c357 100644 --- a/tests/api_tests/test_api_format_html.py +++ b/tests/api_tests/test_api_format_html.py @@ -187,8 +187,9 @@ def test_html_no_newline(self) -> None: content = result["content"]["structure"] node = content["subparagraphs"][0] text = node["text"] - expected_text = ('"I can’t bring myself to feel too sorry for Amazon or textbook publishers, given how much ' - 'they tend to gouge on the prices of those books."') + expected_text = ( + '"I can’t bring myself to feel too sorry for Amazon or textbook publishers, given how much they tend to gouge on the prices of those books."' + ) self.assertEqual(expected_text, text.strip()) italics = [text[annotation["start"]: annotation["end"]] for annotation in node["annotations"] if annotation["name"] == "italic"] self.assertIn("or", italics) diff --git a/tests/api_tests/test_api_format_pdf_page_limit.py b/tests/api_tests/test_api_format_pdf_page_limit.py index ee1bf841..18a86e69 100644 --- a/tests/api_tests/test_api_format_pdf_page_limit.py +++ b/tests/api_tests/test_api_format_pdf_page_limit.py @@ -8,15 +8,17 @@ class TestApiPdfPageLimit(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "pdf_with_text_layer", file_name) - lines = ["Первая страница", - "Вторая страница", - "Третья страница", - "Четвёртая страница", - "Пятая страница", - "Шестая страница", - "Седьмая страница", - "Восьмая страница", - "Девятая страница"] + lines = [ + "Первая страница", + "Вторая страница", + "Третья страница", + "Четвёртая страница", + "Пятая страница", + "Шестая страница", + "Седьмая страница", + "Восьмая страница", + "Девятая страница" + ] def test_no_text_layer(self) -> None: self.__check_limit("false", check_partially=True) diff --git a/tests/api_tests/test_api_misc_multipage_table.py b/tests/api_tests/test_api_misc_multipage_table.py index 214233ef..6a016bb7 100644 --- a/tests/api_tests/test_api_misc_multipage_table.py +++ b/tests/api_tests/test_api_misc_multipage_table.py @@ -49,8 +49,10 @@ def test_api_ml_table_recognition_synthetic_data_3(self) -> None: self.assertEqual(len(tables), 1) table = tables[0]["cells"] - self.assertListEqual(["Заголовок\nБольшой", "Еще один большой заголовок", "Еще один большой заголовок", "Еще один большой заголовок", - "Еще один большой заголовок"], self._get_text_of_row(table[0])) + self.assertListEqual( + ["Заголовок\nБольшой", "Еще один большой заголовок", "Еще один большой заголовок", "Еще один большой заголовок", "Еще один большой заголовок"], + self._get_text_of_row(table[0]) + ) self.assertListEqual(["Заголовок\nБольшой", "Заголовок поменьше 1", "Заголовок поменьше 1", "Заголовок поменьше 2", "Заголовок поменьше 2"], self._get_text_of_row(table[1])) self.assertListEqual(["Заголовок\nБольшой", "Заголовочек 1", "Заголовочек 2", "Заголовочек 3", "Заголовочек 4"], self._get_text_of_row(table[2])) diff --git a/tests/api_tests/test_api_misc_with_attachments.py b/tests/api_tests/test_api_misc_with_attachments.py index 0f53199a..5babe30f 100644 --- a/tests/api_tests/test_api_misc_with_attachments.py +++ b/tests/api_tests/test_api_misc_with_attachments.py @@ -86,8 +86,9 @@ def test_json_invalid_html_fields(self) -> None: file_name = "json/with_html.json" parameters = dict() parameters["with_attachments"] = True - parameters["html_fields"] = json.dumps([["title"], ["example"], ["another_field"], ["test"], ["lists"], ["log"], ["text"], - ["deep_key1", "deep_key2", "deep_key3"]]) + parameters["html_fields"] = json.dumps([ + ["title"], ["example"], ["another_field"], ["test"], ["lists"], ["log"], ["text"], ["deep_key1", "deep_key2", "deep_key3"] + ]) result = self._send_request(file_name, parameters) attachments = result["attachments"] diff --git a/tests/unit_tests/test_doctype_law_structure_extractor.py b/tests/unit_tests/test_doctype_law_structure_extractor.py index 8431ef37..f68068e6 100644 --- a/tests/unit_tests/test_doctype_law_structure_extractor.py +++ b/tests/unit_tests/test_doctype_law_structure_extractor.py @@ -48,17 +48,21 @@ def test_article_part(self) -> None: self.assertEqual("articlePart", hl.line_type) def test_begin_application(self) -> None: - application_starts = ["Утвержден", "УТВЕРЖДЕНО \n", "Приложение №1\n", "Приложение № 45\n", "Утверждено \n", - "'Приложение N2", "утверждены\n", "Приложение к постановлению\n", - "Приложение № 1 к распоряжению\n"] + application_starts = [ + "Утвержден", "УТВЕРЖДЕНО \n", "Приложение №1\n", "Приложение № 45\n", "Утверждено \n", + "'Приложение N2", "утверждены\n", "Приложение к постановлению\n", + "Приложение № 1 к распоряжению\n" + ] for application_start in application_starts: self.assertIsNotNone(self.structure_extractor.classifier.regexp_application_begin.match(application_start.lower())) def test_string_number_correctness_with_regexp(self) -> None: - lines = ["03.06.2009 № 17, от 07.10.2009 № 42, от 10.03.2010 № 6, от 14.04.2010 № 11, от", - "правонарушениях. (В редакции Закона Москвы от 24.06.2015 г. № 39)", - "2. Нарушение административного регламента", - "1.2.2)", "1.2.4.6}", "1.23.005 ", "1.4.5 ", "1.4.5\n", "1.5.6.Закон о ...."] + lines = [ + "03.06.2009 № 17, от 07.10.2009 № 42, от 10.03.2010 № 6, от 14.04.2010 № 11, от", + "правонарушениях. (В редакции Закона Москвы от 24.06.2015 г. № 39)", + "2. Нарушение административного регламента", + "1.2.2)", "1.2.4.6}", "1.23.005 ", "1.4.5 ", "1.4.5\n", "1.5.6.Закон о ...." + ] answers = [False, False, True, True, True, False, True, True, True] for num, line in enumerate(lines): @@ -107,8 +111,9 @@ def test_fix_labels(self) -> None: self.assertListEqual(labels, self.__fix_labels(labels)) labels = ["structure_unit", "application", "title", "cellar", "title", "application", "structure_unit", "structure_unit", "structure_unit", "title"] - labels_expected = ["structure_unit", "application", "raw_text", "raw_text", "raw_text", "application", "structure_unit", "structure_unit", - "structure_unit", "raw_text"] + labels_expected = [ + "structure_unit", "application", "raw_text", "raw_text", "raw_text", "application", "structure_unit", "structure_unit", "structure_unit", "raw_text" + ] self.assertListEqual(labels_expected, self.__fix_labels(labels)) classes = ["structure_unit", "cellar", "application", "title", "footer"] diff --git a/tests/unit_tests/test_doctype_tz_feature_extractor.py b/tests/unit_tests/test_doctype_tz_feature_extractor.py index c985f96c..a0b64daa 100644 --- a/tests/unit_tests/test_doctype_tz_feature_extractor.py +++ b/tests/unit_tests/test_doctype_tz_feature_extractor.py @@ -33,8 +33,7 @@ def test_start_regexp(self) -> None: self.assertEqual(0, self.__count_start(line5)) def __count_start(self, line: str) -> int: - return sum([1 for _, i in self.feature_extractor._start_regexp(line, self.feature_extractor.list_item_regexp) - if i > 0]) + return sum([1 for _, i in self.feature_extractor._start_regexp(line, self.feature_extractor.list_item_regexp) if i > 0]) def test_end_regexp(self) -> None: line1 = "Подраздел 3.2 Требования к качеству оказываемых услуг\t12" diff --git a/tests/unit_tests/test_format_docx_reader.py b/tests/unit_tests/test_format_docx_reader.py index 3203a0f1..49355997 100644 --- a/tests/unit_tests/test_format_docx_reader.py +++ b/tests/unit_tests/test_format_docx_reader.py @@ -222,8 +222,10 @@ def test_tables_with_merged_cells(self) -> None: docx_reader = DocxReader(config=get_config()) path = self._get_path("big_table_with_merged_cells.docx") result = docx_reader.read(path) - hidden_cells_big_table = [(0, 1), (0, 2), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (3, 1), (3, 2), (3, 3), - (4, 0), (4, 1), (4, 2), (4, 3), (5, 0), (5, 1), (5, 2), (5, 3), (5, 6), (5, 7), (5, 8), (5, 9)] + hidden_cells_big_table = [ + (0, 1), (0, 2), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (3, 1), (3, 2), (3, 3), + (4, 0), (4, 1), (4, 2), (4, 3), (5, 0), (5, 1), (5, 2), (5, 3), (5, 6), (5, 7), (5, 8), (5, 9) + ] for i, j in hidden_cells_big_table: self.assertTrue(result.tables[0].cells[i][j].invisible) self.assertEqual(result.tables[0].cells[i][j].rowspan, 1) diff --git a/tests/unit_tests/test_misc_annotations.py b/tests/unit_tests/test_misc_annotations.py index 3d91d639..f9b76470 100644 --- a/tests/unit_tests/test_misc_annotations.py +++ b/tests/unit_tests/test_misc_annotations.py @@ -67,7 +67,10 @@ def test_annotation_merge_same_value_separating_by_tab(self) -> None: self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_same_value_separating_by_newline(self) -> None: - annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=6, end=15, name="size", value="1")] + annotations = [ + Annotation(start=0, end=5, name="size", value="1"), + Annotation(start=6, end=15, name="size", value="1") + ] text = "hello\nmy friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) @@ -75,42 +78,64 @@ def test_annotation_merge_included(self) -> None: """ Tests the case where one annotation includes another. Both annotations share the same name and value """ - annotations = [Annotation(start=0, end=15, name="size", value="1"), Annotation(start=3, end=5, name="size", value="1")] + annotations = [ + Annotation(start=0, end=15, name="size", value="1"), + Annotation(start=3, end=5, name="size", value="1") + ] text = "hello my friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_three_annotations(self) -> None: - annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=6, end=10, name="size", value="1"), - Annotation(start=10, end=15, name="size", value="1")] + annotations = [ + Annotation(start=0, end=5, name="size", value="1"), + Annotation(start=6, end=10, name="size", value="1"), + Annotation(start=10, end=15, name="size", value="1") + ] text = "hello my friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_three_nested_annotations(self) -> None: - annotations = [Annotation(start=0, end=15, name="size", value="1"), Annotation(start=6, end=10, name="size", value="1"), - Annotation(start=3, end=8, name="size", value="1")] + annotations = [ + Annotation(start=0, end=15, name="size", value="1"), + Annotation(start=6, end=10, name="size", value="1"), + Annotation(start=3, end=8, name="size", value="1") + ] text = "hello my friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_three_intersected_annotations(self) -> None: - annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=3, end=8, name="size", value="1"), - Annotation(start=6, end=9, name="size", value="1")] + annotations = [ + Annotation(start=0, end=5, name="size", value="1"), + Annotation(start=3, end=8, name="size", value="1"), + Annotation(start=6, end=9, name="size", value="1") + ] text = "hello my friend" self.assertSetEqual({(0, 9, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_three_one_intersected_annotations(self) -> None: - annotations = [Annotation(start=0, end=3, name="size", value="1"), Annotation(start=3, end=6, name="size", value="1"), - Annotation(start=8, end=15, name="size", value="1")] + annotations = [ + Annotation(start=0, end=3, name="size", value="1"), + Annotation(start=3, end=6, name="size", value="1"), + Annotation(start=8, end=15, name="size", value="1") + ] text = "hello my friend" self.assertSetEqual({(0, 6, "size", "1"), (8, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_different_value(self) -> None: - annotations = [Annotation(start=0, end=5, name="bold", value="True"), Annotation(start=5, end=15, name="italic", value="True")] + annotations = [ + Annotation(start=0, end=5, name="bold", value="True"), + Annotation(start=5, end=15, name="italic", value="True") + ] text = "hello my friend" self.assertSetEqual({(0, 5, "bold", "True"), (5, 15, "italic", "True")}, self.merge(annotations, text)) def test_annotation_merge_mixed(self) -> None: - annotations = [Annotation(start=0, end=5, name="bold", value="True"), Annotation(start=5, end=15, name="bold", value="True"), - Annotation(start=4, end=6, name="italic", value="True"), Annotation(start=6, end=66, name="italic", value="True")] + annotations = [ + Annotation(start=0, end=5, name="bold", value="True"), + Annotation(start=5, end=15, name="bold", value="True"), + Annotation(start=4, end=6, name="italic", value="True"), + Annotation(start=6, end=66, name="italic", value="True") + ] text = "hello my friend, hello my friend, hello my friend, hello my friend" self.assertSetEqual({(0, 15, "bold", "True"), (4, 66, "italic", "True")}, self.merge(annotations, text)) diff --git a/tests/unit_tests/test_misc_prefix.py b/tests/unit_tests/test_misc_prefix.py index ee77673c..ca90d6d1 100644 --- a/tests/unit_tests/test_misc_prefix.py +++ b/tests/unit_tests/test_misc_prefix.py @@ -45,11 +45,14 @@ def _check_if_valid(self, valid_prefix: List[str], prefix_class: Type[LinePrefix self.assertFalse(prefix_class.is_valid(prefix), message) def test_is_predecessor_mixed_type(self) -> None: - mixed_prefix = [DottedPrefix("1.", 0), - BracketPrefix("1)", 0), - EmptyPrefix("some prefix"), - LetterPrefix("a)", 0), - BulletPrefix("-", 0)] + mixed_prefix = [ + DottedPrefix("1.", 0), + BracketPrefix("1)", 0), + EmptyPrefix("some prefix"), + LetterPrefix("a)", 0), + BulletPrefix("-", 0) + ] + for first in mixed_prefix: for second in mixed_prefix: if first != second: diff --git a/tests/unit_tests/test_misc_tasker.py b/tests/unit_tests/test_misc_tasker.py index 585d14ea..8724f046 100644 --- a/tests/unit_tests/test_misc_tasker.py +++ b/tests/unit_tests/test_misc_tasker.py @@ -62,15 +62,17 @@ def test_line_label_tasker_size2(self) -> None: def test_tasker(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: - taskers = {"law_classifier": LineLabelTasker( - path2bboxes=self.path2bboxes, - path2lines=self.path2lines, - path2docs=self.path2docs, - manifest_path=self.manifest_path, - config_path=self.config_path, - tmp_dir=tmpdir, - config=get_test_config() - )} + taskers = { + "law_classifier": LineLabelTasker( + path2bboxes=self.path2bboxes, + path2lines=self.path2lines, + path2docs=self.path2docs, + manifest_path=self.manifest_path, + config_path=self.config_path, + tmp_dir=tmpdir, + config=get_test_config() + ) + } tasker = Tasker(boxes_label_path=self.path2bboxes, line_info_path=self.path2lines, images_path=self.path2docs, diff --git a/tests/unit_tests/test_misc_tree_node.py b/tests/unit_tests/test_misc_tree_node.py index 6bd936a0..286b5056 100644 --- a/tests/unit_tests/test_misc_tree_node.py +++ b/tests/unit_tests/test_misc_tree_node.py @@ -11,13 +11,15 @@ class TestTreeNode(TestCase): def test_root_node_annotations(self) -> None: - lines = [LineWithMeta(line="bold text\n", - metadata=LineMetadata(hierarchy_level=HierarchyLevel.create_root(), page_id=0, line_id=0), - annotations=[BoldAnnotation(start=0, end=10, value="True")]), - LineWithMeta(line="italic text\n", - metadata=LineMetadata(hierarchy_level=HierarchyLevel.create_root(), page_id=0, line_id=1), - annotations=[ItalicAnnotation(start=0, end=12, value="True")]), - ] + lines = [ + LineWithMeta(line="bold text\n", + metadata=LineMetadata(hierarchy_level=HierarchyLevel.create_root(), page_id=0, line_id=0), + annotations=[BoldAnnotation(start=0, end=10, value="True")]), + LineWithMeta(line="italic text\n", + metadata=LineMetadata(hierarchy_level=HierarchyLevel.create_root(), page_id=0, line_id=1), + annotations=[ItalicAnnotation(start=0, end=12, value="True")]), + ] + node = TreeNode.create(lines=lines) node_annotations = node.get_root().annotations node_annotations.sort(key=lambda a: a.start) diff --git a/tests/unit_tests/test_module_builders.py b/tests/unit_tests/test_module_builders.py index 0a161fed..f2dcad8b 100644 --- a/tests/unit_tests/test_module_builders.py +++ b/tests/unit_tests/test_module_builders.py @@ -5,19 +5,19 @@ ApplicationFoivHierarchyLevelBuilder from dedoc.structure_extractors.hierarchy_level_builders.law_builders.application_builder.application_law_hierarchy_level_builder import \ ApplicationLawHierarchyLevelBuilder -from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_foiv_hierarchy_level_builder import \ - BodyFoivHierarchyLevelBuilder -from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_law_hierarchy_level_builder import \ - BodyLawHierarchyLevelBuilder +from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_foiv_hierarchy_level_builder import BodyFoivHierarchyLevelBuilder +from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_law_hierarchy_level_builder import BodyLawHierarchyLevelBuilder from dedoc.structure_extractors.hierarchy_level_builders.law_builders.composition_hierarchy_level_builder import HierarchyLevelBuilderComposition class TestBuilders(unittest.TestCase): - builders = [HeaderHierarchyLevelBuilder(), - BodyLawHierarchyLevelBuilder(), - BodyFoivHierarchyLevelBuilder(), - ApplicationLawHierarchyLevelBuilder(), - ApplicationFoivHierarchyLevelBuilder()] + builders = [ + HeaderHierarchyLevelBuilder(), + BodyLawHierarchyLevelBuilder(), + BodyFoivHierarchyLevelBuilder(), + ApplicationLawHierarchyLevelBuilder(), + ApplicationFoivHierarchyLevelBuilder() + ] composition_builder = HierarchyLevelBuilderComposition(builders=builders) def test_creation_of_builders(self) -> None: diff --git a/tests/unit_tests/test_module_cell_splitter.py b/tests/unit_tests/test_module_cell_splitter.py index c12aa7a9..ad48952a 100644 --- a/tests/unit_tests/test_module_cell_splitter.py +++ b/tests/unit_tests/test_module_cell_splitter.py @@ -9,10 +9,14 @@ class TestCellSplitter(unittest.TestCase): def test_merge_close_borders(self) -> None: cells = [ - [Cell(x_top_left=0, y_top_left=0, x_bottom_right=50, y_bottom_right=30), - Cell(x_top_left=51, y_top_left=2, x_bottom_right=90, y_bottom_right=29)], - [Cell(x_top_left=0, y_top_left=31, x_bottom_right=50, y_bottom_right=50), - Cell(x_top_left=51, y_top_left=31, x_bottom_right=91, y_bottom_right=50)] + [ + Cell(x_top_left=0, y_top_left=0, x_bottom_right=50, y_bottom_right=30), + Cell(x_top_left=51, y_top_left=2, x_bottom_right=90, y_bottom_right=29) + ], + [ + Cell(x_top_left=0, y_top_left=31, x_bottom_right=50, y_bottom_right=50), + Cell(x_top_left=51, y_top_left=31, x_bottom_right=91, y_bottom_right=50) + ] ] cells_merged = self.splitter._merge_close_borders(cells) self.assertEqual(0, cells_merged[0][0].x_top_left) @@ -135,10 +139,17 @@ def test_vertical_split(self) -> None: self.assertEqual(5, cell_d.y_bottom_right) def test_no_split(self) -> None: - cells = [[Cell(x_top_left=160, y_top_left=321, x_bottom_right=825, y_bottom_right=369), - Cell(x_top_left=825, y_top_left=321, x_bottom_right=1494, y_bottom_right=369)], - [Cell(x_top_left=160, y_top_left=374, x_bottom_right=825, y_bottom_right=423), - Cell(x_top_left=825, y_top_left=374, x_bottom_right=1494, y_bottom_right=423)]] + cells = [ + [ + Cell(x_top_left=160, y_top_left=321, x_bottom_right=825, y_bottom_right=369), + Cell(x_top_left=825, y_top_left=321, x_bottom_right=1494, y_bottom_right=369) + ], + [ + Cell(x_top_left=160, y_top_left=374, x_bottom_right=825, y_bottom_right=423), + Cell(x_top_left=825, y_top_left=374, x_bottom_right=1494, y_bottom_right=423) + ] + ] + splitted = self.splitter.split(cells=cells) self.assertEqual(2, len(splitted)) self.assertEqual(2, len(splitted[0]))