TLDR-472 add flake8-fill-one-line and flake8-multiline-containers and fix lint (#336)

* add flake8-fill-one-line and flake8-multiline-containers and fix lint
* update pre-commit hook
ispras · Sep 27, 2023 · f0be0db · f0be0db
1 parent 78d423c
commit f0be0db
Show file tree

Hide file tree

Showing 70 changed files with 327 additions and 378 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,7 +11,9 @@ repos:
             flake8-annotations==2.9.1,
             flake8-bugbear==23.3.12,
             flake8-builtins==2.1.0,
+            flake8-fill-one-line>=0.4.0,
             flake8-import-order==0.18.2,
+            flake8-multiline-containers==0.0.19,
             flake8-print==5.0.0,
             flake8-quotes==3.3.2,
             flake8-use-fstring==1.4,

diff --git a/dedoc/api/train_dataset/api_collect_train_dataset.py b/dedoc/api/train_dataset/api_collect_train_dataset.py
@@ -163,10 +163,7 @@ def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParam
     clear()
     parameters = query_params.dict(by_alias=True)
     uid = handler.handle(file=file, parameters=parameters)
-    return HTMLResponse(
-        f'Successfully handle file. UID=<p><a href="/get_result_archive/?uid={uid}">get_result_archive/?uid={uid}</a></p>',
-        status_code=201
-    )
+    return HTMLResponse(f'Successfully handle file. UID=<p><a href="/get_result_archive/?uid={uid}">get_result_archive/?uid={uid}</a></p>', status_code=201)
 
 
 @app.get("/get_result_archive")

diff --git a/dedoc/api/train_dataset/async_archive_handler.py b/dedoc/api/train_dataset/async_archive_handler.py
@@ -45,11 +45,7 @@ def _handle_archive(self, uid: str, path: str, parameters: dict) -> str:
                     self.__handle_one_file(archive, file, parameters)
                     self.progress[uid] = f"files done\t= {i + 1} \n files_in_progress\t= {0}\n total\t= {len(archive.namelist())}"
 
-            task, _ = self.tasker.create_tasks(
-                type_of_task=parameters["type_of_task"],
-                task_size=int(parameters["task_size"]),
-                task_uid=uid
-            )
+            task, _ = self.tasker.create_tasks(type_of_task=parameters["type_of_task"], task_size=int(parameters["task_size"]), task_uid=uid)
             return task
         except Exception as e:
             self.progress[uid] = f"Fail with\n{e}"
@@ -79,13 +75,7 @@ def __init__(self, tasker: Tasker, manager: DedocManager, *, config: dict) -> No
         self.queue = Queue()
         self.__results = {}
         self._progress = tasker.progress_bar
-        self._handler = _ArchiveHandler(
-            queue=self.queue,
-            progress=self._progress,
-            manager=manager,
-            tasker=tasker,
-            config=config,
-            results=self.__results)
+        self._handler = _ArchiveHandler(queue=self.queue, progress=self._progress, manager=manager, tasker=tasker, config=config, results=self.__results)
         self._handler.start()
         self.tmp_dir = TemporaryDirectory()
 

diff --git a/dedoc/attachments_extractors/utils.py b/dedoc/attachments_extractors/utils.py
@@ -5,11 +5,13 @@
 
 def create_note(content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]:
     filename = get_unique_name("note.json")
-    note_dict = {"content": content,
-                 "modified_time": modified_time,
-                 "created_time": created_time,
-                 "size": size if size else len(content),
-                 "author": author}
+    note_dict = {
+        "content": content,
+        "modified_time": modified_time,
+        "created_time": created_time,
+        "size": size if size else len(content),
+        "author": author
+    }
     encode_data = json.dumps(note_dict).encode("utf-8")
 
     return filename, encode_data
diff --git a/dedoc/config.py b/dedoc/config.py
@@ -4,9 +4,7 @@
 import sys
 from typing import Any, Optional
 
-logging.basicConfig(stream=sys.stdout,
-                    level=logging.INFO,
-                    format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")
+logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")
 
 DEBUG_MODE = False
 RESOURCES_PATH = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources"))

diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py
@@ -50,8 +50,10 @@ def to_dict(self) -> dict:
 
     @staticmethod
     def get_api_dict(api: Api) -> Model:
-        names = ["style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table",
-                 "attachment", "spacing", "strike", "subscript", "superscript"]
+        names = [
+            "style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table",
+            "attachment", "spacing", "strike", "subscript", "superscript"
+        ]
         return api.model("Annotation", {
             "start": fields.Integer(description="annotation start index", required=True, example=0),
             "end": fields.Integer(description="annotation end index", required=True, example=4),

diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
@@ -17,11 +17,7 @@ class LineWithMeta(Sized):
     (for example, document title and raw text of the document should not be in the same line).
     Still the logical part of the document may be represented by more than one line (for example, document title may consist of many lines).
     """
-    def __init__(self,
-                 line: str,
-                 metadata: Optional[LineMetadata] = None,
-                 annotations: Optional[List[Annotation]] = None,
-                 uid: str = None) -> None:
+    def __init__(self, line: str, metadata: Optional[LineMetadata] = None, annotations: Optional[List[Annotation]] = None, uid: str = None) -> None:
         """
         :param line: raw text of the document line
         :param metadata: metadata (related to the entire line, as line or page number, its hierarchy level)

diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py
@@ -58,4 +58,5 @@ def get_api_dict(api: Api, depth: int = 0, name: str = "ParsedDocument") -> Mode
             if depth == 10  # TODO delete this
             else fields.List(fields.Nested(ParsedDocument.get_api_dict(api, depth=depth + 1, name="refParsedDocument" + str(depth)),
                                            description="Attachment structure",
-                                           required=False))})
+                                           required=False))
+        })
diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py
@@ -124,10 +124,7 @@ def add_text(self, line: LineWithMeta) -> None:
     def __shift_annotations(line: LineWithMeta, text_length: int) -> List[Annotation]:
         new_annotations = []
         for annotation in line.annotations:
-            new_annotation = Annotation(start=annotation.start + text_length,
-                                        end=annotation.end + text_length,
-                                        name=annotation.name,
-                                        value=annotation.value)
+            new_annotation = Annotation(start=annotation.start + text_length, end=annotation.end + text_length, name=annotation.name, value=annotation.value)
             new_annotations.append(new_annotation)
         return new_annotations
 

diff --git a/dedoc/download_models.py b/dedoc/download_models.py
@@ -26,20 +26,14 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str
 
 
 def download(resources_path: str) -> None:
-    download_from_hub(out_dir=resources_path,
-                      out_name="txtlayer_classifier.pkl.gz",
-                      repo_name="txtlayer_classifier",
-                      hub_name="model.pkl.gz")
+    download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz")
 
     download_from_hub(out_dir=resources_path,
                       out_name="scan_orientation_efficient_net_b0.pth",
                       repo_name="scan_orientation_efficient_net_b0",
                       hub_name="model.pth")
 
-    download_from_hub(out_dir=resources_path,
-                      out_name="paragraph_classifier.pkl.gz",
-                      repo_name="paragraph_classifier",
-                      hub_name="model.pkl.gz")
+    download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz")
 
     line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers")
     for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"):

diff --git a/dedoc/extensions.py b/dedoc/extensions.py
@@ -27,13 +27,18 @@
 converted_mimes = Extensions(
     excel_like_format=["application/vnd.oasis.opendocument.spreadsheet", "application/vnd.ms-excel"],
     docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text"],
-    pptx_like_format=["application/vnd.openxmlformats-officedocument.presentationml.presentation",
-                      "application/vnd.ms-powerpoint", "application/vnd.oasis.opendocument.presentation"],
+    pptx_like_format=[
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "application/vnd.ms-powerpoint",
+        "application/vnd.oasis.opendocument.presentation"
+    ],
     archive_like_format=[],
-    image_like_format=["image/gif",
-                       "image/x-portable-pixmap", "image/x-portable-anymap", "image/x-portable-graymap",
-                       "image/x-portable-bitmap", "image/x-pcx", "image/x-pict",
-                       "application/postscript", "image/x-cmu-raster"],
+    image_like_format=[
+        "image/gif",
+        "image/x-portable-pixmap", "image/x-portable-anymap", "image/x-portable-graymap",
+        "image/x-portable-bitmap", "image/x-pcx", "image/x-pict",
+        "application/postscript", "image/x-cmu-raster"
+    ],
     pdf_like_format=["image/vnd.djvu"],
     csv_like_format=[],
     txt_like_format=["application/xml", "text/xml"]

diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py
@@ -124,8 +124,7 @@ def _get_exif(self, path: str) -> dict:
             image = Image.open(path)
             exif_dict = piexif.load(image.info["exif"]).get("Exif", {}) if "exif" in image.info else {}
             exif = {ExifTags.TAGS[k]: v for k, v in exif_dict.items() if k in ExifTags.TAGS}
-            encoded_dict = {key_renamed: encode_function(exif.get(key))
-                            for key, (key_renamed, encode_function) in self.keys.items() if key in exif}
+            encoded_dict = {key_renamed: encode_function(exif.get(key)) for key, (key_renamed, encode_function) in self.keys.items() if key in exif}
             encoded_dict = {k: v for k, v in encoded_dict.items() if k is not None if v is not None}
             image.close()
             return encoded_dict

diff --git a/dedoc/readers/docx_reader/numbering_extractor.py b/dedoc/readers/docx_reader/numbering_extractor.py
@@ -329,12 +329,7 @@ def parse(self, lvl_list: List[Tag]) -> None:
 
 
 class Num(AbstractNum):
-
-    def __init__(self,
-                 num_id: str,
-                 abstract_num_dict: Dict[str, Tag],
-                 num_dict: Dict[str, Tag],
-                 styles_extractor: StylesExtractor) -> None:
+    def __init__(self, num_id: str, abstract_num_dict: Dict[str, Tag], num_dict: Dict[str, Tag], styles_extractor: StylesExtractor) -> None:
         """
         :param num_id: numId for num element
         :param abstract_num_dict: dictionary with abstractNum BeautifulSoup trees

diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py
@@ -54,8 +54,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true"
         path_hash = calculate_file_hash(path=path)
         lines = self.__read_blocks(soup, path_hash=path_hash, handle_invisible_table=handle_invisible_table)
-        tables = [self._read_table(table, path_hash) for table in soup.find_all("table")
-                  if self._visible_table(table, handle_invisible_table=handle_invisible_table)]
+        tables = [
+            self._read_table(table, path_hash) for table in soup.find_all("table") if self._visible_table(table, handle_invisible_table=handle_invisible_table)
+        ]
         document = UnstructuredDocument(tables=tables, lines=lines, attachments=[])
         document_postprocess = self.postprocessor.postprocess(document)
         return document_postprocess
@@ -102,10 +103,7 @@ def __handle_single_tag(self, tag: Tag, uid: str) -> List[LineWithMeta]:
         line.metadata.extend_other_fields({"html_tag": tag.name})
         return [line]
 
-    def __read_blocks(self,
-                      block: Tag,
-                      path_hash: str = "",
-                      handle_invisible_table: bool = False) -> List[LineWithMeta]:
+    def __read_blocks(self, block: Tag, path_hash: str = "", handle_invisible_table: bool = False) -> List[LineWithMeta]:
         uid = hashlib.md5((path_hash + str(block.name)).encode()).hexdigest()
         if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table):
             return []
@@ -125,12 +123,7 @@ def _handle_text_line(self, block: str, path_hash: str, ignore_space: bool = Tru
         line = self.__make_line(block, HierarchyLevel.unknown, 0, uid=uid, path_hash=path_hash)
         return [line]
 
-    def __make_line(self, line: str,
-                    line_type: str,
-                    header_level: int = 0,
-                    uid: str = None,
-                    path_hash: str = None,
-                    annotations: List = None) -> LineWithMeta:
+    def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str = None, path_hash: str = None, annotations: List = None) -> LineWithMeta:
         if annotations is None:
             annotations = []
 
@@ -176,12 +169,7 @@ def __read_list(self, lst: Tag, uid: str, path_hash: str, handle_invisible_table
                 lines.extend(item_lines)
         return lines
 
-    def __handle_list_item(self,
-                           item: Tag,
-                           item_index: int,
-                           list_type: str,
-                           path_hash: str,
-                           handle_invisible_table: bool) -> List[LineWithMeta]:
+    def __handle_list_item(self, item: Tag, item_index: int, list_type: str, path_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
         lines = []
         header_line = self.__get_li_header(list_type=list_type, index=item_index)
         block_lines = self.__handle_block(item, uid=path_hash, handle_invisible_table=handle_invisible_table)

diff --git a/dedoc/readers/html_reader/html_tags.py b/dedoc/readers/html_reader/html_tags.py
@@ -1,20 +1,8 @@
 class HtmlTags:
-
     service_tags = ["script", "style"]
 
     list_items = ["li", "dd", "dt"]
-    block_tags = ["aside",
-                  "article",
-                  "body",
-                  "div",
-                  "footer",
-                  "header",
-                  "html",
-                  "main",
-                  "nav",
-                  "section",
-                  "form"
-                  ] + list_items
+    block_tags = ["aside", "article", "body", "div", "footer", "header", "html", "main", "nav", "section", "form", *list_items]
     unordered_list = ["ul", "dl", "dir"]
     ordered_list = ["ol"]
     list_tags = unordered_list + ordered_list
@@ -31,35 +19,10 @@ class HtmlTags:
     paragraphs = ["p"] + block_tags + list_items + header_tags
 
     styled_tag = bold_tags + italic_tags + underlined_tags + strike_tags + superscript_tags + subscript_tags
-    simple_text_tags = ["a",
-                        "abbr",
-                        "acronym",
-                        "applet",
-                        "area",
-                        "article",
-                        "aside",
-                        "bdi",
-                        "bdo",
-                        "big",
-                        "blockquote",
-                        "canvas",
-                        "caption",
-                        "center",
-                        "cite",
-                        "code",
-                        "data",
-                        "font",
-                        "kbd",
-                        "mark",
-                        "output",
-                        "p",
-                        "pre",
-                        "q",
-                        "samp",
-                        "small",
-                        "span",
-                        "tt",
-                        "wbr"]
+    simple_text_tags = [
+        "a", "abbr", "acronym", "applet", "area", "article", "aside", "bdi", "bdo", "big", "blockquote", "canvas", "caption", "center", "cite", "code", "data",
+        "font", "kbd", "mark", "output", "p", "pre", "q", "samp", "small", "span", "tt", "wbr"
+    ]
     text_tags = simple_text_tags + styled_tag
 
     table_tags = ["table"]

diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py
@@ -60,8 +60,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
             tables.extend(result.tables)
 
         need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
-        attachments_names = [os.path.join(os.path.basename(os.path.dirname(file_name)), os.path.basename(file_name))
-                             for file_name in names_list if file_name not in names_html]
+        attachments_names = [
+            os.path.join(os.path.basename(os.path.dirname(file_name)), os.path.basename(file_name)) for file_name in names_list if file_name not in names_html
+        ]
         attachments = self.__get_attachments(save_dir=save_dir, names_list=attachments_names, need_content_analysis=need_content_analysis)
 
         return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments)

diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -145,8 +145,9 @@ def __merge_documents(self, first: UnstructuredDocument, second: UnstructuredDoc
         for line in chain(first.lines, second.lines):
             line.metadata.line_id = line_id
             line_id += 1
-            annotations = [annotation for annotation in line.annotations
-                           if not (isinstance(annotation, TableAnnotation) and annotation.value in dropped_tables)]
+            annotations = [
+                annotation for annotation in line.annotations if not (isinstance(annotation, TableAnnotation) and annotation.value in dropped_tables)
+            ]
             new_line = LineWithMeta(line=line.line, metadata=line.metadata, annotations=annotations, uid=line.uid)
             lines.append(new_line)
         return UnstructuredDocument(tables=tables, lines=lines, attachments=first.attachments + second.attachments, metadata=second.metadata)