Skip to content

Commit

Permalink
TLDR-472 add flake8-fill-one-line and flake8-multiline-containers and fix lint (#336)
Browse files Browse the repository at this point in the history

* add flake8-fill-one-line and flake8-multiline-containers and fix lint

* update precommit hook
  • Loading branch information
dronperminov authored Sep 27, 2023
1 parent 78d423c commit f0be0db
Show file tree
Hide file tree
Showing 70 changed files with 327 additions and 378 deletions.
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ repos:
flake8-annotations==2.9.1,
flake8-bugbear==23.3.12,
flake8-builtins==2.1.0,
flake8-fill-one-line>=0.4.0,
flake8-import-order==0.18.2,
flake8-multiline-containers==0.0.19,
flake8-print==5.0.0,
flake8-quotes==3.3.2,
flake8-use-fstring==1.4,
Expand Down
5 changes: 1 addition & 4 deletions dedoc/api/train_dataset/api_collect_train_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,7 @@ def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParam
clear()
parameters = query_params.dict(by_alias=True)
uid = handler.handle(file=file, parameters=parameters)
return HTMLResponse(
f'Successfully handle file. UID=<p><a href="/get_result_archive/?uid={uid}">get_result_archive/?uid={uid}</a></p>',
status_code=201
)
return HTMLResponse(f'Successfully handle file. UID=<p><a href="/get_result_archive/?uid={uid}">get_result_archive/?uid={uid}</a></p>', status_code=201)


@app.get("/get_result_archive")
Expand Down
14 changes: 2 additions & 12 deletions dedoc/api/train_dataset/async_archive_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,7 @@ def _handle_archive(self, uid: str, path: str, parameters: dict) -> str:
self.__handle_one_file(archive, file, parameters)
self.progress[uid] = f"files done\t= {i + 1} \n files_in_progress\t= {0}\n total\t= {len(archive.namelist())}"

task, _ = self.tasker.create_tasks(
type_of_task=parameters["type_of_task"],
task_size=int(parameters["task_size"]),
task_uid=uid
)
task, _ = self.tasker.create_tasks(type_of_task=parameters["type_of_task"], task_size=int(parameters["task_size"]), task_uid=uid)
return task
except Exception as e:
self.progress[uid] = f"Fail with\n{e}"
Expand Down Expand Up @@ -79,13 +75,7 @@ def __init__(self, tasker: Tasker, manager: DedocManager, *, config: dict) -> No
self.queue = Queue()
self.__results = {}
self._progress = tasker.progress_bar
self._handler = _ArchiveHandler(
queue=self.queue,
progress=self._progress,
manager=manager,
tasker=tasker,
config=config,
results=self.__results)
self._handler = _ArchiveHandler(queue=self.queue, progress=self._progress, manager=manager, tasker=tasker, config=config, results=self.__results)
self._handler.start()
self.tmp_dir = TemporaryDirectory()

Expand Down
12 changes: 7 additions & 5 deletions dedoc/attachments_extractors/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

def create_note(content: str, modified_time: int, created_time: int, author: str, size: int = None) -> [str, bytes]:
    """
    Serialize a text note together with its metadata into UTF-8 encoded JSON.

    :param content: text of the note
    :param modified_time: modification time, stored in the JSON as given
    :param created_time: creation time, stored in the JSON as given
    :param author: author of the note, stored in the JSON as given
    :param size: size to record for the note; when it is falsy (``None`` or ``0``) ``len(content)`` is recorded instead
    :return: pair of the name returned by ``get_unique_name("note.json")`` and the encoded JSON bytes
    """
    filename = get_unique_name("note.json")
    note_dict = {
        "content": content,
        "modified_time": modified_time,
        "created_time": created_time,
        # any falsy size (None or 0) falls back to the length of the text
        "size": size if size else len(content),
        "author": author
    }
    encode_data = json.dumps(note_dict).encode("utf-8")

    return filename, encode_data
4 changes: 1 addition & 3 deletions dedoc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
import sys
from typing import Any, Optional

logging.basicConfig(stream=sys.stdout,
level=logging.INFO,
format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")

DEBUG_MODE = False
RESOURCES_PATH = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources"))
Expand Down
6 changes: 4 additions & 2 deletions dedoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ def to_dict(self) -> dict:

@staticmethod
def get_api_dict(api: Api) -> Model:
names = ["style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table",
"attachment", "spacing", "strike", "subscript", "superscript"]
names = [
"style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table",
"attachment", "spacing", "strike", "subscript", "superscript"
]
return api.model("Annotation", {
"start": fields.Integer(description="annotation start index", required=True, example=0),
"end": fields.Integer(description="annotation end index", required=True, example=4),
Expand Down
6 changes: 1 addition & 5 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,7 @@ class LineWithMeta(Sized):
(for example, document title and raw text of the document should not be in the same line).
Still the logical part of the document may be represented by more than one line (for example, document title may consist of many lines).
"""
def __init__(self,
line: str,
metadata: Optional[LineMetadata] = None,
annotations: Optional[List[Annotation]] = None,
uid: str = None) -> None:
def __init__(self, line: str, metadata: Optional[LineMetadata] = None, annotations: Optional[List[Annotation]] = None, uid: str = None) -> None:
"""
:param line: raw text of the document line
:param metadata: metadata (related to the entire line, as line or page number, its hierarchy level)
Expand Down
3 changes: 2 additions & 1 deletion dedoc/data_structures/parsed_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,5 @@ def get_api_dict(api: Api, depth: int = 0, name: str = "ParsedDocument") -> Mode
if depth == 10 # TODO delete this
else fields.List(fields.Nested(ParsedDocument.get_api_dict(api, depth=depth + 1, name="refParsedDocument" + str(depth)),
description="Attachment structure",
required=False))})
required=False))
})
5 changes: 1 addition & 4 deletions dedoc/data_structures/tree_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,7 @@ def add_text(self, line: LineWithMeta) -> None:
def __shift_annotations(line: LineWithMeta, text_length: int) -> List[Annotation]:
    """
    Build copies of the line's annotations shifted by ``text_length``.

    :param line: line whose ``annotations`` are read; the line itself is only read here
    :param text_length: offset added to both ``start`` and ``end`` of every annotation
    :return: new list of ``Annotation`` objects that keep the original ``name`` and ``value``
    """
    # one Annotation is created per source annotation, in the original order
    return [
        Annotation(start=annotation.start + text_length, end=annotation.end + text_length, name=annotation.name, value=annotation.value)
        for annotation in line.annotations
    ]

Expand Down
10 changes: 2 additions & 8 deletions dedoc/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,14 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str


def download(resources_path: str) -> None:
download_from_hub(out_dir=resources_path,
out_name="txtlayer_classifier.pkl.gz",
repo_name="txtlayer_classifier",
hub_name="model.pkl.gz")
download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz")

download_from_hub(out_dir=resources_path,
out_name="scan_orientation_efficient_net_b0.pth",
repo_name="scan_orientation_efficient_net_b0",
hub_name="model.pth")

download_from_hub(out_dir=resources_path,
out_name="paragraph_classifier.pkl.gz",
repo_name="paragraph_classifier",
hub_name="model.pkl.gz")
download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz")

line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers")
for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"):
Expand Down
17 changes: 11 additions & 6 deletions dedoc/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,18 @@
converted_mimes = Extensions(
excel_like_format=["application/vnd.oasis.opendocument.spreadsheet", "application/vnd.ms-excel"],
docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text"],
pptx_like_format=["application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint", "application/vnd.oasis.opendocument.presentation"],
pptx_like_format=[
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint",
"application/vnd.oasis.opendocument.presentation"
],
archive_like_format=[],
image_like_format=["image/gif",
"image/x-portable-pixmap", "image/x-portable-anymap", "image/x-portable-graymap",
"image/x-portable-bitmap", "image/x-pcx", "image/x-pict",
"application/postscript", "image/x-cmu-raster"],
image_like_format=[
"image/gif",
"image/x-portable-pixmap", "image/x-portable-anymap", "image/x-portable-graymap",
"image/x-portable-bitmap", "image/x-pcx", "image/x-pict",
"application/postscript", "image/x-cmu-raster"
],
pdf_like_format=["image/vnd.djvu"],
csv_like_format=[],
txt_like_format=["application/xml", "text/xml"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,7 @@ def _get_exif(self, path: str) -> dict:
image = Image.open(path)
exif_dict = piexif.load(image.info["exif"]).get("Exif", {}) if "exif" in image.info else {}
exif = {ExifTags.TAGS[k]: v for k, v in exif_dict.items() if k in ExifTags.TAGS}
encoded_dict = {key_renamed: encode_function(exif.get(key))
for key, (key_renamed, encode_function) in self.keys.items() if key in exif}
encoded_dict = {key_renamed: encode_function(exif.get(key)) for key, (key_renamed, encode_function) in self.keys.items() if key in exif}
encoded_dict = {k: v for k, v in encoded_dict.items() if k is not None if v is not None}
image.close()
return encoded_dict
Expand Down
7 changes: 1 addition & 6 deletions dedoc/readers/docx_reader/numbering_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,12 +329,7 @@ def parse(self, lvl_list: List[Tag]) -> None:


class Num(AbstractNum):

def __init__(self,
num_id: str,
abstract_num_dict: Dict[str, Tag],
num_dict: Dict[str, Tag],
styles_extractor: StylesExtractor) -> None:
def __init__(self, num_id: str, abstract_num_dict: Dict[str, Tag], num_dict: Dict[str, Tag], styles_extractor: StylesExtractor) -> None:
"""
:param num_id: numId for num element
:param abstract_num_dict: dictionary with abstractNum BeautifulSoup trees
Expand Down
24 changes: 6 additions & 18 deletions dedoc/readers/html_reader/html_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true"
path_hash = calculate_file_hash(path=path)
lines = self.__read_blocks(soup, path_hash=path_hash, handle_invisible_table=handle_invisible_table)
tables = [self._read_table(table, path_hash) for table in soup.find_all("table")
if self._visible_table(table, handle_invisible_table=handle_invisible_table)]
tables = [
self._read_table(table, path_hash) for table in soup.find_all("table") if self._visible_table(table, handle_invisible_table=handle_invisible_table)
]
document = UnstructuredDocument(tables=tables, lines=lines, attachments=[])
document_postprocess = self.postprocessor.postprocess(document)
return document_postprocess
Expand Down Expand Up @@ -102,10 +103,7 @@ def __handle_single_tag(self, tag: Tag, uid: str) -> List[LineWithMeta]:
line.metadata.extend_other_fields({"html_tag": tag.name})
return [line]

def __read_blocks(self,
block: Tag,
path_hash: str = "",
handle_invisible_table: bool = False) -> List[LineWithMeta]:
def __read_blocks(self, block: Tag, path_hash: str = "", handle_invisible_table: bool = False) -> List[LineWithMeta]:
uid = hashlib.md5((path_hash + str(block.name)).encode()).hexdigest()
if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table):
return []
Expand All @@ -125,12 +123,7 @@ def _handle_text_line(self, block: str, path_hash: str, ignore_space: bool = Tru
line = self.__make_line(block, HierarchyLevel.unknown, 0, uid=uid, path_hash=path_hash)
return [line]

def __make_line(self, line: str,
line_type: str,
header_level: int = 0,
uid: str = None,
path_hash: str = None,
annotations: List = None) -> LineWithMeta:
def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str = None, path_hash: str = None, annotations: List = None) -> LineWithMeta:
if annotations is None:
annotations = []

Expand Down Expand Up @@ -176,12 +169,7 @@ def __read_list(self, lst: Tag, uid: str, path_hash: str, handle_invisible_table
lines.extend(item_lines)
return lines

def __handle_list_item(self,
item: Tag,
item_index: int,
list_type: str,
path_hash: str,
handle_invisible_table: bool) -> List[LineWithMeta]:
def __handle_list_item(self, item: Tag, item_index: int, list_type: str, path_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
lines = []
header_line = self.__get_li_header(list_type=list_type, index=item_index)
block_lines = self.__handle_block(item, uid=path_hash, handle_invisible_table=handle_invisible_table)
Expand Down
47 changes: 5 additions & 42 deletions dedoc/readers/html_reader/html_tags.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,8 @@
class HtmlTags:

service_tags = ["script", "style"]

list_items = ["li", "dd", "dt"]
block_tags = ["aside",
"article",
"body",
"div",
"footer",
"header",
"html",
"main",
"nav",
"section",
"form"
] + list_items
block_tags = ["aside", "article", "body", "div", "footer", "header", "html", "main", "nav", "section", "form", *list_items]
unordered_list = ["ul", "dl", "dir"]
ordered_list = ["ol"]
list_tags = unordered_list + ordered_list
Expand All @@ -31,35 +19,10 @@ class HtmlTags:
paragraphs = ["p"] + block_tags + list_items + header_tags

styled_tag = bold_tags + italic_tags + underlined_tags + strike_tags + superscript_tags + subscript_tags
simple_text_tags = ["a",
"abbr",
"acronym",
"applet",
"area",
"article",
"aside",
"bdi",
"bdo",
"big",
"blockquote",
"canvas",
"caption",
"center",
"cite",
"code",
"data",
"font",
"kbd",
"mark",
"output",
"p",
"pre",
"q",
"samp",
"small",
"span",
"tt",
"wbr"]
simple_text_tags = [
"a", "abbr", "acronym", "applet", "area", "article", "aside", "bdi", "bdo", "big", "blockquote", "canvas", "caption", "center", "cite", "code", "data",
"font", "kbd", "mark", "output", "p", "pre", "q", "samp", "small", "span", "tt", "wbr"
]
text_tags = simple_text_tags + styled_tag

table_tags = ["table"]
Expand Down
5 changes: 3 additions & 2 deletions dedoc/readers/mhtml_reader/mhtml_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
tables.extend(result.tables)

need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
attachments_names = [os.path.join(os.path.basename(os.path.dirname(file_name)), os.path.basename(file_name))
for file_name in names_list if file_name not in names_html]
attachments_names = [
os.path.join(os.path.basename(os.path.dirname(file_name)), os.path.basename(file_name)) for file_name in names_list if file_name not in names_html
]
attachments = self.__get_attachments(save_dir=save_dir, names_list=attachments_names, need_content_analysis=need_content_analysis)

return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments)
Expand Down
5 changes: 3 additions & 2 deletions dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,9 @@ def __merge_documents(self, first: UnstructuredDocument, second: UnstructuredDoc
for line in chain(first.lines, second.lines):
line.metadata.line_id = line_id
line_id += 1
annotations = [annotation for annotation in line.annotations
if not (isinstance(annotation, TableAnnotation) and annotation.value in dropped_tables)]
annotations = [
annotation for annotation in line.annotations if not (isinstance(annotation, TableAnnotation) and annotation.value in dropped_tables)
]
new_line = LineWithMeta(line=line.line, metadata=line.metadata, annotations=annotations, uid=line.uid)
lines.append(new_line)
return UnstructuredDocument(tables=tables, lines=lines, attachments=first.attachments + second.attachments, metadata=second.metadata)
Loading

0 comments on commit f0be0db

Please sign in to comment.