TLDR-451 tutorial new doc type (#331)

* docs added * add code testing * some fixes * some fixes * add tabula and some fixes * add python-djvulibre * delete python-djvulibre and add djvulibre-bin * add poppler-utils * add tesseract * some fixes * flake8 stylefix * fix docs after flake8 * update last part of adding_new_doc_type_tutorial * rewrite dedoc_add_new_doc_type_tutorial * minor fixes * minor fixes * minor fixes * some fixes * add more code examples * some fixes --------- Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru> Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
ispras · Sep 25, 2023 · db202e7 · db202e7
1 parent 8fe955f
commit db202e7
Show file tree

Hide file tree

Showing 15 changed files with 513 additions and 137 deletions.
diff --git a/.flake8 b/.flake8
@@ -16,7 +16,6 @@ exclude =
     resources,
     dedoc/scripts,
     examples,
-    docs,
     venv,
     build,
     dedoc.egg-info

diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -19,7 +19,7 @@ jobs:
 
     - name: Install dependencies
       run: |
-        sudo apt-get install -y libreoffice
+        sudo apt-get install -y libreoffice djvulibre-bin poppler-utils tesseract-ocr libtesseract-dev tesseract-ocr-rus tesseract-ocr-eng
         python -m pip install --upgrade --no-cache-dir pip setuptools
         python -m pip install --exists-action=w --no-cache-dir -r requirements.txt
         python -m pip install --upgrade --upgrade-strategy eager --no-cache-dir .[torch,docs]
@@ -30,3 +30,4 @@ jobs:
         python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build
         cd docs/source/_static/code_examples
         python dedoc_usage_tutorial.py
+        python dedoc_add_new_doc_type_tutorial.py
diff --git a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py
@@ -0,0 +1,61 @@
+import mimetypes
+import os
+
+from djvu_converter import DjvuConverter
+from pdf_reader import PdfReader
+
+from dedoc import DedocManager
+from dedoc.attachments_handler import AttachmentsHandler
+from dedoc.converters import FileConverterComposition
+from dedoc.metadata_extractors import BaseMetadataExtractor, DocxMetadataExtractor, MetadataExtractorComposition
+from dedoc.readers import ReaderComposition
+from dedoc.structure_constructors import LinearConstructor, StructureConstructorComposition, TreeConstructor
+from dedoc.structure_extractors import DefaultStructureExtractor, StructureExtractorComposition
+
+
+file_dir, file_name = "test_dir", "example_with_table.djvu"
+file_path = os.path.join(file_dir, file_name)
+
+# Create the custom handlers imported above from the local example modules djvu_converter and pdf_reader
+djvu_converter = DjvuConverter(config=dict())
+pdf_reader = PdfReader()
+# The extension and the guessed MIME type are the arguments expected by can_convert()
+name_wo_extension, file_extension = os.path.splitext(file_name)
+file_mime = mimetypes.guess_type(file_path)[0]
+# Check that the converter accepts the file, then convert it; do_convert() returns the name of the new file
+djvu_converter.can_convert(file_extension, file_mime)  # True
+djvu_converter.do_convert(file_dir, name_wo_extension, file_extension)  # 'example_with_table.pdf'
+# Switch to a PDF example file to demonstrate the reader
+file_dir, file_name = "test_dir", "example_with_attachments_depth_1.pdf"
+file_path = os.path.join(file_dir, file_name)
+
+name_wo_extension, file_extension = os.path.splitext(file_name)
+file_mime = mimetypes.guess_type(file_path)[0]
+pdf_reader.can_read(file_path, file_mime, file_extension)  # True
+
+pdf_reader.read(file_path, parameters={"with_attachments": "true"})  # <dedoc.data_structures.UnstructuredDocument>
+# Read the same file once more, this time keeping the result to inspect its fields
+document = pdf_reader.read(file_path, parameters={"with_attachments": "true"})
+list(vars(document))  # ['tables', 'lines', 'attachments', 'warnings', 'metadata']
+len(document.attachments)  # 2
+len(document.lines)  # 15
+
+"""Adding the implemented handlers to the manager config"""
+config = {}
+manager_config = dict(
+    converter=FileConverterComposition(converters=[DjvuConverter(config=config)]),
+    reader=ReaderComposition(readers=[PdfReader()]),
+    structure_extractor=StructureExtractorComposition(extractors={DefaultStructureExtractor.document_type: DefaultStructureExtractor()}, default_key="other"),
+    structure_constructor=StructureConstructorComposition(
+        constructors={"linear": LinearConstructor(), "tree": TreeConstructor()},
+        default_constructor=LinearConstructor()
+    ),
+    document_metadata_extractor=MetadataExtractorComposition(extractors=[DocxMetadataExtractor(), BaseMetadataExtractor()]),
+    attachments_handler=AttachmentsHandler(config=config),
+)
+# Build a manager from the configuration above and parse the PDF example file with it
+manager = DedocManager(config=config, manager_config=manager_config)
+result = manager.parse(file_path=file_path, parameters={"with_attachments": "true"})
+
+result  # <dedoc.data_structures.ParsedDocument>
+result.to_dict()  # OrderedDict([('version', '0.11.2'), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ...
diff --git a/docs/source/_static/code_examples/dedoc_return_format.py b/docs/source/_static/code_examples/dedoc_return_format.py
@@ -6,60 +6,60 @@
 
 
 def basic_example() -> dict:
-    with open(filename, 'rb') as file:
-        files = {'file': (filename, file)}
+    with open(filename, "rb") as file:
+        files = {"file": (filename, file)}
         r = requests.post("http://localhost:1231/upload", files=files, data=dict())
-        result = r.content.decode('utf-8')
+        result = r.content.decode("utf-8")
 
     assert r.status_code == 200
     return json.loads(result)
 
 
 def linear_structure_type_example() -> dict:
-    with open(filename, 'rb') as file:
-        files = {'file': (filename, file)}
+    with open(filename, "rb") as file:
+        files = {"file": (filename, file)}
         r = requests.post("http://localhost:1231/upload", files=files, data=dict(structure_type="linear"))
-        result = r.content.decode('utf-8')
+        result = r.content.decode("utf-8")
 
     assert r.status_code == 200
     return json.loads(result)
 
 
 def with_attachments_example() -> dict:
-    with open(filename, 'rb') as file:
-        files = {'file': (filename, file)}
+    with open(filename, "rb") as file:
+        files = {"file": (filename, file)}
         r = requests.post("http://localhost:1231/upload", files=files, data=dict(with_attachments="true"))
-        result = r.content.decode('utf-8')
+        result = r.content.decode("utf-8")
 
     assert r.status_code == 200
     return json.loads(result)
 
 
 def with_base64_attachments_example() -> dict:
-    with open(filename, 'rb') as file:
-        files = {'file': (filename, file)}
+    with open(filename, "rb") as file:
+        files = {"file": (filename, file)}
         r = requests.post("http://localhost:1231/upload", files=files, data=dict(with_attachments="true", return_base64="true"))
-        result = r.content.decode('utf-8')
+        result = r.content.decode("utf-8")
 
     assert r.status_code == 200
     return json.loads(result)
 
 
 def with_parsed_attachments_example() -> dict:
-    with open(filename, 'rb') as file:
-        files = {'file': (filename, file)}
+    with open(filename, "rb") as file:
+        files = {"file": (filename, file)}
         r = requests.post("http://localhost:1231/upload", files=files, data=dict(with_attachments="true", need_content_analysis="true"))
-        result = r.content.decode('utf-8')
+        result = r.content.decode("utf-8")
 
     assert r.status_code == 200
     return json.loads(result)
 
 
 def with_inserted_table_example() -> dict:
-    with open(filename, 'rb') as file:
-        files = {'file': (filename, file)}
+    with open(filename, "rb") as file:
+        files = {"file": (filename, file)}
         r = requests.post("http://localhost:1231/upload", files=files, data=dict(insert_table="true"))
-        result = r.content.decode('utf-8')
+        result = r.content.decode("utf-8")
 
     assert r.status_code == 200
     return json.loads(result)

diff --git a/docs/source/_static/code_examples/dedoc_usage_tutorial.py b/docs/source/_static/code_examples/dedoc_usage_tutorial.py
@@ -1,12 +1,17 @@
-# noqa
-"""Using converters."""
+import mimetypes
+import os
+
+from dedoc import DedocManager
+from dedoc.attachments_extractors import DocxAttachmentsExtractor
 from dedoc.converters import DocxConverter
+from dedoc.metadata_extractors import DocxMetadataExtractor
+from dedoc.readers import DocxReader
+from dedoc.structure_constructors import TreeConstructor
+from dedoc.structure_extractors import DefaultStructureExtractor
 
+"""Using converters."""
 converter = DocxConverter(config={})
 
-import os
-import mimetypes
-
 file_dir, file_name = "test_dir", "example.odt"
 file_path = os.path.join(file_dir, file_name)
 
@@ -16,15 +21,9 @@
 converter.can_convert(file_extension, file_mime)  # True
 converter.do_convert(file_dir, name_wo_extension, file_extension)  # 'example.docx'
 
-
 """Using readers."""
-from dedoc.readers import DocxReader
-
 reader = DocxReader(config={})
 
-import os
-import mimetypes
-
 file_dir, file_name = "test_dir", "example.docx"
 file_path = os.path.join(file_dir, file_name)
 
@@ -35,97 +34,84 @@
 reader.read(file_path, parameters={"with_attachments": "true"})  # <dedoc.data_structures.UnstructuredDocument>
 
 document = reader.read(file_path, parameters={"with_attachments": "true"})
-print(list(vars(document)))  # ['tables', 'lines', 'attachments', 'warnings', 'metadata']
-
-print(document.lines[0].line)  # Document example
-print(document.lines[0].metadata.tag_hierarchy_level.line_type)  # header
-
-print(document.lines[0].annotations[0])  # Indentation(0:16, 0)
-print(document.lines[0].annotations[3])  # Style(0:16, Title)
-
-print(document.lines[3].annotations[4])  # Size(0:14, 16.0)
-print(document.lines[3].annotations[5])  # Size(19:26, 16.0)
-print(document.lines[3].annotations[6])  # Bold(0:4, True)
-print(document.lines[3].annotations[7])  # Italic(6:12, True)
-print(document.lines[3].annotations[8])  # Size(14:19, 10.0)
-
-print(document.tables[0].cells[0][0])  # N
-print(document.tables[0].cells[1][3])  # Cell3
-print(document.tables[1].cells[3])  # ['Text 3', 'Text 4']
-
-print(document.tables[0].metadata.uid)  # f2f08354fc2dbcb5ded8885479f498a6
-print(document.tables[0].metadata.cell_properties[0][0].colspan)  # 1
-print(document.tables[0].metadata.cell_properties[0][0].rowspan)  # 1
-print(document.tables[0].metadata.cell_properties[0][0].invisible)  # False
-
-print(document.tables[1].metadata.cell_properties[0][0].invisible)  # False
-print(document.tables[1].metadata.cell_properties[0][1].invisible)  # True
-
-print(document.tables[1].metadata.cell_properties[0][0].colspan)  # 2
-print(document.tables[1].metadata.cell_properties[0][1].colspan)  # 1
-
-print(document.tables[1].cells[0][0])  # Table header
-print(document.tables[1].cells[0][1])  # Table header
-
-print(document.tables[0].metadata.uid)  # f2f08354fc2dbcb5ded8885479f498a6
-print(document.lines[3].line)  # Bold, italic, small text.
-print(document.lines[3].annotations[-1])  # Table(0:26, f2f08354fc2dbcb5ded8885479f498a6)
-
-print(document.attachments[0].uid)  # attach_6de4dc06-0b75-11ee-a68a-acde48001122
-print(document.attachments[0].original_name)  # image1.png
-print(document.attachments[0].tmp_file_path)  # test_dir/1686830947_714.png
-print(document.attachments[0].need_content_analysis)  # False
-
-print(document.attachments[0].uid)  # attach_6de4dc06-0b75-11ee-a68a-acde48001122
-print(document.lines[5].line)  # More text.
-print(document.lines[5].annotations[-2])  # Attachment(0:10, attach_6de4dc06-0b75-11ee-a68a-acde48001122)
-
+# The expressions below are evaluated without 'print' statements or intermediate variables; the expected value of each is given in its trailing comment.
+list(vars(document))  # ['tables', 'lines', 'attachments', 'warnings', 'metadata']
+
+document.lines[0].line  # Document example
+document.lines[0].metadata.tag_hierarchy_level.line_type  # header
+document.lines[0].annotations[0]  # Indentation(0:16, 0)
+document.lines[0].annotations[3]  # Style(0:16, Title)
+
+document.lines[3].annotations[4]  # Size(0:14, 16.0)
+document.lines[3].annotations[5]  # Size(19:26, 16.0)
+document.lines[3].annotations[6]  # Bold(0:4, True)
+document.lines[3].annotations[7]  # Italic(6:12, True)
+document.lines[3].annotations[8]  # Size(14:19, 10.0)
+
+document.tables[0].cells[0][0]  # N
+document.tables[0].cells[1][3]  # Cell3
+document.tables[1].cells[3]  # ['Text 3', 'Text 4']
+document.tables[0].metadata.uid  # f2f08354fc2dbcb5ded8885479f498a6
+document.tables[0].metadata.cell_properties[0][0].colspan  # 1
+document.tables[0].metadata.cell_properties[0][0].rowspan  # 1
+document.tables[0].metadata.cell_properties[0][0].invisible  # False
+document.tables[1].metadata.cell_properties[0][0].invisible  # False
+document.tables[1].metadata.cell_properties[0][1].invisible  # True
+document.tables[1].metadata.cell_properties[0][0].colspan  # 2
+document.tables[1].metadata.cell_properties[0][1].colspan  # 1
+document.tables[1].cells[0][0]  # Table header
+document.tables[1].cells[0][1]  # Table header
+document.tables[0].metadata.uid  # f2f08354fc2dbcb5ded8885479f498a6
+document.lines[3].line  # Bold, italic, small text.
+document.lines[3].annotations[-1]  # Table(0:26, f2f08354fc2dbcb5ded8885479f498a6)
+
+document.attachments[0].uid  # attach_6de4dc06-0b75-11ee-a68a-acde48001122
+document.attachments[0].original_name  # image1.png
+document.attachments[0].tmp_file_path  # test_dir/1686830947_714.png
+document.attachments[0].need_content_analysis  # False
+document.attachments[0].uid  # attach_6de4dc06-0b75-11ee-a68a-acde48001122
+document.lines[5].line  # More text.
+document.lines[5].annotations[-2]  # Attachment(0:10, attach_6de4dc06-0b75-11ee-a68a-acde48001122)
 
 """Using metadata extractors"""
-from dedoc.metadata_extractors import DocxMetadataExtractor
-
 metadata_extractor = DocxMetadataExtractor()
 metadata_extractor.can_extract(document, file_dir, file_name, file_name, file_name)  # True
 document = metadata_extractor.add_metadata(document, file_dir, file_name, file_name, file_name)
-print(document.metadata)  # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795, 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '', 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726, 'last_printed_date': None}}
+document.metadata  # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795,
+# 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '',
+# 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726,
+# 'last_printed_date': None}}
 
 
 """Using attachments extractors"""
-from dedoc.attachments_extractors import DocxAttachmentsExtractor
-
 attachments_extractor = DocxAttachmentsExtractor()
 attachments_extractor.can_extract(file_extension, file_mime)  # True
 attachments = attachments_extractor.get_attachments(file_dir, file_name, {})
-print(attachments[0])  # <dedoc.data_structures.AttachedFile>
+attachments[0]  # <dedoc.data_structures.AttachedFile>
 
 
 """Using structure extractors"""
-from dedoc.structure_extractors import DefaultStructureExtractor
-
 structure_extractor = DefaultStructureExtractor()
-print(document.lines[0].metadata.hierarchy_level)  # None
+document.lines[0].metadata.hierarchy_level  # None
 document = structure_extractor.extract_structure(document, {})
-print(document.lines[0].metadata.hierarchy_level)  # HierarchyLevel(level_1=1, level_2=1, can_be_multiline=False, line_type=header)
+document.lines[0].metadata.hierarchy_level  # HierarchyLevel(level_1=1, level_2=1, can_be_multiline=False, line_type=header)
 
 
 """Using structure constructors"""
-from dedoc.structure_constructors import TreeConstructor
-
 constructor = TreeConstructor()
 parsed_document = constructor.structure_document(document)
-print(parsed_document)  # <dedoc.data_structures.ParsedDocument>
-print(list(vars(parsed_document)))  # ['metadata', 'content', 'attachments', 'version', 'warnings']
+parsed_document  # <dedoc.data_structures.ParsedDocument>
+list(vars(parsed_document))  # ['metadata', 'content', 'attachments', 'version', 'warnings']
 
-print(list(vars(parsed_document.content)))  # ['tables', 'structure', 'warnings']
-print(list(vars(parsed_document.content.structure)))  # ['node_id', 'text', 'annotations', 'metadata', 'subparagraphs', 'parent']
-print(parsed_document.content.structure.subparagraphs[0].text)  # Document example
+list(vars(parsed_document.content))  # ['tables', 'structure', 'warnings']
+list(vars(parsed_document.content.structure))  # ['node_id', 'text', 'annotations', 'metadata', 'subparagraphs', 'parent']
+parsed_document.content.structure.subparagraphs[0].text  # Document example
 
 
 """Run the whole pipeline"""
-from dedoc import DedocManager
-
 manager = DedocManager()
 result = manager.parse(file_path=file_path, parameters={})
 
-print(result)  # <dedoc.data_structures.ParsedDocument>
-print(result.to_dict())  # OrderedDict([('version', ''), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ('text', ''), ('annotations', []), ('metadata', OrderedDict([('page_id', 0), ('line_id', 0), ('paragraph_type', 'root'), ('other_fields', {})])), ...
+result  # <dedoc.data_structures.ParsedDocument>
+result.to_dict()  # OrderedDict([('version', ''), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ('text', ''),
+# ('annotations', []), ('metadata', OrderedDict([('page_id', 0), ('line_id', 0), ('paragraph_type', 'root'), ('other_fields', {})])), ...
diff --git a/docs/source/_static/code_examples/djvu_converter.py b/docs/source/_static/code_examples/djvu_converter.py
@@ -0,0 +1,47 @@
+import os
+import subprocess
+from typing import Optional
+
+from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter
+
+
+class DjvuConverter(AbstractConverter):
+    """
+    Example converter that turns DjVu documents into PDF files.
+
+    The conversion itself is delegated to the ``ddjvu`` command-line utility, which is invoked by name and therefore must be available on PATH.
+    """
+
+    def __init__(self, config: dict) -> None:
+        """
+        :param config: configuration dictionary forwarded to the AbstractConverter constructor
+        """
+        super().__init__(config=config)
+
+    def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
+        """
+        Tell whether this converter accepts the file; only the extension is examined.
+
+        :param extension: file extension with the leading dot, for example ".djvu"
+        :param mime: MIME type of the file (not used)
+        :param parameters: additional parameters (not used)
+        :return: True if the extension is ".djvu" in any letter case
+        """
+        return extension.lower() == ".djvu"
+
+    def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str:
+        """
+        Convert the file <tmp_dir>/<filename><extension> to <tmp_dir>/<filename>.pdf using ddjvu.
+
+        :param tmp_dir: directory that holds the source file and receives the converted one
+        :param filename: name of the source file without the extension
+        :param extension: extension of the source file with the leading dot
+        :return: name of the converted file (without the directory)
+        :raises FileNotFoundError: if the ddjvu executable cannot be found
+        """
+        pdf_name = filename + ".pdf"
+        source_path = os.path.join(tmp_dir, filename + extension)
+        # An argument list (no shell) keeps paths with spaces or shell metacharacters intact and prevents command injection
+        subprocess.run(["ddjvu", "-format=pdf", source_path, os.path.join(tmp_dir, pdf_name)])
+        self._await_for_conversion(pdf_name, tmp_dir)
+        return pdf_name