Skip to content

Commit

Permalink
TLDR-451 tutorial new doc type (#331)
Browse files Browse the repository at this point in the history
* docs added

* add code testing

* some fixes

* some fixes

* add tabula and some fixes

* add python-djvulibre

* delete python-djvulibre and add djvulibre-bin

* add poppler-utils

* add tesseract

* some fixes

* flake8 stylefix

* fix docs after flake8

* update last part of adding_new_doc_type_tutorial

* rewrite dedoc_add_new_doc_type_tutorial

* minor fixes

* minor fixes

* minor fixes

* some fixes

* add more code examples

* some fixes

---------

Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru>
Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
  • Loading branch information
3 people authored Sep 25, 2023
1 parent 8fe955f commit db202e7
Show file tree
Hide file tree
Showing 15 changed files with 513 additions and 137 deletions.
1 change: 0 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ exclude =
resources,
dedoc/scripts,
examples,
docs,
venv,
build,
dedoc.egg-info
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:

- name: Install dependencies
run: |
sudo apt-get install -y libreoffice
sudo apt-get install -y libreoffice djvulibre-bin poppler-utils tesseract-ocr libtesseract-dev tesseract-ocr-rus tesseract-ocr-eng
python -m pip install --upgrade --no-cache-dir pip setuptools
python -m pip install --exists-action=w --no-cache-dir -r requirements.txt
python -m pip install --upgrade --upgrade-strategy eager --no-cache-dir .[torch,docs]
Expand All @@ -30,3 +30,4 @@ jobs:
python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build
cd docs/source/_static/code_examples
python dedoc_usage_tutorial.py
python dedoc_add_new_doc_type_tutorial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import mimetypes
import os

from djvu_converter import DjvuConverter
from pdf_reader import PdfReader

from dedoc import DedocManager
from dedoc.attachments_handler import AttachmentsHandler
from dedoc.converters import FileConverterComposition
from dedoc.metadata_extractors import BaseMetadataExtractor, DocxMetadataExtractor, MetadataExtractorComposition
from dedoc.readers import ReaderComposition
from dedoc.structure_constructors import LinearConstructor, StructureConstructorComposition, TreeConstructor
from dedoc.structure_extractors import DefaultStructureExtractor, StructureExtractorComposition


file_dir, file_name = "test_dir", "example_with_table.djvu"  # DjVu sample used for the converter calls below
file_path = os.path.join(file_dir, file_name)


djvu_converter = DjvuConverter(config=dict())  # custom converter imported from djvu_converter.py
pdf_reader = PdfReader()  # custom reader imported from pdf_reader.py

name_wo_extension, file_extension = os.path.splitext(file_name)  # file_extension keeps the leading dot
file_mime = mimetypes.guess_type(file_path)[0]  # guess_type returns (type, encoding); only the type is kept, None if it cannot be guessed

djvu_converter.can_convert(file_extension, file_mime)  # True
djvu_converter.do_convert(file_dir, name_wo_extension, file_extension)  # 'example_with_table.pdf'

file_dir, file_name = "test_dir", "example_with_attachments_depth_1.pdf"  # PDF sample used for the reader and the manager below
file_path = os.path.join(file_dir, file_name)

name_wo_extension, file_extension = os.path.splitext(file_name)
file_mime = mimetypes.guess_type(file_path)[0]
pdf_reader.can_read(file_path, file_mime, file_extension)  # True

pdf_reader.read(file_path, parameters={"with_attachments": "true"})  # <dedoc.data_structures.UnstructuredDocument>

document = pdf_reader.read(file_path, parameters={"with_attachments": "true"})  # same call as above, result kept for the inspection lines below
list(vars(document))  # ['tables', 'lines', 'attachments', 'warnings', 'metadata']
len(document.attachments)  # 2
len(document.lines)  # 15

"""Adding the implemented handlers to the manager config"""
config = {}  # the same empty config is handed to the converter, the attachments handler and the manager
manager_config = dict(
converter=FileConverterComposition(converters=[DjvuConverter(config=config)]),
reader=ReaderComposition(readers=[PdfReader()]),
structure_extractor=StructureExtractorComposition(extractors={DefaultStructureExtractor.document_type: DefaultStructureExtractor()}, default_key="other"),
structure_constructor=StructureConstructorComposition(
constructors={"linear": LinearConstructor(), "tree": TreeConstructor()},
default_constructor=LinearConstructor()
),
document_metadata_extractor=MetadataExtractorComposition(extractors=[DocxMetadataExtractor(), BaseMetadataExtractor()]),
attachments_handler=AttachmentsHandler(config=config),
)

manager = DedocManager(config=config, manager_config=manager_config)  # manager assembled from the custom converter and reader configured above
result = manager.parse(file_path=file_path, parameters={"with_attachments": "true"})  # file_path still points at the PDF sample

result  # <dedoc.data_structures.ParsedDocument>
result.to_dict()  # OrderedDict([('version', '0.11.2'), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ...
36 changes: 18 additions & 18 deletions docs/source/_static/code_examples/dedoc_return_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,60 +6,60 @@


def basic_example() -> dict:
with open(filename, 'rb') as file:
files = {'file': (filename, file)}
with open(filename, "rb") as file:
files = {"file": (filename, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=dict())
result = r.content.decode('utf-8')
result = r.content.decode("utf-8")

assert r.status_code == 200
return json.loads(result)


def linear_structure_type_example() -> dict:
with open(filename, 'rb') as file:
files = {'file': (filename, file)}
with open(filename, "rb") as file:
files = {"file": (filename, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=dict(structure_type="linear"))
result = r.content.decode('utf-8')
result = r.content.decode("utf-8")

assert r.status_code == 200
return json.loads(result)


def with_attachments_example() -> dict:
with open(filename, 'rb') as file:
files = {'file': (filename, file)}
with open(filename, "rb") as file:
files = {"file": (filename, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=dict(with_attachments="true"))
result = r.content.decode('utf-8')
result = r.content.decode("utf-8")

assert r.status_code == 200
return json.loads(result)


def with_base64_attachments_example() -> dict:
with open(filename, 'rb') as file:
files = {'file': (filename, file)}
with open(filename, "rb") as file:
files = {"file": (filename, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=dict(with_attachments="true", return_base64="true"))
result = r.content.decode('utf-8')
result = r.content.decode("utf-8")

assert r.status_code == 200
return json.loads(result)


def with_parsed_attachments_example() -> dict:
with open(filename, 'rb') as file:
files = {'file': (filename, file)}
with open(filename, "rb") as file:
files = {"file": (filename, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=dict(with_attachments="true", need_content_analysis="true"))
result = r.content.decode('utf-8')
result = r.content.decode("utf-8")

assert r.status_code == 200
return json.loads(result)


def with_inserted_table_example() -> dict:
with open(filename, 'rb') as file:
files = {'file': (filename, file)}
with open(filename, "rb") as file:
files = {"file": (filename, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=dict(insert_table="true"))
result = r.content.decode('utf-8')
result = r.content.decode("utf-8")

assert r.status_code == 200
return json.loads(result)
Expand Down
140 changes: 63 additions & 77 deletions docs/source/_static/code_examples/dedoc_usage_tutorial.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
# noqa
"""Using converters."""
import mimetypes
import os

from dedoc import DedocManager
from dedoc.attachments_extractors import DocxAttachmentsExtractor
from dedoc.converters import DocxConverter
from dedoc.metadata_extractors import DocxMetadataExtractor
from dedoc.readers import DocxReader
from dedoc.structure_constructors import TreeConstructor
from dedoc.structure_extractors import DefaultStructureExtractor

"""Using converters."""
converter = DocxConverter(config={})

import os
import mimetypes

file_dir, file_name = "test_dir", "example.odt"
file_path = os.path.join(file_dir, file_name)

Expand All @@ -16,15 +21,9 @@
converter.can_convert(file_extension, file_mime) # True
converter.do_convert(file_dir, name_wo_extension, file_extension) # 'example.docx'


"""Using readers."""
from dedoc.readers import DocxReader

reader = DocxReader(config={})

import os
import mimetypes

file_dir, file_name = "test_dir", "example.docx"
file_path = os.path.join(file_dir, file_name)

Expand All @@ -35,97 +34,84 @@
reader.read(file_path, parameters={"with_attachments": "true"}) # <dedoc.data_structures.UnstructuredDocument>

document = reader.read(file_path, parameters={"with_attachments": "true"})
print(list(vars(document))) # ['tables', 'lines', 'attachments', 'warnings', 'metadata']

print(document.lines[0].line) # Document example
print(document.lines[0].metadata.tag_hierarchy_level.line_type) # header

print(document.lines[0].annotations[0]) # Indentation(0:16, 0)
print(document.lines[0].annotations[3]) # Style(0:16, Title)

print(document.lines[3].annotations[4]) # Size(0:14, 16.0)
print(document.lines[3].annotations[5]) # Size(19:26, 16.0)
print(document.lines[3].annotations[6]) # Bold(0:4, True)
print(document.lines[3].annotations[7]) # Italic(6:12, True)
print(document.lines[3].annotations[8]) # Size(14:19, 10.0)

print(document.tables[0].cells[0][0]) # N
print(document.tables[0].cells[1][3]) # Cell3
print(document.tables[1].cells[3]) # ['Text 3', 'Text 4']

print(document.tables[0].metadata.uid) # f2f08354fc2dbcb5ded8885479f498a6
print(document.tables[0].metadata.cell_properties[0][0].colspan) # 1
print(document.tables[0].metadata.cell_properties[0][0].rowspan) # 1
print(document.tables[0].metadata.cell_properties[0][0].invisible) # False

print(document.tables[1].metadata.cell_properties[0][0].invisible) # False
print(document.tables[1].metadata.cell_properties[0][1].invisible) # True

print(document.tables[1].metadata.cell_properties[0][0].colspan) # 2
print(document.tables[1].metadata.cell_properties[0][1].colspan) # 1

print(document.tables[1].cells[0][0]) # Table header
print(document.tables[1].cells[0][1]) # Table header

print(document.tables[0].metadata.uid) # f2f08354fc2dbcb5ded8885479f498a6
print(document.lines[3].line) # Bold, italic, small text.
print(document.lines[3].annotations[-1]) # Table(0:26, f2f08354fc2dbcb5ded8885479f498a6)

print(document.attachments[0].uid) # attach_6de4dc06-0b75-11ee-a68a-acde48001122
print(document.attachments[0].original_name) # image1.png
print(document.attachments[0].tmp_file_path) # test_dir/1686830947_714.png
print(document.attachments[0].need_content_analysis) # False

print(document.attachments[0].uid) # attach_6de4dc06-0b75-11ee-a68a-acde48001122
print(document.lines[5].line) # More text.
print(document.lines[5].annotations[-2]) # Attachment(0:10, attach_6de4dc06-0b75-11ee-a68a-acde48001122)

# Access and print the values without using 'result' variables or 'print' statements.
list(vars(document)) # ['tables', 'lines', 'attachments', 'warnings', 'metadata']

document.lines[0].line # Document example
document.lines[0].metadata.tag_hierarchy_level.line_type # header
document.lines[0].annotations[0] # Indentation(0:16, 0)
document.lines[0].annotations[3] # Style(0:16, Title)

document.lines[3].annotations[4] # Size(0:14, 16.0)
document.lines[3].annotations[5] # Size(19:26, 16.0)
document.lines[3].annotations[6] # Bold(0:4, True)
document.lines[3].annotations[7] # Italic(6:12, True)
document.lines[3].annotations[8] # Size(14:19, 10.0)

document.tables[0].cells[0][0] # N
document.tables[0].cells[1][3] # Cell3
document.tables[1].cells[3] # ['Text 3', 'Text 4']
document.tables[0].metadata.uid # f2f08354fc2dbcb5ded8885479f498a6
document.tables[0].metadata.cell_properties[0][0].colspan # 1
document.tables[0].metadata.cell_properties[0][0].rowspan # 1
document.tables[0].metadata.cell_properties[0][0].invisible # False
document.tables[1].metadata.cell_properties[0][0].invisible # False
document.tables[1].metadata.cell_properties[0][1].invisible # True
document.tables[1].metadata.cell_properties[0][0].colspan # 2
document.tables[1].metadata.cell_properties[0][1].colspan # 1
document.tables[1].cells[0][0] # Table header
document.tables[1].cells[0][1] # Table header
document.tables[0].metadata.uid # f2f08354fc2dbcb5ded8885479f498a6
document.lines[3].line # Bold, italic, small text.
document.lines[3].annotations[-1] # Table(0:26, f2f08354fc2dbcb5ded8885479f498a6)

document.attachments[0].uid # attach_6de4dc06-0b75-11ee-a68a-acde48001122
document.attachments[0].original_name # image1.png
document.attachments[0].tmp_file_path # test_dir/1686830947_714.png
document.attachments[0].need_content_analysis # False
document.attachments[0].uid # attach_6de4dc06-0b75-11ee-a68a-acde48001122
document.lines[5].line # More text.
document.lines[5].annotations[-2] # Attachment(0:10, attach_6de4dc06-0b75-11ee-a68a-acde48001122)

"""Using metadata extractors"""
from dedoc.metadata_extractors import DocxMetadataExtractor

metadata_extractor = DocxMetadataExtractor()
metadata_extractor.can_extract(document, file_dir, file_name, file_name, file_name) # True
document = metadata_extractor.add_metadata(document, file_dir, file_name, file_name, file_name)
print(document.metadata) # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795, 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '', 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726, 'last_printed_date': None}}
document.metadata # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795,
# 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '',
# 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726,
# 'last_printed_date': None}}


"""Using attachments extractors"""
from dedoc.attachments_extractors import DocxAttachmentsExtractor

attachments_extractor = DocxAttachmentsExtractor()
attachments_extractor.can_extract(file_extension, file_mime) # True
attachments = attachments_extractor.get_attachments(file_dir, file_name, {})
print(attachments[0]) # <dedoc.data_structures.AttachedFile>
attachments[0] # <dedoc.data_structures.AttachedFile>


"""Using structure extractors"""
from dedoc.structure_extractors import DefaultStructureExtractor

structure_extractor = DefaultStructureExtractor()
print(document.lines[0].metadata.hierarchy_level) # None
document.lines[0].metadata.hierarchy_level # None
document = structure_extractor.extract_structure(document, {})
print(document.lines[0].metadata.hierarchy_level) # HierarchyLevel(level_1=1, level_2=1, can_be_multiline=False, line_type=header)
document.lines[0].metadata.hierarchy_level # HierarchyLevel(level_1=1, level_2=1, can_be_multiline=False, line_type=header)


"""Using structure constructors"""
from dedoc.structure_constructors import TreeConstructor

constructor = TreeConstructor()
parsed_document = constructor.structure_document(document)
print(parsed_document) # <dedoc.data_structures.ParsedDocument>
print(list(vars(parsed_document))) # ['metadata', 'content', 'attachments', 'version', 'warnings']
parsed_document # <dedoc.data_structures.ParsedDocument>
list(vars(parsed_document)) # ['metadata', 'content', 'attachments', 'version', 'warnings']

print(list(vars(parsed_document.content))) # ['tables', 'structure', 'warnings']
print(list(vars(parsed_document.content.structure))) # ['node_id', 'text', 'annotations', 'metadata', 'subparagraphs', 'parent']
print(parsed_document.content.structure.subparagraphs[0].text) # Document example
list(vars(parsed_document.content)) # ['tables', 'structure', 'warnings']
list(vars(parsed_document.content.structure)) # ['node_id', 'text', 'annotations', 'metadata', 'subparagraphs', 'parent']
parsed_document.content.structure.subparagraphs[0].text # Document example


"""Run the whole pipeline"""
from dedoc import DedocManager

manager = DedocManager()
result = manager.parse(file_path=file_path, parameters={})

print(result) # <dedoc.data_structures.ParsedDocument>
print(result.to_dict()) # OrderedDict([('version', ''), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ('text', ''), ('annotations', []), ('metadata', OrderedDict([('page_id', 0), ('line_id', 0), ('paragraph_type', 'root'), ('other_fields', {})])), ...
result # <dedoc.data_structures.ParsedDocument>
result.to_dict() # OrderedDict([('version', ''), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ('text', ''),
# ('annotations', []), ('metadata', OrderedDict([('page_id', 0), ('line_id', 0), ('paragraph_type', 'root'), ('other_fields', {})])), ...
18 changes: 18 additions & 0 deletions docs/source/_static/code_examples/djvu_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os
import subprocess
from typing import Optional

from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter


class DjvuConverter(AbstractConverter):
    """Converter that turns ``.djvu`` files into PDF with the ``ddjvu`` command-line tool."""

    def __init__(self, config: dict) -> None:
        """Pass ``config`` through to the base converter unchanged."""
        super().__init__(config=config)

    def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
        """
        Tell whether this converter handles a file with the given extension.

        Only ``extension`` is inspected (it must be exactly ``".djvu"``);
        ``mime`` and ``parameters`` are part of the signature but are not used here.
        """
        return extension == ".djvu"

    def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str:
        """
        Convert ``<tmp_dir>/<filename><extension>`` to ``<tmp_dir>/<filename>.pdf``.

        :param tmp_dir: directory that holds the source file and receives the PDF
        :param filename: file name without extension
        :param extension: source file extension, including the leading dot
        :return: name of the converted file (``filename`` + ``".pdf"``), without the directory

        ``ddjvu`` must be available on ``PATH``; if it cannot be started at all,
        ``subprocess.run`` raises ``FileNotFoundError``.
        """
        converted_name = filename + ".pdf"
        source_path = os.path.join(tmp_dir, f"{filename}{extension}")
        target_path = os.path.join(tmp_dir, converted_name)

        # The command is passed as an argument list and no shell is involved, so spaces or
        # shell metacharacters in the directory or file name cannot split or alter the command.
        # The exit status is deliberately not checked here, as in the previous os.system call.
        subprocess.run(["ddjvu", "-format=pdf", source_path, target_path], check=False)

        # NOTE(review): presumably blocks until the converted file shows up in tmp_dir —
        # confirm against AbstractConverter._await_for_conversion.
        self._await_for_conversion(converted_name, tmp_dir)
        return converted_name
Loading

0 comments on commit db202e7

Please sign in to comment.