diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index 37f4f98c..9b71a20d 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -95,11 +95,10 @@ def _handle_attachments(self, document: UnstructuredDocument, parameters: dict) attachment.tmp_file_path = new_path def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa - unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[]) attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path()) - unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir, - filename=attachment_name, converted_filename=attachment_name, - original_filename=attachment.get_original_filename(), - parameters=parameters) - metadata = DocumentMetadata(**unstructured_document.metadata) + metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir, + filename=attachment_name, converted_filename=attachment_name, + original_filename=attachment.get_original_filename(), + parameters=parameters) + metadata = DocumentMetadata(**metadata) return ParsedDocument(content=get_empty_content(), metadata=metadata) diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py index ee308f1a..1ee0b947 100644 --- a/dedoc/dedoc_manager.py +++ b/dedoc/dedoc_manager.py @@ -103,13 +103,13 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) self.logger.info(f"Finish parse file {file_name}") # Step 3 - Adding meta-information - unstructured_document = self.document_metadata_extractor.add_metadata(document=unstructured_document, - directory=tmp_dir, - filename=unique_filename, - converted_filename=converted_filename, - original_filename=file_name, - parameters=parameters, - 
other_fields=unstructured_document.metadata) + metadata = self.document_metadata_extractor.extract_metadata(directory=tmp_dir, + filename=unique_filename, + converted_filename=converted_filename, + original_filename=file_name, + parameters=parameters, + other_fields=unstructured_document.metadata) + unstructured_document.metadata = metadata self.logger.info(f"Add metadata of file {file_name}") # Step 4 - Extract structure diff --git a/dedoc/metadata_extractors/abstract_metadata_extractor.py b/dedoc/metadata_extractors/abstract_metadata_extractor.py index 6346d155..602ee68e 100644 --- a/dedoc/metadata_extractors/abstract_metadata_extractor.py +++ b/dedoc/metadata_extractors/abstract_metadata_extractor.py @@ -1,8 +1,6 @@ from abc import ABC, abstractmethod from typing import Optional -from dedoc.data_structures.unstructured_document import UnstructuredDocument - class AbstractMetadataExtractor(ABC): """ @@ -10,7 +8,6 @@ class AbstractMetadataExtractor(ABC): """ @abstractmethod def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -19,30 +16,28 @@ def can_extract(self, other_fields: Optional[dict] = None) -> bool: """ Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. 
""" pass @abstractmethod - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ - Add metadata to the document if possible, i.e. method :meth:`can_extract` returned True. + Extract metadata from file if possible, i.e. method :meth:`can_extract` returned True. - :type document: document content that has been received from some of the readers - :type directory: path to the directory where the original and converted files are located - :type filename: name of the file after renaming (for example 23141.doc). \ + :param directory: path to the directory where the original and converted files are located + :param filename: name of the file after renaming (for example 23141.doc). 
\ The file gets a new name during processing by the dedoc manager (if used) - :type converted_filename: name of the file after renaming and conversion (for example 23141.docx) - :type original_filename: name of the file before renaming - :type parameters: additional parameters for document parsing - :type other_fields: other fields that should be added to the document's metadata - :return: document content with added metadata attribute (dict with information about the document) + :param converted_filename: name of the file after renaming and conversion (for example 23141.docx) + :param original_filename: name of the file before renaming + :param parameters: additional parameters for document parsing + :param other_fields: other fields that should be added to the document's metadata + :return: dict with metadata information about the document """ pass diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py index f545ee1e..e685becc 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py @@ -2,7 +2,6 @@ from base64 import b64encode from typing import Optional -from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor from dedoc.utils.utils import get_file_mime_type @@ -22,7 +21,6 @@ class BaseMetadataExtractor(AbstractMetadataExtractor): """ def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -35,17 +33,16 @@ def can_extract(self, """ return True - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: 
Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ Gets the basic meta-information about the file. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. """ parameters = {} if parameters is None else parameters meta_info = self._get_base_meta_information(directory, filename, original_filename) @@ -59,8 +56,7 @@ def add_metadata(self, if other_fields is not None and len(other_fields) > 0: meta_info["other_fields"] = other_fields - document.metadata = meta_info - return document + return meta_info @staticmethod def _get_base_meta_information(directory: str, filename: str, name_actual: str) -> dict: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py index 377cba55..49b87001 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py @@ -5,7 +5,6 @@ import docx from docx.opc.exceptions import PackageNotFoundError -from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -24,7 +23,6 @@ class DocxMetadataExtractor(BaseMetadataExtractor): - created, modified and last printed date. 
""" def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -37,27 +35,26 @@ def can_extract(self, """ return converted_filename.lower().endswith("docx") - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: dict = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the docx documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. """ parameters = {} if parameters is None else parameters - result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, + original_filename=original_filename, parameters=parameters, other_fields=other_fields) file_path = os.path.join(directory, converted_filename) docx_other_fields = self._get_docx_fields(file_path) - result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **docx_other_fields} + result["other_fields"] = {**result.get("other_fields", {}), **docx_other_fields} return result def __convert_date(self, date: Optional[datetime]) -> Optional[int]: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py 
b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index ac573d02..31062c72 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -7,7 +7,6 @@ from PIL import ExifTags, Image from dateutil import parser -from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -54,7 +53,6 @@ def __init__(self, *, config: dict) -> None: } def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -67,25 +65,24 @@ def can_extract(self, """ return filename.lower().endswith((".png", ".jpg", ".jpeg")) - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: dict = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for images. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. 
""" - result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, + original_filename=original_filename, parameters=parameters, other_fields=other_fields) path = os.path.join(directory, filename) exif_fields = self._get_exif(path) if len(exif_fields) > 0: - result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **exif_fields} + result["other_fields"] = {**result.get("other_fields", {}), **exif_fields} return result def __encode_exif(self, exif: Union[str, bytes]) -> Optional[str]: diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index 2708e5e6..18b49d6b 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -3,7 +3,6 @@ from typing import Optional from dedoc.common.exceptions.bad_file_error import BadFileFormatError -from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -18,7 +17,6 @@ def __init__(self) -> None: super().__init__() def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -31,17 +29,16 @@ def can_extract(self, """ return filename.lower().endswith(".note.pickle") - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def 
extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: dict = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the .note.pickle documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. """ try: @@ -59,7 +56,6 @@ def add_metadata(self, created_time=note_dict["created_time"], modified_time=note_dict["modified_time"], other_fields=other_fields) - document.metadata = meta_info - return document + return meta_info except Exception: raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken") diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index 96682fc0..274a8d26 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -5,7 +5,6 @@ from PyPDF2 import PdfFileReader from PyPDF2.utils import PdfReadError -from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor from dedoc.utils.utils import convert_datetime @@ -47,7 +46,6 @@ def __init__(self, *, config: dict) -> None: self.logger = config.get("logger", logging.getLogger()) def can_extract(self, - document: UnstructuredDocument, directory: str, filename: str, converted_filename: str, @@ -60,24 +58,23 @@ def can_extract(self, """ return filename.lower().endswith(".pdf") - def add_metadata(self, - document: UnstructuredDocument, - 
directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: dict = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: dict = None, + other_fields: Optional[dict] = None) -> dict: """ Add the predefined list of metadata for the pdf documents. - Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. + Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters. """ - result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename, - original_filename=original_filename, parameters=parameters, other_fields=other_fields) + result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename, + original_filename=original_filename, parameters=parameters, other_fields=other_fields) path = os.path.join(directory, filename) pdf_fields = self._get_pdf_info(path) if len(pdf_fields) > 0: - result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **pdf_fields} + result["other_fields"] = {**result.get("other_fields", {}), **pdf_fields} return result def _get_pdf_info(self, path: str) -> dict: diff --git a/dedoc/metadata_extractors/metadata_extractor_composition.py b/dedoc/metadata_extractors/metadata_extractor_composition.py index 68e308c9..e9c182d4 100644 --- a/dedoc/metadata_extractors/metadata_extractor_composition.py +++ b/dedoc/metadata_extractors/metadata_extractor_composition.py @@ -1,6 +1,5 @@ from typing import List, Optional -from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor 
@@ -13,36 +12,33 @@ class MetadataExtractorComposition: """ def __init__(self, extractors: List[AbstractMetadataExtractor]) -> None: """ - :param extractors: the list of extractors with methods can_extract() and add_metadata() to extract metadata from file + :param extractors: the list of extractors with methods can_extract() and extract_metadata() to extract metadata from file """ self.extractors = extractors - def add_metadata(self, - document: UnstructuredDocument, - directory: str, - filename: str, - converted_filename: str, - original_filename: str, - parameters: Optional[dict] = None, - other_fields: Optional[dict] = None) -> UnstructuredDocument: + def extract_metadata(self, + directory: str, + filename: str, + converted_filename: str, + original_filename: str, + parameters: Optional[dict] = None, + other_fields: Optional[dict] = None) -> dict: """ - Add metadata to the document using one of the extractors if suitable extractor was found. - Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` of the class + Extract metadata using one of the extractors if suitable extractor was found. + Look to the method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` of the class :class:`~dedoc.metadata_extractors.AbstractMetadataExtractor` documentation to get the information about method's parameters. 
""" for extractor in self.extractors: - if extractor.can_extract(document=document, - directory=directory, + if extractor.can_extract(directory=directory, filename=filename, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, other_fields=other_fields): - return extractor.add_metadata(document=document, - directory=directory, - filename=filename, - converted_filename=converted_filename, - original_filename=original_filename, - parameters=parameters, - other_fields=other_fields) + return extractor.extract_metadata(directory=directory, + filename=filename, + converted_filename=converted_filename, + original_filename=original_filename, + parameters=parameters, + other_fields=other_fields) raise Exception(f"Can't extract metadata from from file {filename}") diff --git a/docs/source/_static/code_examples/dedoc_usage_tutorial.py b/docs/source/_static/code_examples/dedoc_usage_tutorial.py index 52bb29b4..122ea40e 100644 --- a/docs/source/_static/code_examples/dedoc_usage_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_usage_tutorial.py @@ -75,8 +75,8 @@ """Using metadata extractors""" metadata_extractor = DocxMetadataExtractor() -metadata_extractor.can_extract(document, file_dir, file_name, file_name, file_name) # True -document = metadata_extractor.add_metadata(document, file_dir, file_name, file_name, file_name) +metadata_extractor.can_extract(file_dir, file_name, file_name, file_name) # True +document.metadata = metadata_extractor.extract_metadata(file_dir, file_name, file_name, file_name) document.metadata # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795, # 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '', # 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726, diff 
--git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst index 7eea3c3e..1114cb87 100644 --- a/docs/source/getting_started/usage.rst +++ b/docs/source/getting_started/usage.rst @@ -200,7 +200,7 @@ the metadata extractor can extract metadata from the given file: :language: python :lines: 78 -To extract metadata, one can add them to the document using :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.add_metadata` method. +To extract metadata, one can call the :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.extract_metadata` method and assign the returned dict to the document's ``metadata`` attribute. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python diff --git a/examples/create_unstructured_document.py b/examples/create_unstructured_document.py index 0da38dc9..cf724e1a 100644 --- a/examples/create_unstructured_document.py +++ b/examples/create_unstructured_document.py @@ -58,8 +58,5 @@ # HierarchyLevel(1, 1) for 1. # HierarchyLevel(1, 2) for 1.1. # HierarchyLevel(1, 4) for 1.2.1.1.
and so on -unstructured_document = BaseMetadataExtractor().add_metadata(document=unstructured_document, - directory="./", - filename="example.docx", - converted_filename="example.doc", - original_filename="example.docx") +metadata = BaseMetadataExtractor().extract_metadata(directory="./", filename="example.docx", converted_filename="example.doc", original_filename="example.docx") +unstructured_document.metadata = metadata diff --git a/tests/unit_tests/test_doctype_law_txt_reader.py b/tests/unit_tests/test_doctype_law_txt_reader.py index 391075b5..62d3e739 100644 --- a/tests/unit_tests/test_doctype_law_txt_reader.py +++ b/tests/unit_tests/test_doctype_law_txt_reader.py @@ -20,7 +20,7 @@ def test_law_document_spaces_correctness(self) -> None: path = self._get_abs_path("коап_москвы_8_7_2015_utf.txt") directory, filename = os.path.split(path) document = self.txt_reader.read(path=path, document_type="law", parameters={}) - document = self.metadata_extractor.add_metadata(document, directory, filename, filename, filename) + document.metadata = self.metadata_extractor.extract_metadata(directory, filename, filename, filename) document = self.law_extractor.extract_structure(document, {}) self.assertListEqual([], document.attachments)