Skip to content

Commit

Permalink
TLDR-367 refactor metadata extractor (#359)
Browse files Browse the repository at this point in the history
* change add_metadata to extract_metadata in metadata readers

* fix usage of extract_metadata

* fix docs

* change output type to dict

* fix code style

* fix pr

---------

Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru>
  • Loading branch information
Travvy88 and Nikita Shevtsov authored Oct 20, 2023
1 parent bf1a60d commit 29eebb7
Show file tree
Hide file tree
Showing 13 changed files with 102 additions and 132 deletions.
11 changes: 5 additions & 6 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,10 @@ def _handle_attachments(self, document: UnstructuredDocument, parameters: dict)
attachment.tmp_file_path = new_path

def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa
unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[])
attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir,
filename=attachment_name, converted_filename=attachment_name,
original_filename=attachment.get_original_filename(),
parameters=parameters)
metadata = DocumentMetadata(**unstructured_document.metadata)
metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir,
filename=attachment_name, converted_filename=attachment_name,
original_filename=attachment.get_original_filename(),
parameters=parameters)
metadata = DocumentMetadata(**metadata)
return ParsedDocument(content=get_empty_content(), metadata=metadata)
14 changes: 7 additions & 7 deletions dedoc/dedoc_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,13 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
self.logger.info(f"Finish parse file {file_name}")

# Step 3 - Adding meta-information
unstructured_document = self.document_metadata_extractor.add_metadata(document=unstructured_document,
directory=tmp_dir,
filename=unique_filename,
converted_filename=converted_filename,
original_filename=file_name,
parameters=parameters,
other_fields=unstructured_document.metadata)
metadata = self.document_metadata_extractor.extract_metadata(directory=tmp_dir,
filename=unique_filename,
converted_filename=converted_filename,
original_filename=file_name,
parameters=parameters,
other_fields=unstructured_document.metadata)
unstructured_document.metadata = metadata
self.logger.info(f"Add metadata of file {file_name}")

# Step 4 - Extract structure
Expand Down
37 changes: 16 additions & 21 deletions dedoc/metadata_extractors/abstract_metadata_extractor.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
from abc import ABC, abstractmethod
from typing import Optional

from dedoc.data_structures.unstructured_document import UnstructuredDocument


class AbstractMetadataExtractor(ABC):
"""
This class is responsible for extracting metadata from the documents of different formats.
"""
@abstractmethod
def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
Expand All @@ -19,30 +16,28 @@ def can_extract(self,
other_fields: Optional[dict] = None) -> bool:
"""
Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
See the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation for information about the parameters.
"""
pass

@abstractmethod
def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> dict:
"""
Add metadata to the document if possible, i.e. method :meth:`can_extract` returned True.
Extract metadata from the file if possible, i.e. if the method :meth:`can_extract` returned True.
:type document: document content that has been received from some of the readers
:type directory: path to the directory where the original and converted files are located
:type filename: name of the file after renaming (for example 23141.doc). \
:param directory: path to the directory where the original and converted files are located
:param filename: name of the file after renaming (for example 23141.doc). \
The file gets a new name during processing by the dedoc manager (if used)
:type converted_filename: name of the file after renaming and conversion (for example 23141.docx)
:type original_filename: name of the file before renaming
:type parameters: additional parameters for document parsing
:type other_fields: other fields that should be added to the document's metadata
:return: document content with added metadata attribute (dict with information about the document)
:param converted_filename: name of the file after renaming and conversion (for example 23141.docx)
:param original_filename: name of the file before renaming
:param parameters: additional parameters for document parsing
:param other_fields: other fields that should be added to the document's metadata
:return: dict with metadata information about the document
"""
pass
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from base64 import b64encode
from typing import Optional

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.metadata_extractors.abstract_metadata_extractor import AbstractMetadataExtractor
from dedoc.utils.utils import get_file_mime_type

Expand All @@ -22,7 +21,6 @@ class BaseMetadataExtractor(AbstractMetadataExtractor):
"""

def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
Expand All @@ -35,17 +33,16 @@ def can_extract(self,
"""
return True

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> dict:
"""
Gets the basic meta-information about the file.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
See the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation for information about the parameters.
"""
parameters = {} if parameters is None else parameters
meta_info = self._get_base_meta_information(directory, filename, original_filename)
Expand All @@ -59,8 +56,7 @@ def add_metadata(self,

if other_fields is not None and len(other_fields) > 0:
meta_info["other_fields"] = other_fields
document.metadata = meta_info
return document
return meta_info

@staticmethod
def _get_base_meta_information(directory: str, filename: str, name_actual: str) -> dict:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import docx
from docx.opc.exceptions import PackageNotFoundError

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor


Expand All @@ -24,7 +23,6 @@ class DocxMetadataExtractor(BaseMetadataExtractor):
- created, modified and last printed date.
"""
def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
Expand All @@ -37,27 +35,26 @@ def can_extract(self,
"""
return converted_filename.lower().endswith("docx")

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> dict:
"""
Extract the predefined list of metadata for docx documents.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
See the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation for information about the parameters.
"""
parameters = {} if parameters is None else parameters

result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)
result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)

file_path = os.path.join(directory, converted_filename)
docx_other_fields = self._get_docx_fields(file_path)

result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **docx_other_fields}
result["other_fields"] = {**result.get("other_fields", {}), **docx_other_fields}
return result

def __convert_date(self, date: Optional[datetime]) -> Optional[int]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from PIL import ExifTags, Image
from dateutil import parser

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor


Expand Down Expand Up @@ -54,7 +53,6 @@ def __init__(self, *, config: dict) -> None:
}

def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
Expand All @@ -67,25 +65,24 @@ def can_extract(self,
"""
return filename.lower().endswith((".png", ".jpg", ".jpeg"))

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> dict:
"""
Extract the predefined list of metadata for images.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
See the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation for information about the parameters.
"""
result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)
result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)

path = os.path.join(directory, filename)
exif_fields = self._get_exif(path)
if len(exif_fields) > 0:
result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **exif_fields}
result["other_fields"] = {**result.get("other_fields", {}), **exif_fields}
return result

def __encode_exif(self, exif: Union[str, bytes]) -> Optional[str]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import Optional

from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor


Expand All @@ -18,7 +17,6 @@ def __init__(self) -> None:
super().__init__()

def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
Expand All @@ -31,17 +29,16 @@ def can_extract(self,
"""
return filename.lower().endswith(".note.pickle")

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> dict:
"""
Extract the predefined list of metadata for .note.pickle documents.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
See the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation for information about the parameters.
"""

try:
Expand All @@ -59,7 +56,6 @@ def add_metadata(self,
created_time=note_dict["created_time"],
modified_time=note_dict["modified_time"],
other_fields=other_fields)
document.metadata = meta_info
return document
return meta_info
except Exception:
raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken")
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
from dedoc.utils.utils import convert_datetime

Expand Down Expand Up @@ -47,7 +46,6 @@ def __init__(self, *, config: dict) -> None:
self.logger = config.get("logger", logging.getLogger())

def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
Expand All @@ -60,24 +58,23 @@ def can_extract(self,
"""
return filename.lower().endswith(".pdf")

def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: dict = None,
other_fields: Optional[dict] = None) -> dict:
"""
Extract the predefined list of metadata for pdf documents.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
See the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation for information about the parameters.
"""
result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)
result = super().extract_metadata(directory=directory, filename=filename, converted_filename=converted_filename,
original_filename=original_filename, parameters=parameters, other_fields=other_fields)
path = os.path.join(directory, filename)
pdf_fields = self._get_pdf_info(path)
if len(pdf_fields) > 0:
result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **pdf_fields}
result["other_fields"] = {**result.get("other_fields", {}), **pdf_fields}
return result

def _get_pdf_info(self, path: str) -> dict:
Expand Down
Loading

0 comments on commit 29eebb7

Please sign in to comment.