From 436bb361eb6907ff0ed70c51368009d3139c55cb Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 28 Sep 2023 14:34:01 +0300 Subject: [PATCH] TLDR-474 remove is_inserted attribute --- .../static/html_eng/format_description.html | 1 - .../static/html_rus/format_description.html | 1 - dedoc/data_structures/table_metadata.py | 6 +---- .../docx_reader/data_structures/table.py | 2 +- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 2 +- .../json_format_examples/basic_example.json | 13 +++++------ .../linear_structure_type.json | 13 +++++------ .../with_attachments.json | 23 +++++++++---------- .../with_base64_attachments.json | 23 +++++++++---------- .../with_parsed_attachments.json | 23 +++++++++---------- docs/source/dedoc_api_usage/return_format.rst | 10 ++++---- 11 files changed, 53 insertions(+), 64 deletions(-) diff --git a/dedoc/api/static/html_eng/format_description.html b/dedoc/api/static/html_eng/format_description.html index bcdabc55..8447ab52 100644 --- a/dedoc/api/static/html_eng/format_description.html +++ b/dedoc/api/static/html_eng/format_description.html @@ -71,7 +71,6 @@

TableMetadata

  1. uid: str (required field) - unique identifier.
  2. page_id: integer (optional field) - page number on which the table begins. Can be null.
  3. -
  4. is_inserted: bool (optional field) - was table inserted into document.

TreeNode

diff --git a/dedoc/api/static/html_rus/format_description.html b/dedoc/api/static/html_rus/format_description.html index dc43cc34..7f338393 100644 --- a/dedoc/api/static/html_rus/format_description.html +++ b/dedoc/api/static/html_rus/format_description.html @@ -72,7 +72,6 @@

TableMetadata. Метаинформация таблиц
  1. uid: str (обязательное поле) - уникальный идентификатор таблицы.
  2. page_id: int (необязательное поле) - номер страницы, на которой начинается таблица.
  3. -
  4. is_inserted: bool (необязательное поле) - была ли таблица встроена в тело документа.

TreeNode. Древовидная структура документа.

diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index 3c9ea615..6b16c53c 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -11,23 +11,20 @@ class TableMetadata(Serializable): """ This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on. """ - def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False, rotated_angle: float = 0.0) -> None: + def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0) -> None: """ :param page_id: number of the page where table starts :param uid: unique identifier of the table - :param is_inserted: indicator if table was already inserted into paragraphs list :param rotated_angle: value of the rotation angle by which the table was rotated during recognition """ self.page_id = page_id self.uid = str(uuid.uuid1()) if not uid else uid - self.is_inserted = is_inserted self.rotated_angle = rotated_angle def to_dict(self) -> dict: res = OrderedDict() res["uid"] = self.uid res["page_id"] = self.page_id - res["is_inserted"] = self.is_inserted res["rotated_angle"] = self.rotated_angle return res @@ -36,6 +33,5 @@ def get_api_dict(api: Api) -> Model: return api.model("TableMetadata", { "page_id": fields.Integer(readonly=False, description="table start page number"), "uid": fields.String(description="table unique id"), - "is_inserted": fields.Boolean(description="was the table inserted into document body"), "rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes") }) diff --git a/dedoc/readers/docx_reader/data_structures/table.py b/dedoc/readers/docx_reader/data_structures/table.py index 46d46485..73a0ad47 100644 --- a/dedoc/readers/docx_reader/data_structures/table.py +++ b/dedoc/readers/docx_reader/data_structures/table.py @@ -90,7 +90,7 @@ def 
to_table(self) -> Table: result_row.append(cell) result_cells_with_meta.append(result_row) - return Table(cells=result_cells_with_meta, metadata=TableMetadata(page_id=None, uid=self.uid, is_inserted=False)) + return Table(cells=result_cells_with_meta, metadata=TableMetadata(page_id=None, uid=self.uid)) def __get_cell_text(self, cell: Tag) -> str: cell_text = "" diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 3692d19e..c746000a 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -152,7 +152,7 @@ def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[Table], List[Sc result_cells.append(result_row) table_bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) # noqa TODO add table location into TableMetadata - tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, is_inserted=False))) + tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number))) table_name = file_hash + str(page_number) + str(table_num) tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order)) diff --git a/docs/source/_static/json_format_examples/basic_example.json b/docs/source/_static/json_format_examples/basic_example.json index a470920e..d5007fc0 100644 --- a/docs/source/_static/json_format_examples/basic_example.json +++ b/docs/source/_static/json_format_examples/basic_example.json @@ -298,7 +298,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "attach_fa1143ae-5d3c-11ee-b518-0242ac120002" + "value": "attach_75af2486-5df1-11ee-bfc1-0242ac120002" } ], "metadata": { @@ -420,20 +420,19 @@ "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "is_inserted": false, "rotated_angle": 0.0 } } ] }, "metadata": { 
- "uid": "doc_uid_auto_fa1f6786-5d3c-11ee-b518-0242ac120002", + "uid": "doc_uid_auto_75c93394-5df1-11ee-bfc1-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695822696_268.docx", + "temporary_file_name": "1695900213_314.docx", "size": 21270, - "modified_time": 1695822696, - "created_time": 1695822696, - "access_time": 1695822696, + "modified_time": 1695900213, + "created_time": 1695900213, + "access_time": 1695900213, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", diff --git a/docs/source/_static/json_format_examples/linear_structure_type.json b/docs/source/_static/json_format_examples/linear_structure_type.json index 848053fb..535aa687 100644 --- a/docs/source/_static/json_format_examples/linear_structure_type.json +++ b/docs/source/_static/json_format_examples/linear_structure_type.json @@ -388,7 +388,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "attach_fa23fd78-5d3c-11ee-b518-0242ac120002" + "value": "attach_75d13b70-5df1-11ee-bfc1-0242ac120002" } ], "metadata": { @@ -504,20 +504,19 @@ "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "is_inserted": false, "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": "doc_uid_auto_fa309d08-5d3c-11ee-b518-0242ac120002", + "uid": "doc_uid_auto_75e45e94-5df1-11ee-bfc1-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695822697_827.docx", + "temporary_file_name": "1695900214_259.docx", "size": 21270, - "modified_time": 1695822697, - "created_time": 1695822697, - "access_time": 1695822697, + "modified_time": 1695900213, + "created_time": 1695900213, + "access_time": 1695900214, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", diff --git a/docs/source/_static/json_format_examples/with_attachments.json 
b/docs/source/_static/json_format_examples/with_attachments.json index 053791ce..944837f6 100644 --- a/docs/source/_static/json_format_examples/with_attachments.json +++ b/docs/source/_static/json_format_examples/with_attachments.json @@ -298,7 +298,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "attach_fa355abe-5d3c-11ee-b518-0242ac120002" + "value": "attach_75ea598e-5df1-11ee-bfc1-0242ac120002" } ], "metadata": { @@ -420,20 +420,19 @@ "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "is_inserted": false, "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": "doc_uid_auto_fa4285e0-5d3c-11ee-b518-0242ac120002", + "uid": "doc_uid_auto_75fac01c-5df1-11ee-bfc1-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695822697_953.docx", + "temporary_file_name": "1695900214_51.docx", "size": 21270, - "modified_time": 1695822697, - "created_time": 1695822697, - "access_time": 1695822697, + "modified_time": 1695900214, + "created_time": 1695900214, + "access_time": 1695900214, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -476,13 +475,13 @@ "tables": [] }, "metadata": { - "uid": "attach_fa355abe-5d3c-11ee-b518-0242ac120002", + "uid": "attach_75ea598e-5df1-11ee-bfc1-0242ac120002", "file_name": "image1.png", - "temporary_file_name": "1695822697_181.png", + "temporary_file_name": "1695900214_864.png", "size": 14874, - "modified_time": 1695822697, - "created_time": 1695822697, - "access_time": 1695822697, + "modified_time": 1695900214, + "created_time": 1695900214, + "access_time": 1695900214, "file_type": "image/png", "other_fields": {} }, diff --git a/docs/source/_static/json_format_examples/with_base64_attachments.json b/docs/source/_static/json_format_examples/with_base64_attachments.json index 4fa1a2d3..5b674120 100644 --- a/docs/source/_static/json_format_examples/with_base64_attachments.json +++ 
b/docs/source/_static/json_format_examples/with_base64_attachments.json @@ -298,7 +298,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "attach_fa48dd8c-5d3c-11ee-b518-0242ac120002" + "value": "attach_7604e13c-5df1-11ee-bfc1-0242ac120002" } ], "metadata": { @@ -420,20 +420,19 @@ "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "is_inserted": false, "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": "doc_uid_auto_fa562866-5d3c-11ee-b518-0242ac120002", + "uid": "doc_uid_auto_76126078-5df1-11ee-bfc1-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695822697_293.docx", + "temporary_file_name": "1695900214_267.docx", "size": 21270, - "modified_time": 1695822697, - "created_time": 1695822697, - "access_time": 1695822697, + "modified_time": 1695900214, + "created_time": 1695900214, + "access_time": 1695900214, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -476,13 +475,13 @@ "tables": [] }, "metadata": { - "uid": "attach_fa48dd8c-5d3c-11ee-b518-0242ac120002", + "uid": "attach_7604e13c-5df1-11ee-bfc1-0242ac120002", "file_name": "image1.png", - "temporary_file_name": "1695822697_915.png", + "temporary_file_name": "1695900214_880.png", "size": 14874, - "modified_time": 1695822697, - "created_time": 1695822697, - "access_time": 1695822697, + "modified_time": 1695900214, + "created_time": 1695900214, + "access_time": 1695900214, "file_type": "image/png", "base64_encode": "", "other_fields": { diff --git a/docs/source/_static/json_format_examples/with_parsed_attachments.json b/docs/source/_static/json_format_examples/with_parsed_attachments.json index 2c519c3c..b8c33290 100644 --- a/docs/source/_static/json_format_examples/with_parsed_attachments.json +++ b/docs/source/_static/json_format_examples/with_parsed_attachments.json @@ -298,7 +298,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": 
"attach_fa5c54ac-5d3c-11ee-b518-0242ac120002" + "value": "attach_7619127e-5df1-11ee-bfc1-0242ac120002" } ], "metadata": { @@ -420,20 +420,19 @@ "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, - "is_inserted": false, "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": "doc_uid_auto_fa7fdbc0-5d3c-11ee-b518-0242ac120002", + "uid": "doc_uid_auto_7630e7b4-5df1-11ee-bfc1-0242ac120002", "file_name": "example_return_format.docx", - "temporary_file_name": "1695822697_469.docx", + "temporary_file_name": "1695900214_402.docx", "size": 21270, - "modified_time": 1695822697, - "created_time": 1695822697, - "access_time": 1695822697, + "modified_time": 1695900214, + "created_time": 1695900214, + "access_time": 1695900214, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -913,13 +912,13 @@ "tables": [] }, "metadata": { - "uid": "attach_fa5c54ac-5d3c-11ee-b518-0242ac120002", + "uid": "attach_7619127e-5df1-11ee-bfc1-0242ac120002", "file_name": "image1.png", - "temporary_file_name": "1695822697_301.png", + "temporary_file_name": "1695900214_972.png", "size": 14874, - "modified_time": 1695822697, - "created_time": 1695822697, - "access_time": 1695822697, + "modified_time": 1695900214, + "created_time": 1695900214, + "access_time": 1695900214, "file_type": "image/png", "rotated_page_angles": [ 0 diff --git a/docs/source/dedoc_api_usage/return_format.rst b/docs/source/dedoc_api_usage/return_format.rst index c50c6daf..70d37ff0 100644 --- a/docs/source/dedoc_api_usage/return_format.rst +++ b/docs/source/dedoc_api_usage/return_format.rst @@ -64,13 +64,13 @@ The beginning of the document's metadata: .. literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 429-437 + :lines: 428-436 The document's attachments: .. 
literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 459 + :lines: 458 As we see, the `attachments` field is empty because the option `with_attachments` is set to `"false"` by default (see :ref:`table_parameters`). @@ -118,7 +118,7 @@ Unlike the previous examples, in this case we have `attachments` field filled: .. literalinclude:: ../_static/json_format_examples/with_attachments.json :language: json - :lines: 459-491 + :lines: 458-490 Example with base64 attachments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -136,7 +136,7 @@ The only difference is in the attachment's metadata: attachment's content is enc .. literalinclude:: ../_static/json_format_examples/with_base64_attachments.json :language: json - :lines: 459-494 + :lines: 458-493 Example with parsed attachments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -156,4 +156,4 @@ The beginning of the document's attachments: .. literalinclude:: ../_static/json_format_examples/with_parsed_attachments.json :language: json - :lines: 459-484 + :lines: 458-483