From d752ad2e4f8f517bfe2266cbc075c541cfe191a0 Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 28 Sep 2023 16:27:44 +0300 Subject: [PATCH] TLDR-475 fix table documentation (#338) * TLDR-475 fix table documentation * Small fixes --- dedoc/data_structures/cell_with_meta.py | 13 +- dedoc/data_structures/table.py | 4 +- dedoc/data_structures/table_metadata.py | 5 +- .../code_examples/dedoc_usage_tutorial.py | 12 +- .../json_format_examples/basic_example.json | 168 +++--- .../linear_structure_type.json | 168 +++--- .../with_attachments.json | 179 ++++--- .../with_base64_attachments.json | 179 ++++--- .../with_inserted_table.json | 168 +++--- .../with_parsed_attachments.json | 506 +++++++++++++++--- docs/source/dedoc_api_usage/return_format.rst | 12 +- docs/source/getting_started/usage.rst | 10 +- examples/README.md | 13 +- examples/create_structured_document.py | 3 +- examples/create_unstructured_document.py | 48 +- examples/example_doc_parser.py | 14 +- examples/example_img_parser.py | 14 +- examples/example_manager_input.py | 3 +- examples/example_pdf_parser.py | 22 +- examples/example_post.py | 3 +- 20 files changed, 1018 insertions(+), 526 deletions(-) diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index ab800f87..2ccbdd38 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -9,20 +9,23 @@ class CellWithMeta: """ - This class holds the information about the cell information: text of the cell, text annotations and cell properties (rowspan, colspan, invisible). + This class holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible). """ def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: """ - :param lines: text lines (LineWithMeta) of the cell - :param colspan: The value of the rowspan attribute represents the number of columns to span. Like HTML format. 
- :param rowspan: The value of the rowspan attribute represents the number of rows to span. Like HTML format. - :param invisible: Display or hide cell values + :param lines: textual lines of the cell + :param colspan: number of columns to span like in HTML format + :param rowspan: number of rows to span like in HTML format + :param invisible: indicator for displaying or hiding cell text """ self.lines = lines self.colspan = colspan self.rowspan = rowspan self.invisible = invisible + def __repr__(self) -> str: + return f"CellWithMeta({self.get_text()[:65]})" + def get_text(self) -> str: return "\n".join([line.line for line in self.lines]) diff --git a/dedoc/data_structures/table.py b/dedoc/data_structures/table.py index f04a04e4..c92125ea 100644 --- a/dedoc/data_structures/table.py +++ b/dedoc/data_structures/table.py @@ -16,8 +16,8 @@ class Table(Serializable): """ def __init__(self, cells: List[List[CellWithMeta]], metadata: TableMetadata) -> None: """ - :param cells: a list of lists of cells (cell has text, colspan and rowspan attributes). - :param metadata: some table metadata, as location, size and so on. + :param cells: a list of lists of cells (cell has text, colspan and rowspan attributes) + :param metadata: some table metadata as location, size and so on """ self.metadata = metadata self.cells = cells diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index 0ea8d1c3..3c9ea615 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -9,15 +9,14 @@ class TableMetadata(Serializable): """ - This class holds the information about the table location in the document and information about cell properties. + This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on. 
""" def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False, rotated_angle: float = 0.0) -> None: """ :param page_id: number of the page where table starts :param uid: unique identifier of the table :param is_inserted: indicator if table was already inserted into paragraphs list - :param rotated_angle: the value of the rotation angle by which the table was rotated during recognition. Extracted boxes from a table will need to - be rotated by this angle. + :param rotated_angle: value of the rotation angle by which the table was rotated during recognition """ self.page_id = page_id self.uid = str(uuid.uuid1()) if not uid else uid diff --git a/docs/source/_static/code_examples/dedoc_usage_tutorial.py b/docs/source/_static/code_examples/dedoc_usage_tutorial.py index 176c7fe5..3e0df596 100644 --- a/docs/source/_static/code_examples/dedoc_usage_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_usage_tutorial.py @@ -48,13 +48,13 @@ document.lines[3].annotations[7] # Italic(6:12, True) document.lines[3].annotations[8] # Size(14:19, 10.0) -document.tables[0].cells[0][0].get_text() # N -document.tables[0].cells[1][3].get_text() # Cell3 -document.tables[1].cells[3][0].get_text() # 'Text 3' +cell = document.tables[0].cells[0][0] +cell # CellWithMeta(N) +cell.get_text() # N +cell.rowspan, cell.colspan, cell.invisible # (1, 1, False) document.tables[0].metadata.uid # f2f08354fc2dbcb5ded8885479f498a6 -document.tables[0].cells[0][0].colspan # 1 -document.tables[0].cells[0][0].rowspan # 1 -document.tables[0].cells[0][0].invisible # False +document.tables[0].metadata.page_id # None +document.tables[0].metadata.rotated_angle # 0.0 document.tables[1].cells[0][0].invisible # False document.tables[1].cells[0][1].invisible # True document.tables[1].cells[0][0].colspan # 2 diff --git a/docs/source/_static/json_format_examples/basic_example.json b/docs/source/_static/json_format_examples/basic_example.json index fafb554a..a470920e 100644 --- 
a/docs/source/_static/json_format_examples/basic_example.json +++ b/docs/source/_static/json_format_examples/basic_example.json @@ -1,5 +1,5 @@ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -298,7 +298,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "image1.png" + "value": "attach_fa1143ae-5d3c-11ee-b518-0242ac120002" } ], "metadata": { @@ -321,87 +321,119 @@ { "cells": [ [ - "Table header", - "Table header" + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 2, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + } ], [ - "Vertically merged cells", - "Text 1" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 2, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 1", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Vertically merged cells", - "Text 2" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + }, + { + "lines": [ + { + "text": "Text 2", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Text 3", - "Text 4" + { + "lines": [ + { + "text": "Text 3", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 4", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ] ], "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, "is_inserted": false, - "cell_properties": [ - [ - { - "colspan": 2, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": true - } - ], - [ - { - "colspan": 1, - "rowspan": 2, - "invisible": false - }, - { - "colspan": 
1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": true - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ] - ] + "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": "doc_uid_auto_5cbfdc00-0e90-11ee-8789-4549ad8e7206", + "uid": "doc_uid_auto_fa1f6786-5d3c-11ee-b518-0242ac120002", "file_name": "example_return_format.docx", + "temporary_file_name": "1695822696_268.docx", "size": 21270, - "modified_time": 1687172368, - "created_time": 1687172368, - "access_time": 1687172368, + "modified_time": 1695822696, + "created_time": 1695822696, + "access_time": 1695822696, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -409,8 +441,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null, "other_fields": { "document_subject": "", @@ -419,8 +451,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null } }, diff --git a/docs/source/_static/json_format_examples/linear_structure_type.json b/docs/source/_static/json_format_examples/linear_structure_type.json index c7a971e3..848053fb 100644 --- a/docs/source/_static/json_format_examples/linear_structure_type.json +++ b/docs/source/_static/json_format_examples/linear_structure_type.json @@ -1,5 +1,5 @@ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -388,7 +388,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "image1.png" + "value": "attach_fa23fd78-5d3c-11ee-b518-0242ac120002" } ], "metadata": { 
@@ -405,87 +405,119 @@ { "cells": [ [ - "Table header", - "Table header" + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 2, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + } ], [ - "Vertically merged cells", - "Text 1" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 2, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 1", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Vertically merged cells", - "Text 2" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + }, + { + "lines": [ + { + "text": "Text 2", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Text 3", - "Text 4" + { + "lines": [ + { + "text": "Text 3", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 4", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ] ], "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, "is_inserted": false, - "cell_properties": [ - [ - { - "colspan": 2, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": true - } - ], - [ - { - "colspan": 1, - "rowspan": 2, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": true - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ] - ] + "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": 
"doc_uid_auto_5ce69246-0e90-11ee-8789-4549ad8e7206", + "uid": "doc_uid_auto_fa309d08-5d3c-11ee-b518-0242ac120002", "file_name": "example_return_format.docx", + "temporary_file_name": "1695822697_827.docx", "size": 21270, - "modified_time": 1687172369, - "created_time": 1687172369, - "access_time": 1687172369, + "modified_time": 1695822697, + "created_time": 1695822697, + "access_time": 1695822697, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -493,8 +525,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null, "other_fields": { "document_subject": "", @@ -503,8 +535,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null } }, diff --git a/docs/source/_static/json_format_examples/with_attachments.json b/docs/source/_static/json_format_examples/with_attachments.json index dc7bb9d7..053791ce 100644 --- a/docs/source/_static/json_format_examples/with_attachments.json +++ b/docs/source/_static/json_format_examples/with_attachments.json @@ -1,5 +1,5 @@ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -298,7 +298,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "image1.png" + "value": "attach_fa355abe-5d3c-11ee-b518-0242ac120002" } ], "metadata": { @@ -321,87 +321,119 @@ { "cells": [ [ - "Table header", - "Table header" + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 2, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + } ], [ - "Vertically merged cells", - "Text 1" + { + 
"lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 2, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 1", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Vertically merged cells", - "Text 2" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + }, + { + "lines": [ + { + "text": "Text 2", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Text 3", - "Text 4" + { + "lines": [ + { + "text": "Text 3", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 4", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ] ], "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, "is_inserted": false, - "cell_properties": [ - [ - { - "colspan": 2, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": true - } - ], - [ - { - "colspan": 1, - "rowspan": 2, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": true - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ] - ] + "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": "doc_uid_auto_5d374ec0-0e90-11ee-8789-4549ad8e7206", + "uid": "doc_uid_auto_fa4285e0-5d3c-11ee-b518-0242ac120002", "file_name": "example_return_format.docx", + "temporary_file_name": "1695822697_953.docx", "size": 21270, - "modified_time": 1687172369, - "created_time": 1687172369, - "access_time": 1687172369, + "modified_time": 1695822697, + "created_time": 1695822697, + "access_time": 1695822697, "file_type": 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -409,8 +441,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null, "other_fields": { "document_subject": "", @@ -419,14 +451,14 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null } }, "attachments": [ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -444,12 +476,13 @@ "tables": [] }, "metadata": { - "uid": "attach_5d36badc-0e90-11ee-8789-4549ad8e7206", + "uid": "attach_fa355abe-5d3c-11ee-b518-0242ac120002", "file_name": "image1.png", + "temporary_file_name": "1695822697_181.png", "size": 14874, - "modified_time": 1687172369, - "created_time": 1687172369, - "access_time": 1687172369, + "modified_time": 1695822697, + "created_time": 1695822697, + "access_time": 1695822697, "file_type": "image/png", "other_fields": {} }, diff --git a/docs/source/_static/json_format_examples/with_base64_attachments.json b/docs/source/_static/json_format_examples/with_base64_attachments.json index afdf1f5f..4fa1a2d3 100644 --- a/docs/source/_static/json_format_examples/with_base64_attachments.json +++ b/docs/source/_static/json_format_examples/with_base64_attachments.json @@ -1,5 +1,5 @@ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -298,7 +298,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "image1.png" + "value": "attach_fa48dd8c-5d3c-11ee-b518-0242ac120002" } ], "metadata": { @@ -321,87 +321,119 @@ { "cells": [ [ - "Table header", - "Table header" + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 2, + "rowspan": 1, 
+ "invisible": false + }, + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + } ], [ - "Vertically merged cells", - "Text 1" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 2, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 1", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Vertically merged cells", - "Text 2" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + }, + { + "lines": [ + { + "text": "Text 2", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Text 3", - "Text 4" + { + "lines": [ + { + "text": "Text 3", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 4", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ] ], "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, "is_inserted": false, - "cell_properties": [ - [ - { - "colspan": 2, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": true - } - ], - [ - { - "colspan": 1, - "rowspan": 2, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": true - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ] - ] + "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": "doc_uid_auto_5d83dc9a-0e90-11ee-8789-4549ad8e7206", + "uid": "doc_uid_auto_fa562866-5d3c-11ee-b518-0242ac120002", "file_name": "example_return_format.docx", + "temporary_file_name": "1695822697_293.docx", "size": 
21270, - "modified_time": 1687172370, - "created_time": 1687172370, - "access_time": 1687172370, + "modified_time": 1695822697, + "created_time": 1695822697, + "access_time": 1695822697, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -409,8 +441,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null, "other_fields": { "document_subject": "", @@ -419,14 +451,14 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null } }, "attachments": [ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -444,12 +476,13 @@ "tables": [] }, "metadata": { - "uid": "attach_5d834af0-0e90-11ee-8789-4549ad8e7206", + "uid": "attach_fa48dd8c-5d3c-11ee-b518-0242ac120002", "file_name": "image1.png", + "temporary_file_name": "1695822697_915.png", "size": 14874, - "modified_time": 1687172370, - "created_time": 1687172370, - "access_time": 1687172370, + "modified_time": 1695822697, + "created_time": 1695822697, + "access_time": 1695822697, "file_type": "image/png", "base64_encode": "", "other_fields": { diff --git a/docs/source/_static/json_format_examples/with_inserted_table.json b/docs/source/_static/json_format_examples/with_inserted_table.json index c49e4316..9262be2b 100644 --- a/docs/source/_static/json_format_examples/with_inserted_table.json +++ b/docs/source/_static/json_format_examples/with_inserted_table.json @@ -1,5 +1,5 @@ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -491,7 +491,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "image1.png" + "value": 
"attach_fce0c064-5d3c-11ee-b518-0242ac120002" } ], "metadata": { @@ -514,87 +514,119 @@ { "cells": [ [ - "Table header", - "Table header" + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 2, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + } ], [ - "Vertically merged cells", - "Text 1" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 2, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 1", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Vertically merged cells", - "Text 2" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + }, + { + "lines": [ + { + "text": "Text 2", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Text 3", - "Text 4" + { + "lines": [ + { + "text": "Text 3", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 4", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ] ], "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, "is_inserted": true, - "cell_properties": [ - [ - { - "colspan": 2, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": true - } - ], - [ - { - "colspan": 1, - "rowspan": 2, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": true - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ] - ] + "rotated_angle": 0.0 } } 
] }, "metadata": { - "uid": "doc_uid_auto_5fc70b1c-0e90-11ee-8789-4549ad8e7206", + "uid": "doc_uid_auto_fcedf964-5d3c-11ee-b518-0242ac120002", "file_name": "example_return_format.docx", + "temporary_file_name": "1695822701_99.docx", "size": 21270, - "modified_time": 1687172373, - "created_time": 1687172373, - "access_time": 1687172373, + "modified_time": 1695822701, + "created_time": 1695822701, + "access_time": 1695822701, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -602,8 +634,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null, "other_fields": { "document_subject": "", @@ -612,8 +644,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null } }, diff --git a/docs/source/_static/json_format_examples/with_parsed_attachments.json b/docs/source/_static/json_format_examples/with_parsed_attachments.json index 8f404c96..2c519c3c 100644 --- a/docs/source/_static/json_format_examples/with_parsed_attachments.json +++ b/docs/source/_static/json_format_examples/with_parsed_attachments.json @@ -1,5 +1,5 @@ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -298,7 +298,7 @@ "start": 0, "end": 14, "name": "attachment", - "value": "image1.png" + "value": "attach_fa5c54ac-5d3c-11ee-b518-0242ac120002" } ], "metadata": { @@ -321,87 +321,119 @@ { "cells": [ [ - "Table header", - "Table header" + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 2, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Table header", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true 
+ } ], [ - "Vertically merged cells", - "Text 1" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 2, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 1", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Vertically merged cells", - "Text 2" + { + "lines": [ + { + "text": "Vertically merged cells", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": true + }, + { + "lines": [ + { + "text": "Text 2", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ], [ - "Text 3", - "Text 4" + { + "lines": [ + { + "text": "Text 3", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + }, + { + "lines": [ + { + "text": "Text 4", + "annotations": [] + } + ], + "colspan": 1, + "rowspan": 1, + "invisible": false + } ] ], "metadata": { "uid": "3a327789721e09b3fa6fd9560f3ee263", "page_id": null, "is_inserted": false, - "cell_properties": [ - [ - { - "colspan": 2, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": true - } - ], - [ - { - "colspan": 1, - "rowspan": 2, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": true - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ], - [ - { - "colspan": 1, - "rowspan": 1, - "invisible": false - }, - { - "colspan": 1, - "rowspan": 1, - "invisible": false - } - ] - ] + "rotated_angle": 0.0 } } ] }, "metadata": { - "uid": "doc_uid_auto_5dc67f28-0e90-11ee-8789-4549ad8e7206", + "uid": "doc_uid_auto_fa7fdbc0-5d3c-11ee-b518-0242ac120002", "file_name": "example_return_format.docx", + "temporary_file_name": "1695822697_469.docx", "size": 21270, - "modified_time": 1687172370, - "created_time": 1687172370, - "access_time": 1687172370, + "modified_time": 1695822697, + "created_time": 1695822697, 
+ "access_time": 1695822697, "file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "document_subject": "", "keywords": "", @@ -409,8 +441,8 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null, "other_fields": { "document_subject": "", @@ -419,14 +451,14 @@ "comments": "", "author": "", "last_modified_by": "", - "created_date": 1568736411, - "modified_date": 1686923436, + "created_date": 1568725611, + "modified_date": 1686912636, "last_printed_date": null } }, "attachments": [ { - "version": "2023.05.26", + "version": "0.11.2", "warnings": [], "content": { "structure": { @@ -446,15 +478,57 @@ "annotations": [ { "start": 0, - "end": 27, + "end": 3, + "name": "confidence", + "value": "0.96" + }, + { + "start": 10, + "end": 17, + "name": "confidence", + "value": "0.96" + }, + { + "start": 0, + "end": 3, + "name": "bounding box", + "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.27358490566037735, \"width\": 0.0527086383601757, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 4, + "end": 9, + "name": "confidence", + "value": "0.95" + }, + { + "start": 4, + "end": 9, + "name": "bounding box", + "value": "{\"x_top_left\": 0.11566617862371889, \"y_top_left\": 0.27358490566037735, \"width\": 0.09077598828696926, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 10, + "end": 17, + "name": "bounding box", + "value": "{\"x_top_left\": 0.212298682284041, \"y_top_left\": 0.27358490566037735, \"width\": 0.11859443631039532, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 18, + "end": 26, + "name": "confidence", + "value": "0.77" + }, + { + "start": 18, + "end": 26, "name": "bounding box", - "value": "{\"x_top_left\": 37, \"y_top_left\": 
29, \"width\": 304, \"height\": 13}" + "value": "{\"x_top_left\": 0.3396778916544656, \"y_top_left\": 0.27358490566037735, \"width\": 0.1595900439238653, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" }, { "start": 0, "end": 27, - "name": "bold", - "value": "True" + "name": "bounding box", + "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.27358490566037735, \"width\": 0.445095168374817, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" }, { "start": 0, @@ -498,11 +572,95 @@ "node_id": "0.1.0", "text": "1) Fisrst item with some english text\n", "annotations": [ + { + "start": 0, + "end": 2, + "name": "confidence", + "value": "0.93" + }, + { + "start": 0, + "end": 2, + "name": "bounding box", + "value": "{\"x_top_left\": 0.05710102489019034, \"y_top_left\": 0.4811320754716981, \"width\": 0.020497803806734993, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 3, + "end": 9, + "name": "confidence", + "value": "0.81" + }, + { + "start": 3, + "end": 9, + "name": "bounding box", + "value": "{\"x_top_left\": 0.08345534407027819, \"y_top_left\": 0.4811320754716981, \"width\": 0.0629575402635432, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 10, + "end": 14, + "name": "confidence", + "value": "0.96" + }, + { + "start": 15, + "end": 19, + "name": "confidence", + "value": "0.96" + }, + { + "start": 20, + "end": 24, + "name": "confidence", + "value": "0.96" + }, + { + "start": 25, + "end": 32, + "name": "confidence", + "value": "0.96" + }, + { + "start": 33, + "end": 37, + "name": "confidence", + "value": "0.96" + }, + { + "start": 10, + "end": 14, + "name": "bounding box", + "value": "{\"x_top_left\": 0.15373352855051245, \"y_top_left\": 0.4811320754716981, \"width\": 0.04978038067349927, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 15, + "end": 19, + 
"name": "bounding box", + "value": "{\"x_top_left\": 0.2108345534407028, \"y_top_left\": 0.4811320754716981, \"width\": 0.048316251830161056, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 20, + "end": 24, + "name": "bounding box", + "value": "{\"x_top_left\": 0.2679355783308931, \"y_top_left\": 0.5188679245283019, \"width\": 0.05856515373352855, \"height\": 0.08490566037735849, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 25, + "end": 32, + "name": "bounding box", + "value": "{\"x_top_left\": 0.33382137628111275, \"y_top_left\": 0.4811320754716981, \"width\": 0.07906295754026355, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 33, + "end": 37, + "name": "bounding box", + "value": "{\"x_top_left\": 0.4216691068814056, \"y_top_left\": 0.5, \"width\": 0.0424597364568082, \"height\": 0.10377358490566038, \"page_width\": 683, \"page_height\": 106}" + }, { "start": 0, "end": 38, "name": "bounding box", - "value": "{\"x_top_left\": 39, \"y_top_left\": 51, \"width\": 278, \"height\": 17}" + "value": "{\"x_top_left\": 0.05710102489019034, \"y_top_left\": 0.4811320754716981, \"width\": 0.40702781844802344, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" }, { "start": 0, @@ -535,11 +693,191 @@ "node_id": "0.1.1", "text": "2) Second item with some even more english text. 
Let me speak from my heart\n", "annotations": [ + { + "start": 0, + "end": 2, + "name": "confidence", + "value": "0.94" + }, + { + "start": 0, + "end": 2, + "name": "bounding box", + "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.6981132075471698, \"width\": 0.02342606149341142, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 3, + "end": 9, + "name": "confidence", + "value": "0.96" + }, + { + "start": 10, + "end": 14, + "name": "confidence", + "value": "0.96" + }, + { + "start": 15, + "end": 19, + "name": "confidence", + "value": "0.96" + }, + { + "start": 20, + "end": 24, + "name": "confidence", + "value": "0.96" + }, + { + "start": 25, + "end": 29, + "name": "confidence", + "value": "0.96" + }, + { + "start": 30, + "end": 34, + "name": "confidence", + "value": "0.96" + }, + { + "start": 35, + "end": 42, + "name": "confidence", + "value": "0.96" + }, + { + "start": 43, + "end": 48, + "name": "confidence", + "value": "0.96" + }, + { + "start": 53, + "end": 55, + "name": "confidence", + "value": "0.96" + }, + { + "start": 56, + "end": 61, + "name": "confidence", + "value": "0.96" + }, + { + "start": 62, + "end": 66, + "name": "confidence", + "value": "0.96" + }, + { + "start": 67, + "end": 69, + "name": "confidence", + "value": "0.96" + }, + { + "start": 70, + "end": 75, + "name": "confidence", + "value": "0.96" + }, + { + "start": 3, + "end": 9, + "name": "bounding box", + "value": "{\"x_top_left\": 0.0849194729136164, \"y_top_left\": 0.6981132075471698, \"width\": 0.08052708638360176, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 10, + "end": 14, + "name": "bounding box", + "value": "{\"x_top_left\": 0.17423133235724744, \"y_top_left\": 0.6981132075471698, \"width\": 0.04978038067349927, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 15, + "end": 19, + "name": "bounding box", + "value": 
"{\"x_top_left\": 0.23133235724743778, \"y_top_left\": 0.6981132075471698, \"width\": 0.048316251830161056, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 20, + "end": 24, + "name": "bounding box", + "value": "{\"x_top_left\": 0.2884333821376281, \"y_top_left\": 0.7358490566037735, \"width\": 0.05856515373352855, \"height\": 0.08490566037735849, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 25, + "end": 29, + "name": "bounding box", + "value": "{\"x_top_left\": 0.35431918008784774, \"y_top_left\": 0.7358490566037735, \"width\": 0.05124450951683748, \"height\": 0.08490566037735849, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 30, + "end": 34, + "name": "bounding box", + "value": "{\"x_top_left\": 0.4143484626647145, \"y_top_left\": 0.7358490566037735, \"width\": 0.05710102489019034, \"height\": 0.08490566037735849, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 35, + "end": 42, + "name": "bounding box", + "value": "{\"x_top_left\": 0.4787701317715959, \"y_top_left\": 0.6981132075471698, \"width\": 0.08052708638360176, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 43, + "end": 48, + "name": "bounding box", + "value": "{\"x_top_left\": 0.5666178623718887, \"y_top_left\": 0.7169811320754716, \"width\": 0.048316251830161056, \"height\": 0.10377358490566038, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 49, + "end": 52, + "name": "confidence", + "value": "0.97" + }, + { + "start": 49, + "end": 52, + "name": "bounding box", + "value": "{\"x_top_left\": 0.623718887262079, \"y_top_left\": 0.6981132075471698, \"width\": 0.036603221083455345, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 53, + "end": 55, + "name": "bounding box", + "value": "{\"x_top_left\": 0.6676427525622255, \"y_top_left\": 0.7358490566037735, \"width\": 0.03367496339677892, \"height\": 
0.08490566037735849, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 56, + "end": 61, + "name": "bounding box", + "value": "{\"x_top_left\": 0.7086383601756955, \"y_top_left\": 0.6981132075471698, \"width\": 0.06442166910688141, \"height\": 0.16037735849056603, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 62, + "end": 66, + "name": "bounding box", + "value": "{\"x_top_left\": 0.780380673499268, \"y_top_left\": 0.6886792452830188, \"width\": 0.05417276720351391, \"height\": 0.1320754716981132, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 67, + "end": 69, + "name": "bounding box", + "value": "{\"x_top_left\": 0.8418740849194729, \"y_top_left\": 0.7358490566037735, \"width\": 0.03513909224011713, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, + { + "start": 70, + "end": 75, + "name": "bounding box", + "value": "{\"x_top_left\": 0.8843338213762811, \"y_top_left\": 0.6981132075471698, \"width\": 0.055636896046852125, \"height\": 0.12264150943396226, \"page_width\": 683, \"page_height\": 106}" + }, { "start": 0, "end": 76, "name": "bounding box", - "value": "{\"x_top_left\": 37, \"y_top_left\": 73, \"width\": 605, \"height\": 18}" + "value": "{\"x_top_left\": 0.05417276720351391, \"y_top_left\": 0.6886792452830188, \"width\": 0.8857979502196194, \"height\": 0.16981132075471697, \"page_width\": 683, \"page_height\": 106}" }, { "start": 0, @@ -575,14 +913,22 @@ "tables": [] }, "metadata": { - "uid": "attach_5dc5cb0a-0e90-11ee-8789-4549ad8e7206", + "uid": "attach_fa5c54ac-5d3c-11ee-b518-0242ac120002", "file_name": "image1.png", + "temporary_file_name": "1695822697_301.png", "size": 14874, - "modified_time": 1687172370, - "created_time": 1687172370, - "access_time": 1687172370, + "modified_time": 1695822697, + "created_time": 1695822697, + "access_time": 1695822697, "file_type": "image/png", - "other_fields": {} + "rotated_page_angles": [ + 0 + ], + "other_fields": { + 
"rotated_page_angles": [ + 0 + ] + } }, "attachments": [] } diff --git a/docs/source/dedoc_api_usage/return_format.rst b/docs/source/dedoc_api_usage/return_format.rst index 90c87789..a75aa65f 100644 --- a/docs/source/dedoc_api_usage/return_format.rst +++ b/docs/source/dedoc_api_usage/return_format.rst @@ -58,19 +58,19 @@ The beginning of the document's tables: .. literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 320-350 + :lines: 320-346 The beginning of the document's metadata: .. literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 398-405 + :lines: 429-437 The document's attachments: .. literalinclude:: ../_static/json_format_examples/basic_example.json :language: json - :lines: 427 + :lines: 459 As we see, the `attachments` field is empty because the option `with_attachments` is set to `"false"` by default (see :ref:`table_parameters`). @@ -118,7 +118,7 @@ Unlike the previous examples, in this case we have `attachments` field filled: .. literalinclude:: ../_static/json_format_examples/with_attachments.json :language: json - :lines: 427-458 + :lines: 459-491 Example with base64 attachments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -136,7 +136,7 @@ The only difference is in the attachment's metadata: attachment's content is enc .. literalinclude:: ../_static/json_format_examples/with_base64_attachments.json :language: json - :lines: 427-461 + :lines: 459-494 Example with parsed attachments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -156,7 +156,7 @@ The beginning of the document's attachments: .. 
literalinclude:: ../_static/json_format_examples/with_parsed_attachments.json :language: json - :lines: 427-452 + :lines: 459-484 Example with inserted table diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst index 88a80f26..57329ce9 100644 --- a/docs/source/getting_started/usage.rst +++ b/docs/source/getting_started/usage.rst @@ -122,20 +122,20 @@ Document tables The attribute `tables` in the :class:`dedoc.data_structures.UnstructuredDocument` is a list of :class:`dedoc.data_structures.Table`. -Each table is represented as a list of table rows, each row is a list of strings with cells text. +Each table is represented as a list of table rows, each row is a list of cells with additional metadata :class:`dedoc.data_structures.CellWithMeta`. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 51-53 + :lines: 51-54 -It also has metadata, containing table's unique identifier, cells properties (information about rowspan and colspan). +It also has metadata, containing table's unique identifier, rotation angle (if table has been rotated - for images) and so on. .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 54-57 + :lines: 55-57 All tables have rectangular form, so if the cells are merged, in the intermediate representation they aren't and have the same contents. -Use cells properties for getting information about merged cells. +Use cells metadata for getting information about merged cells. .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python diff --git a/examples/README.md b/examples/README.md index 39938547..09b8ca55 100644 --- a/examples/README.md +++ b/examples/README.md @@ -7,25 +7,24 @@ This is the easiest way, since this class automatically determines the format of As shown in corresponding examples, you can create this manager with following lines: ``` -from dedoc.config import get_config from dedoc import DedocManager -manager = DedocManager(config=get_config()) +manager = DedocManager() ``` And after that you can get parsed document with one simple line, just replace `"your_file_name"` with the path to your chosen file: ``` parsed_document = manager.parse(file_path="your_file_name") ``` -To get more information, look at Dedoc usage tutorial. +To get more information, look at [Dedoc usage tutorial](https://dedoc.readthedocs.io/en/latest/getting_started/usage.html). -If you want to call a specific parser, you can look at some examples in this directory. File `example_doc_parser.py` shows how you can use `DocxReader`, -`example_pdf_parser.py` shows examples with PDF file parsing. In order to parse img-like file you can call `PdfImageReader` like it's shown in +If you want to call a specific parser, you can look at some examples in this directory. File `example_doc_parser.py` shows how to use `DocxReader`, +`example_pdf_parser.py` shows examples with PDF file parsing. In order to parse image-like file you can call `PdfImageReader` like it's shown in `example_img_parser.py`. -Also you can look at the example of using a post-request to parse documents while Dedoc container is working. This example is written in `example_post.py`. +Also, you can look at the example of using a post-request to parse documents while Dedoc container is working. This example is written in `example_post.py`. 
You can check an example like this: ```bash cd examples -python create_structured_document.py +python3 create_structured_document.py ``` \ No newline at end of file diff --git a/examples/create_structured_document.py b/examples/create_structured_document.py index 0a4d468d..907df01a 100644 --- a/examples/create_structured_document.py +++ b/examples/create_structured_document.py @@ -5,7 +5,6 @@ # to create structured document you can use TreeConstructor and apply it to unstructured document # in this example we'll use unstructured_document from create_unstructured_document.py structure_constructor = TreeConstructor() -parsed_document = structure_constructor.structure_document(document=unstructured_document, - structure_type="tree") +parsed_document = structure_constructor.structure_document(document=unstructured_document, structure_type="tree") print(parsed_document.to_dict()) diff --git a/examples/create_unstructured_document.py b/examples/create_unstructured_document.py index eb75f81a..0da38dc9 100644 --- a/examples/create_unstructured_document.py +++ b/examples/create_unstructured_document.py @@ -1,10 +1,10 @@ # noqa -# in this example we create UnstructuredDocument, lets construct document corresponding to example.docx +# in this example we create UnstructuredDocument, let's construct document corresponding to example.docx from dedoc.data_structures import LineMetadata, Table, UnstructuredDocument from dedoc.data_structures import TableMetadata from dedoc.data_structures import LineWithMeta -# First of all lets create some table, table consist of cells (list of rows, and row is a list of strings +# First of all let's create some table, table consists of cells (list of rows, and row is a list of cells with metadata) from dedoc.data_structures import HierarchyLevel from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.metadata_extractors import BaseMetadataExtractor @@ -14,56 +14,42 @@ ["N", "Second name", "Name", "Organization", "Phone", 
"Notes"], ["1", "Ivanov", "Ivan", "ISP RAS", "8-800"], ] -cell_with_meta = [[CellWithMeta(lines=[LineWithMeta(line=cell_text, - metadata=LineMetadata(page_id=0, - line_id=None), - annotations=[])]) for cell_text in row] for row in table_cells] -# table also has some metadata, lets assume that our table is on first page +cells_with_meta = [[CellWithMeta(lines=[LineWithMeta(line=cell_text, + metadata=LineMetadata(page_id=0, line_id=None), + annotations=[])]) for cell_text in row] for row in table_cells] +# table also has some metadata, let's assume that our table is on the first page table_metadata = TableMetadata(page_id=0, uid="table 1") # let's build table -table = Table(cells=cell_with_meta, metadata=table_metadata) +table = Table(cells=cells_with_meta, metadata=table_metadata) # Documents also contain some text. -# Logical structure of document may be represented by tree (see example_tree.png) -# but unstructured document consist of flat list of lines with text and metadata +# Logical structure of document may be represented by tree (see example_tree.png) +# but unstructured document consists of flat list of lines with text and metadata # hierarchy structure hidden in HierarchyLevel attribute of LineWithMeta -# lets build firs line, it is document tree root: +# let's build first line, it is document tree root: -# hierarchy level define position of this line in document tree. +# hierarchy level defines position of this line in a document tree. 
hierarchy_level = HierarchyLevel( # most important parameters of HierarchyLevel is level_1 and level_2 # hierarchy level compares by tuple (level_1, level_2) lesser -> closer to the root of the tree level_1=0, level_2=0, - # can_be_multiline and paragraph_type - some parts of the document (for example title) may take more - # than one line - # if can_be_multiline is true than several lines in a row with same level_1, level_2 and paragraph_type - # will be merged in one tree node + # can_be_multiline and line_type - some parts of the document (for example title) may take more than one line + # if can_be_multiline is true then several lines in a row with same level_1, level_2 and line_type will be merged in one tree node can_be_multiline=True, line_type="header" ) text = "DOCUMENT TITLE" -metadata = LineMetadata(page_id=0, - line_id=1, - tag_hierarchy_level=None, - hierarchy_level=hierarchy_level, - other_fields=None) +metadata = LineMetadata(page_id=0, line_id=1, tag_hierarchy_level=None, hierarchy_level=hierarchy_level, other_fields=None) -# Annotations: one may specify some information about some part of the text, for example that some word -# written in italic font. +# Annotations: one may specify some information about some part of the text, for example that some word is written in italic font. annotations = [] -line1 = LineWithMeta( - line=text, - metadata=metadata, - annotations=annotations -) +line1 = LineWithMeta(line=text, metadata=metadata, annotations=annotations) -unstructured_document = UnstructuredDocument(tables=[table], - lines=[line1], - attachments=[]) +unstructured_document = UnstructuredDocument(tables=[table], lines=[line1], attachments=[]) # I hope you understand some concepts of the LineWithMeta, but you may ask why it need level_1 and level_2 # parameters. Why is only level_1 not enough.
Imagine that we have lists like these: diff --git a/examples/example_doc_parser.py b/examples/example_doc_parser.py index a30960eb..bb1cc9f8 100644 --- a/examples/example_doc_parser.py +++ b/examples/example_doc_parser.py @@ -10,25 +10,25 @@ # we get unstructured file with lines and tables unstructured_document = docx_reader.read(path=file_name, document_type="example") -# let's look at content of unstructured_file, it consists of tables and lines +# let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) -# first of all lets look at the table +# first of all let's look at the table table = unstructured_document.tables[0] # table consists of cells (we assume that table is rectangle) -# so cells is list of rows and row is list of strings +# so cells is a list of rows and row is a list of cells with metadata for row in table.cells: for cell in row: print(cell.get_text().replace("\n", "\t") + " ", end="") print("\n") -# there is also some metadata in table +# there is also some metadata in the table print(table.metadata) -# and now lets look at lines. lines it is list of object of class LineWithMeta +# and now let's look at lines.
lines is a list of objects of class LineWithMeta lines = unstructured_document.lines -# let's look at first line +# let's look at the first line line = lines[0] print(line) -# line consist of line (text), metadata, hierarchy level +# line consists of line (text), metadata, hierarchy level print(line.line, line.metadata, line.metadata.hierarchy_level) diff --git a/examples/example_img_parser.py b/examples/example_img_parser.py index 589d651c..3f136cf1 100644 --- a/examples/example_img_parser.py +++ b/examples/example_img_parser.py @@ -11,21 +11,21 @@ # we get unstructured file with lines and tables unstructured_document = img_reader.read(path=file_name, document_type="example") -# let's look at content of unstructured_file, it consists of tables and lines +# let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) -# first of all lets look at the table +# first of all let's look at the table table = unstructured_document.tables[0] # table consists of cells (we assume that table is rectangle) -# so cells is list of rows and row is list of strings +# so cells is a list of rows and row is a list of cells with metadata print(table.cells) -# there is also some metadata in table +# there is also some metadata in the table print(table.metadata) -# and now lets look at lines. lines it is list of object of class LineWithMeta +# and now let's look at lines. 
lines is a list of objects of class LineWithMeta lines = unstructured_document.lines -# let's look at first line +# let's look at the first line line = lines[0] print(line) -# line consist of line (text), metadata, hierarchy level +# line consists of line (text), metadata, hierarchy level print(line.line, line.metadata, line.metadata.hierarchy_level) diff --git a/examples/example_manager_input.py b/examples/example_manager_input.py index bd5a1b41..afb9bfff 100644 --- a/examples/example_manager_input.py +++ b/examples/example_manager_input.py @@ -2,9 +2,8 @@ import json from dedoc import DedocManager -from dedoc.config import get_config -manager = DedocManager(config=get_config()) +manager = DedocManager() filename_docx = "example.docx" parsed_docx_document = manager.parse(file_path=filename_docx, parameters={}) diff --git a/examples/example_pdf_parser.py b/examples/example_pdf_parser.py index 5d7507b8..36568546 100644 --- a/examples/example_pdf_parser.py +++ b/examples/example_pdf_parser.py @@ -11,23 +11,23 @@ # we get unstructured file with lines and tables unstructured_document = pdf_txt_layer_reader.read(path=file_name, document_type="example") -# let's look at content of unstructured_file, it consists of tables and lines +# let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) -# first of all lets look at the table +# first of all let's look at the table table = unstructured_document.tables[0] # table consists of cells (we assume that table is rectangle) -# so cells is list of rows and row is list of strings +# so cells is a list of rows and row is a list of cells with metadata print(table.cells) -# there is also some metadata in table +# there is also some metadata in the table print(table.metadata) -# and now lets look at lines. lines it is list of object of class LineWithMeta +# and now let's look at lines. 
lines is a list of objects of class LineWithMeta lines = unstructured_document.lines # let's look at first line line = lines[0] print(line) -# line consist of line (text), metadata, hierarchy level +# line consists of line (text), metadata, hierarchy level print(line.line, line.metadata, line.metadata.hierarchy_level) @@ -40,21 +40,21 @@ # we get unstructured file with lines and tables unstructured_document = pdf_image_reader.read(path=file_name, document_type="example") -# let's look at content of unstructured_file, it consists of tables and lines +# let's look at the content of unstructured_file, it consists of tables and lines print(unstructured_document.tables, unstructured_document.lines) # first of all lets look at the table table = unstructured_document.tables[0] # table consists of cells (we assume that table is rectangle) -# so cells is list of rows and row is list of strings +# so cells is a list of rows and row is a list of cells with metadata print(table.cells) -# there is also some metadata in table +# there is also some metadata in the table print(table.metadata) -# and now lets look at lines. lines it is list of object of class LineWithMeta +# and now let's look at lines. lines is a list of objects of class LineWithMeta lines = unstructured_document.lines # let's look at first line line = lines[0] print(line) -# line consist of line (text), metadata, hierarchy level +# line consists of line (text), metadata, hierarchy level print(line.line, line.metadata, line.metadata.hierarchy_level) diff --git a/examples/example_post.py b/examples/example_post.py index 97317c81..1eaf3c61 100644 --- a/examples/example_post.py +++ b/examples/example_post.py @@ -14,9 +14,8 @@ # file we want to parse files = {'file': (file_name, file)} # dict with additional parameters - # to parse pdf with text layer add parameter "pdf_with_text_layer":"true" data = {"document_type": ""} - # and now we send post request with attached file and paremeters.
+ # and now we send post request with attached file and parameters. r = requests.post("http://localhost:1231/upload", files=files, data=data) # wait for response, parse json result and print it result = json.loads(r.content.decode())