diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml index de74c6df..d46ad036 100644 --- a/.github/workflows/test_on_push.yaml +++ b/.github/workflows/test_on_push.yaml @@ -28,7 +28,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: - python-version: '3.8' + python-version: '3.9' - name: Run lint run: | python3 -m pip install --upgrade pip diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index bc23713e..1c260b37 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -1,100 +1,54 @@ -from typing import Any, Optional - -from fastapi import Body -from pydantic import BaseModel - - -class QueryParameters(BaseModel): - document_type: Optional[str] - structure_type: Optional[str] - return_format: Optional[str] - - with_attachments: Optional[str] - need_content_analysis: Optional[str] - recursion_deep_attachments: Optional[str] - return_base64: Optional[str] - attachments_dir: Optional[str] - - need_pdf_table_analysis: Optional[str] - table_type: Optional[str] - orient_analysis_cells: Optional[str] - orient_cell_angle: Optional[str] - - pdf_with_text_layer: Optional[str] - language: Optional[str] - pages: Optional[str] - is_one_column_document: Optional[str] - document_orientation: Optional[str] - need_header_footer_analysis: Optional[str] - need_binarization: Optional[str] - - delimiter: Optional[str] - encoding: Optional[str] - html_fields: Optional[str] - handle_invisible_table: Optional[str] - - def __init__(self, - # type of document structure parsing - document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None), # noqa - structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), # noqa - return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. 
Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), # noqa - - # attachments handling - with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None), # noqa - need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), # noqa - recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), # noqa - return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None), # noqa - attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), # noqa - - # tables handling - need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), # noqa - table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), # noqa - orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), # noqa - orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None), # noqa - - # pdf handling - pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), # noqa - language: Optional[str] = Body(description="a recognition language. 
Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), # noqa - pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None), # noqa - is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None), # noqa - document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None), # noqa - need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None), # noqa - need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None), # noqa - - # other formats handling - delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), # noqa - encoding: Optional[str] = Body(description="a document encoding", default=None), # noqa - html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None), # noqa - handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. 
Default: 'false'", default=None), # noqa - - - **data: Any) -> None: # noqa - - super().__init__(**data) - self.document_type: str = document_type or "" - self.structure_type: str = structure_type or "tree" - self.return_format: str = return_format or "json" - - self.with_attachments: str = with_attachments or "false" - self.need_content_analysis: str = need_content_analysis or "false" - self.recursion_deep_attachments: str = recursion_deep_attachments or "10" - self.return_base64: str = return_base64 or "false" - self.attachments_dir: str = attachments_dir - - self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true" - self.table_type: str = table_type or "" - self.orient_analysis_cells: str = orient_analysis_cells or "false" - self.orient_cell_angle: str = orient_cell_angle or "90" - - self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby" - self.language: str = language or "rus+eng" - self.pages: str = pages or ":" - self.is_one_column_document: str = is_one_column_document or "auto" - self.document_orientation: str = document_orientation or "auto" - self.need_header_footer_analysis: str = need_header_footer_analysis or "false" - self.need_binarization: str = need_binarization or "false" - - self.delimiter: str = delimiter - self.encoding: str = encoding - self.html_fields: str = html_fields or "" - self.handle_invisible_table: str = handle_invisible_table or "false" +from dataclasses import asdict, dataclass +from typing import Optional + +from fastapi import Form + + +@dataclass +class QueryParameters: + # type of document structure parsing + document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain") + structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type") + return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"], + description="Response representation, most types (except json) are used for 
debug purposes only") + + # attachments handling + with_attachments: str = Form("false", enum=["true", "false"], description="Enable attached files extraction") + need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files") + recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true") + return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format") + attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments") + + # tables handling + need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf") + table_type: str = Form("", description="Pipeline mode for table recognition") + orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers") + orient_cell_angle: str = Form("90", enum=["90", "270"], + description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation') + + # pdf handling + pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], + description="Extract text from a text layer of PDF or using OCR methods for image-like documents") + language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language") + pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right') + is_one_column_document: str = Form("auto", enum=["auto", "true", "false"], + description='One or multiple column document, "auto" - predict number of page columns automatically') + document_orientation: str = Form("auto", enum=["auto", "no_change"], + description='Orientation of the document pages, "auto" - predict 
orientation (0, 90, 180, 270 degrees), ' + '"no_change" - set vertical orientation of the document without using an orientation classifier') + need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result") + need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)") + + # other formats handling + delimiter: Optional[str] = Form(None, description="Column separator for CSV files") + encoding: Optional[str] = Form(None, description="Document encoding") + html_fields: str = Form("", description="List of fields for JSON documents to be parsed as HTML documents") + handle_invisible_table: str = Form("false", enum=["true", "false"], description="Handle tables without visible borders as tables in HTML") + + def to_dict(self) -> dict: + parameters = {} + + for parameter_name, parameter_value in asdict(self).items(): + parameters[parameter_name] = getattr(parameter_value, "default", parameter_value) + + return parameters diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index 7e295c7a..f2dcc520 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -1,6 +1,9 @@ +import dataclasses import importlib +import json import os import tempfile +from typing import Optional import uvicorn from fastapi import Depends, FastAPI, File, Request, Response, UploadFile @@ -19,11 +22,11 @@ config = get_config() PORT = config["api_port"] -static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static/") +static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "web") static_files_dirs = config.get("static_files_dirs") app = FastAPI() -app.mount("/static", StaticFiles(directory=config.get("static_path", static_path)), name="static") +app.mount("/web", StaticFiles(directory=config.get("static_path", static_path)), name="web") module_api_args = 
importlib.import_module(config["import_path_init_api_args"]) logger = config["logger"] @@ -36,14 +39,12 @@ def get_info() -> Response: Root URL "/" is need start with simple Flask before rest-plus. API otherwise you will get 404 Error. It is bug of rest-plus lib. """ - return FileResponse(os.path.join(static_path, "html_eng/info.html")) + return FileResponse(os.path.join(static_path, "index.html")) @app.get("/static_file") def get_static_file(request: Request) -> Response: path = _get_static_file_path(request) - # TODO check as_attachment - as_attachment = request.query_params.get("as_attachment") == "true" # noqa return FileResponse(path) @@ -61,13 +62,10 @@ def _get_static_file_path(request: Request) -> str: @app.post("/upload") async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa - parameters = query_params.dict(by_alias=True) - + parameters = dataclasses.asdict(query_params) if not file or file.filename == "": raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__) - # check if the post request_post has the file part - logger.info(f"Get file {file.filename} with parameters {parameters}") with tempfile.TemporaryDirectory() as tmpdir: file_path = save_upload_file(file, tmpdir) document_tree = manager.parse(file_path, parameters=dict(parameters)) @@ -75,21 +73,34 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D return_format = str(parameters.get("return_format", "json")).lower() if return_format == "html": html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0) - return HTMLResponse(content=html_content, status_code=200) + return HTMLResponse(content=html_content) elif return_format == "plain_text": txt_content = json2txt(paragraph=document_tree.content.structure) - return PlainTextResponse(content=txt_content, status_code=200) + return 
PlainTextResponse(content=txt_content) elif return_format == "tree": html_content = json2tree(paragraph=document_tree.content.structure) - return HTMLResponse(content=html_content, status_code=200) + return HTMLResponse(content=html_content) elif return_format == "ujson": - return UJSONResponse(content=document_tree.to_dict(), status_code=200) - elif str(parameters.get("return_format", "json")).lower() == "collapsed_tree": + return UJSONResponse(content=document_tree.to_dict()) + elif return_format == "collapsed_tree": html_content = json2collapsed_tree(paragraph=document_tree.content.structure) - return HTMLResponse(content=html_content, status_code=200) + return HTMLResponse(content=html_content) + elif return_format == "pretty_json": + return PlainTextResponse(content=json.dumps(document_tree.to_dict(), ensure_ascii=False, indent=2)) else: logger.info(f"Send result. File {file.filename} with parameters {parameters}") - return ORJSONResponse(content=document_tree.to_dict(), status_code=200) + return ORJSONResponse(content=document_tree.to_dict()) + + +@app.get("/upload_example") +async def upload_example(file_name: str, return_format: Optional[str] = None) -> Response: + file_path = os.path.join(static_path, "examples", file_name) + parameters = {} if return_format is None else {"return_format": return_format} + document_tree = manager.parse(file_path, parameters=parameters) + + if return_format == "html": + return HTMLResponse(content=json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)) + return ORJSONResponse(content=document_tree.to_dict(), status_code=200) @app.exception_handler(DedocError) diff --git a/dedoc/api/static/books_2.csv b/dedoc/api/static/books_2.csv deleted file mode 100644 index 7fdf8f1a..00000000 --- a/dedoc/api/static/books_2.csv +++ /dev/null @@ -1,11 +0,0 @@ -id,cat,name,price,inStock,author,series_t,sequence_i,genre_s -0553573403,book,"A Game of Throne, kings and other 
stuff",7.99,True,George R.R. Martin,A Song of Ice and Fire,1,fantasy -0553579908,book,"A Clash of ""Kings""",7.99,True,George R.R. Martin,A Song of Ice and Fire,2,fantasy -055357342X,book,A Storm of Swords,7.99,True,George R.R. Martin,A Song of Ice and Fire,3,fantasy -0553293354,book,Foundation,7.99,True,Isaac Asimov,Foundation Novels,1,scifi -0812521390,book,The Black Company,6.99,False,Glen Cook,The Chronicles of The Black Company,1,fantasy -0812550706,book,Ender's Game,6.99,True,Orson Scott Card,Ender,1,scifi -0441385532,book,Jhereg,7.95,False,Steven Brust,Vlad Taltos,1,fantasy -0380014300,book,Nine Princes In Amber,6.99,True,Roger Zelazny,the Chronicles of Amber,1,fantasy -0805080481,book,The Book of Three,5.99,True,Lloyd Alexander,The Chronicles of Prydain,1,fantasy -080508049X,book,The Black Cauldron,5.99,True,Lloyd Alexander,The Chronicles of Prydain,2,fantasy diff --git a/dedoc/api/static/components.js b/dedoc/api/static/components.js deleted file mode 100644 index 45f1e23e..00000000 --- a/dedoc/api/static/components.js +++ /dev/null @@ -1,11 +0,0 @@ -function CheckBoxVisibility(checkboxId, divId, isDivHide = true) { - let checkbox = document.getElementById(checkboxId) - let div = document.getElementById(divId) - - if (isDivHide) - div.style.display = "none" - - checkbox.onchange = function() { - div.style.display = checkbox.checked ? 
"block" : "none" - } -} diff --git a/dedoc/api/static/csv_semicolon.csv b/dedoc/api/static/csv_semicolon.csv deleted file mode 100644 index 5a4abd67..00000000 --- a/dedoc/api/static/csv_semicolon.csv +++ /dev/null @@ -1,3 +0,0 @@ -1;2;3 -2;1;5 -5;3;1 diff --git a/dedoc/api/static/example.zip b/dedoc/api/static/example.zip deleted file mode 100644 index dc8e926a..00000000 Binary files a/dedoc/api/static/example.zip and /dev/null differ diff --git a/dedoc/api/static/example_with_images.xls b/dedoc/api/static/example_with_images.xls deleted file mode 100644 index 6cdf5e6d..00000000 Binary files a/dedoc/api/static/example_with_images.xls and /dev/null differ diff --git a/dedoc/api/static/example_with_images.xlsx b/dedoc/api/static/example_with_images.xlsx deleted file mode 100644 index 7e3f85d4..00000000 Binary files a/dedoc/api/static/example_with_images.xlsx and /dev/null differ diff --git a/dedoc/api/static/exampletable.json b/dedoc/api/static/exampletable.json deleted file mode 100644 index ea2ddd03..00000000 --- a/dedoc/api/static/exampletable.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "location": { - "page_number": 0, - "bbox": [ - 286, - 2918, - 2278, - 3220 - ] - }, - "header": [ - [ - "№\nп/п", - "п/п" - ], - [ - "Наименование позиции", - "Наименование позиции" - ], - [ - "Начальная (максимальная) цена за единицу\nпродукции", - "рублей, включая НДС\n(20%) -" - ], - [ - "Начальная (максимальная) цена за единицу\nпродукции", - "рублей, без учета НДС\n(20%)" - ] - ], - "data": [ - [ - "Г.", - "Клапан регулирующий 142.", - "9499 692.00", - "7916 410,00" - ] - ] -} \ No newline at end of file diff --git a/dedoc/api/static/html_eng/code_example.html b/dedoc/api/static/html_eng/code_example.html deleted file mode 100644 index 53d27920..00000000 --- a/dedoc/api/static/html_eng/code_example.html +++ /dev/null @@ -1,11 +0,0 @@ - - - - Code example - - -

Code Example

-

<- go to the main page 

-

Python code example

-

An example can be found in the file:  

-

dedoc_project/dedoc/examples/example_post.py

\ No newline at end of file diff --git a/dedoc/api/static/html_eng/errors.html b/dedoc/api/static/html_eng/errors.html deleted file mode 100644 index 2d9f142d..00000000 --- a/dedoc/api/static/html_eng/errors.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - Possible Exceptions - - -

Possible Exceptions

-

<- go to the main page 

-
    -
  1. If the document format is not supported (for example, mkv), response 415 will be returned
  2. -
  3. Service checks if the POST-request has the file part. The application accesses the file from the - files dictionary on the request object. If there is no file, the service returns 400.
  4. -
diff --git a/dedoc/api/static/html_eng/form_input.html b/dedoc/api/static/html_eng/form_input.html deleted file mode 100644 index 2a447d21..00000000 --- a/dedoc/api/static/html_eng/form_input.html +++ /dev/null @@ -1,166 +0,0 @@ - - - - Upload New File - - - - - - - - - -
-

Structure Document Recognition

-

<- go to the main page 

-
- - - -
- -
-
-
-

- return_format -

-

-

pages

- -
-

- -

-
-

- -

-

- -

-
- -
-

- -

-

-

-

-

- - -
-
- -
-
-
- -

- -

-

- structure_type -

- - -

- -

- -
-
-
-
-
-
-
-
- - - - - \ No newline at end of file diff --git a/dedoc/api/static/html_eng/format_description.html b/dedoc/api/static/html_eng/format_description.html deleted file mode 100644 index 8447ab52..00000000 --- a/dedoc/api/static/html_eng/format_description.html +++ /dev/null @@ -1,149 +0,0 @@ - - - - Recognized Document Structure - - -

Recognized Document Structure

-

<- go to the main page 

-

The ParsedDocument structure is returned in JSON format.

-

Dedoc supports linear and tree formats. In the linear format every document line is a child of the root. In the -tree structure dedoc tries to reconstruct the logical structure of the document as a tree

- -

ParsedDocument

-
    -
  1. version: str (required field) - - Dedoc version -
  2. -
  3. warnings: List[str] (required field) - - any warnings that occur while processing the document -
  4. -
  5. metadata: DocumentMetadata (required field) - - document meta-information -
  6. -
  7. content: DocumentContent (required field) - parsed document structure -
  8. -
  9. attachments: List[ ParsedDocument ] (optional field) - - attached documents, returned only if the condition for processing attached files is set. See the "with_attachment" parameter in the POST-request -
  10. -
- -

DocumentMetadata

-

Contains document meta-information (for example file name, size, access time).

-
    -
  1. uid: str (required field) - unique document identifier (example: "doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0")
  2. -
  3. file_name: string (required field) - file name (example: "example.odt")
  4. -
  5. bucket_name: str (optional field) - bucket name in which the file is located. - Included when analyzing a file from a cloud storage (example: "dedoc")
  6. -
  7. size: integer (required field) - file size in bytes (example: 20060)
  8. -
  9. modified_time: integer (required field) - modification date of the document in the format UnixTime (example: 1590579805)
  10. -
  11. created_time: integer (required field) - creation date of the document in the format UnixTime (example: 1590579805)
  12. -
  13. access_time: integer (required field) - file access date in format UnixTime (example: 1590579805)
  14. -
  15. file_type: string (optional field) - mime-type file (example: - "application/vnd.oasis.opendocument.text")
  16. -
  17. other_fields: dict (optional field) - each file type has its own set of meta information, - here is a detailed description of the other_fields
  18. -
- - -

DocumentContent

-

Contains document content structure

-
    -
  1. tables: List[Table] (required field) - list of tables in a document
  2. -
  3. structure: TreeNode (required field) - document tree structure -
  4. -
- -

Table

-Detected and parsed table in a document -
    -
  1. cells: List[List[string]] (required field) - list of table cell lists. The cell contains text. -
  2. -
  3. metadata: TableMetadata (required field) - table meta information -
  4. -
- -

TableMetadata

-

Contains table meta information

-
    -
  1. uid: str (required field) - unique identifier.
  2. -
  3. page_id: integer (optional field) - page number on which the table begins. Can be null.
  4. -
- -

TreeNode

-

Contains document tree structure

-
    -
  1. node_id : string (required field) - document item identifier. It is unique within one tree (i.e. in this tree there will not be another such node_id, but in attachment it can occur). - The identifier has the form 0.2.1 where each number symbolizes a serial number at an appropriate level - of the hierarchy.
    For example node_id 0.2.1 means that this element is the second subhead of the third chapter. The first number is the root of the document. Numbering goes from 0. -
  2. -
  3. text: string (required field) - element text;
  4. -
  5. annotations: List[ Annotation ] (required field) - the field describes any properties of the text, for example, boldness, font size, etc. -
  6. -
  7. metadata: ParagraphMetadata (required field) - - meta-information relevant to the entire subparagraph, such as page number and position on this page. -
  8. -
  9. subparagraphs: List[ TreeNode ] (required field) - - "children" of the current item (for example, sub-chapters for a chapter). The structure of "children" is similar to the current one. -
  10. -
- - -

Annotation

-

Contains text annotations

-
    -
  1. start : integer (required field) - annotation start index.
  2. -
  3. end : integer (required field) - annotation end index. - The index of the last character (associated with this annotation) + 1. - For example, if the first character is annotated, then start = 0, end = 1; - if the whole line is annotated, then start = 0, end = length of the line.
  4. -
  5. name : string (required field) - annotation type (size, italic etc).
  6. -
  7. value : string (required field) - annotation value - (to learn more see Concrete types of annotations).
  8. -
- -

Concrete types of annotations.

- - -

ParagraphMetadata

-

Contains paragraph meta information

-
    -
  1. paragraph_type : string (required field) - paragraph type (paragraph, list item and so on). - Possible values depend on the type of document. - Default values: ['root', 'paragraph', 'raw_text', 'list', 'list_item', 'named_header'] - Values for document_type 'law': ['root', 'raw_text', 'struct_unit', 'item', 'article', 'subitem', - 'footer', 'header', 'title', 'part'] -
  2. -
  3. predicted_classes : Dict[str -> float] (optional field) - classifier results, paragraph type - is the probability that the paragraph is of this type, the list of keys depends on the type of document. -
  4. -
  5. page_id : integer (optional field) - page on which this paragraph begins.
  6. -
  7. line_id : integer (optional field) - The line number on which this paragraph begins.
  8. -
- - diff --git a/dedoc/api/static/html_eng/info.html b/dedoc/api/static/html_eng/info.html deleted file mode 100644 index d8b282b3..00000000 --- a/dedoc/api/static/html_eng/info.html +++ /dev/null @@ -1,54 +0,0 @@ - - - - - DeDoc| info page - - - - - -
-

How to parse documents

- -

Upload the file here and look at the result. 

- -

Build and Run

-

DeDoc service start commands:

-
docker build . -t dedoc_container
-
docker run -p 1231:1231 --rm dedoc_container:latest python3 /dedoc/main.py
- -
-> Service should start on port 1231
- -

Service parameters are configured in the config file (dedoc_project/dedoc/config.py)

-

The config is a python file, so you can use everything that standard python can do, for example, calculate the maximum file size as 512 * 1024 * 1024

-

How to use

-

You can send the file using the POST request to the address - host:1231/upload 

-

The name of the downloaded file should appear on the form

- -

Additional query options:

-
    -
  1. language: string - document recognition language. The default value is "rus+eng". Available values: "rus+eng", "rus", "eng".
  2. -
  3. with_attachments: boolean - option including analysis of attached files. The option is False by default. Available values: True, False.
  4. -
  5. return_format: str - an option to return the response in pretty_json, html, json or tree form. - The default value is json. Use the pretty_json, tree and html format for debug only.
    - Warning: html-format is used only for viewing the recognition result (in a readable form). - For further analysis, we recommend using the output json format. -
  6. structure_type: string - type output structure ('linear' or 'tree')
  7. -
-

Other useful links

- -
- - - - diff --git a/dedoc/api/static/html_eng/metadata_description.html b/dedoc/api/static/html_eng/metadata_description.html deleted file mode 100644 index f2ebb607..00000000 --- a/dedoc/api/static/html_eng/metadata_description.html +++ /dev/null @@ -1,31 +0,0 @@ - - - - - Structure of additional metadata - - - -

Structure of additional metadata for different file types

- -

For all files placed in or loaded from the cloud

-
    -
  1. bucket_name: str (required field) - bucket name in which the file is located. - Included when analyzing a file from a cloud storage (example: "dedoc")
  2. -
  3. cloud_file_path: str (required field) absolute path in the "bucket_name" on the cloud.
  4. -
- - -

Docx/doc/odt files

-
    -
  1. document_subject: str (optional field) - the topic of the content of the document.
  2. -
  3. keywords: str (optional field) - a delimited set of keywords to support searching and indexing.
  4. -
  5. category: str (optional field) - a categorization of the content of this document. Example values for this property might include: Resume, Letter, Financial Forecast, Proposal, Technical Presentation, and so on.
  6. -
  7. author: str (optional field) - an entity primarily responsible for making the content of the document.
  8. -
  9. last_modified_by: str (optional field) - the user who performed the last modification. The identification is environment-specific. Examples include a name, email address, or employee ID.
  10. -
  11. created_date: str (optional field) - date of creation of the resource.
  12. -
  13. modified_date: str (optional field) - date on which the resource was changed.
  14. -
  15. last_printed_date: str (optional field) - the date and time of the last printing.
  16. -
- - \ No newline at end of file diff --git a/dedoc/api/static/html_eng/supported_formats.html b/dedoc/api/static/html_eng/supported_formats.html deleted file mode 100644 index 731a6789..00000000 --- a/dedoc/api/static/html_eng/supported_formats.html +++ /dev/null @@ -1,96 +0,0 @@ - - - - Supported Formats - - -

Supported Formats

-

<- go to the main page 

-

Word

-
    -
  1. doc ( - result, - result in html - ) -
  2. -
  3. docx ( - result, - result in html - ) - -
  4. -
  5. odt ( - result, - result in html - ) -
  6. -
-

Excel

-
    -
  1. ods (openoffice excel) ( - result - result in html - ) -
  2. -
  3. xls (old excel format) ( - result, - result in html - ) -
  4. -
  5. xlsx (new excel format) ( - result, - result in html - ) -
  6. -
-

PowerPoint

-
    -
  1. ppt ( - result - result in html - ) -
  2. -
  3. pptx ( - result, - result in html - ) -
  4. -
  5. odp ( - result, - result in html - ) -
  6. -
-

CSV comma separated values

-

If you use a non-standard separator, you can pass its value in the "delimiter" parameter into POST-request.
- For example delimiter=";". Use cases you can find in dedoc_project/dedoc/tests/test_api_csv.py -

-
    -
  1. - csv ( - result, - result in html - ) - -
  2. -
  3. - tsv ( - result, - result in html - ) -
  4. -
-

Others

-
    -
  1. txt file ( - result, - result in html - ) -
  2. -
  3. json file ( - result, - result in html - ) - Here, the keys will be analyzed as vertex types, and lists are treated as lists. -
  4. -
\ No newline at end of file diff --git a/dedoc/api/static/html_rus/code_example.html b/dedoc/api/static/html_rus/code_example.html deleted file mode 100644 index 091860bc..00000000 --- a/dedoc/api/static/html_rus/code_example.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - - Пример кода - - - -

Пример кода

- -

Пример кода на python

-

Примеры использования сервиса вы можете найти в файле:

-

dedoc_project/dedoc/examples/example_post.py

- - diff --git a/dedoc/api/static/html_rus/errors.html b/dedoc/api/static/html_rus/errors.html deleted file mode 100644 index fbd5ec4c..00000000 --- a/dedoc/api/static/html_rus/errors.html +++ /dev/null @@ -1,16 +0,0 @@ - - - - - Возможные исключения - - - -

Возможные исключения

-
    -
  1. Формат документа не поддерживается (например, на документ .mkv система вернет ответ 415)
  2. -
  3. Сервис проверяет, есть ли в POST-запросе файл. Приложение берет файл из словаря files объекта запроса. - В случае отсутствия файла система возвращает 400
  4. -
- - \ No newline at end of file diff --git a/dedoc/api/static/html_rus/form_input.html b/dedoc/api/static/html_rus/form_input.html deleted file mode 100644 index 0d6630af..00000000 --- a/dedoc/api/static/html_rus/form_input.html +++ /dev/null @@ -1,75 +0,0 @@ - - - - - Загрузка файла - - - - - - - - - -
-

Распознавание структуры документа

-

<- вернуться на главную страницу 

-
- - - -
- -
-
-
-

- return_format -

- with_attachments

-

-

-

- -

-

- structure_type -

-
-
-
-
-
-
-
-
- - \ No newline at end of file diff --git a/dedoc/api/static/html_rus/format_description.html b/dedoc/api/static/html_rus/format_description.html deleted file mode 100644 index 7f338393..00000000 --- a/dedoc/api/static/html_rus/format_description.html +++ /dev/null @@ -1,152 +0,0 @@ - - - - - Структура распознанного документа - - -

Структура распознанного документа

-

Возвращается структура ParsedDocument, структура возвращается в виде Json.

-

Dedoc поддерживает линейную и древовидную структуру. В случае линейной структуры все строки документа являются - наследниками корня дерева. В случае древовидной структуры (tree) dedoc пытается восстановить логическую структуру - документа в виде дерева. -

- -

ParsedDocument

-
    -
  1. version: str (обязательное поле) - - версия Dedoc -
  2. -
  3. warnings: List[str] (обязательное поле) - - любые ворнинги, возникшие в процессе обработки документа -
  4. -
  5. metadata: DocumentMetadata (обязательное поле) - - метаинформация документа -
  6. -
  7. content: DocumentContent (обязательное поле) - разобранная - структура документа -
  8. -
  9. attachments: List[ ParsedDocument ] (необязательное поле) - - вложенные документы, возвращается только - если задано условие обработки вложенных файлов. -
  10. -
- -

DocumentMetadata. Метаинформация документа

-
    -
  1. uid: str (обязательное поле) - уникальный идентификатор документа (пример: "doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0")
  2. -
  3. file_name: str (обязательное поле) - имя файла (пример: "example.pdf")
  4. -
  5. size: int (обязательное поле) - размер файла в байтах (пример: 20060)
  6. -
  7. modified_time: int (обязательное поле) - дата модификации документа в формате UnixTime (пример: 1590579805)
  8. -
  9. created_time: int (обязательное поле) - дата создания документа в формате UnixTime (пример: 1590579805)
  10. -
  11. access_time: int (обязательное поле) - дата доступа к файлу в формате UnixTime (пример: 1590579805)
  12. -
  13. file_type: str (необязательное поле) - mime-тип файла (пример: - "application/pdf")
  14. -
  15. other_fields: dict (необязательное поле) - у каждого типа файла свой набор метаинформации, - здесь представлено подробное описание поля other_fields
  16. -
- - -

DocumentContent. Структура содержимого документа

-
    -
  1. tables: List[Table] (обязательное поле) - список таблиц
  2. -
  3. structure: TreeNode (обязательное поле) - древовидная структура - документа -
  4. -
- -

Table. Разобранные таблицы.

-
    -
  1. cells: List[List[str]] (обязательное поле) - список списков строк таблицы, строка - таблицы представляет из себя список содержимого ячеек. Ячейка содержит текст. -
  2. -
  3. metadata: TableMetadata (обязательное поле) - метаинформация о - таблице -
  4. -
- -

TableMetadata. Метаинформация таблицы.

-
    -
  1. uid: str (обязательное поле) - уникальный идентификатор таблицы.
  2. -
  3. page_id: int (необязательное поле) - номер страницы на которой начинается таблица.
  4. -
- -

TreeNode. Древовидная структура документа.

-
    -
  1. node_id : str (обязательное поле) - идентификатор элемента документа. Уникален в рамках одного - дерева (т.е. в этом дереве не будет другого такого node_id, а в attachment может встретиться) - Идентификатор имеет вид 0.2.1, где каждое число символизирует порядковый номер на соответствующем уровне - иерархии.
    Например, node_id 0.2.1 означает, что этот элемент это вторая подглава третьей главы - (нумерация с 0, первое число это корень документа); -
  2. -
  3. text: str (обязательное поле) - текст элемента;
  4. -
  5. annotations: List[ Annotation ] (обязательное поле) - поле описывает - какие-либо свойства текста, например жирность, размер шрифта etc. -
  6. -
  7. metadata: ParagraphMetadata (обязательное поле) - - метаинформация, относящаяся ко всему субпараграфу, например номер страницы и положение на этой странице. -
  8. -
  9. subparagraphs: List[ TreeNode ] (обязательное поле) - - "дети" текущего элемента (например подглавы для главы). Структура "детей" аналогична текущей. -
  10. -
- - -

Annotation. Аннотации текста.

-
    -
  1. start : int (обязательное поле) - индекс начала аннотации.
  2. -
  3. end : int (обязательное поле) - индекс конца аннотации. - Равен индексу последнего символа (относящегося к данной аннотации) + 1. - Например, если в строке проаннотирован только первый символ, то start = 0, end = 1; - если проаннотирована вся строка s, то start = 0, end = длина s.
  4. -
  5. name : string (обязательное поле) - тип аннотации (размер шрифта, курсив и т.д.).
  6. -
  7. value : str (обязательное поле) - значение аннотации - (подробнее в ConcreteAnnotations).
  8. -
- -

Concrete annotations. Конкретные виды аннотаций.

- - -

ParagraphMetadata. Метаинформация, относящаяся к параграфу.

-
    -
  1. paragraph_type : str (обязательное поле) - тип параграфа (заголовок, элемент списка и так далее). - Возможные значения зависят от типа документа. - Значения по умолчанию: ['root', 'paragraph', 'raw_text', 'list', 'list_item', 'named_header'] - Значения для типа документа='law': ['root', 'raw_text', 'struct_unit', 'item', 'article', 'subitem', - 'footer', 'header', 'title', 'part'] -
  2. -
  3. predicted_classes : Dict[str -> float] (необязательное поле) - результаты работы - классификатора, - тип параграфа -> вероятность того, что параграф относится к такому типу, список ключей зависит от типа - документа. -
  4. -
  5. page_id : int (необязательное поле) - страница на которой начинается этот параграф.
  6. -
  7. line_id : int (необязательное поле) - номер строки, на которой начинается этот параграф.
  8. -
- - diff --git a/dedoc/api/static/html_rus/info.html b/dedoc/api/static/html_rus/info.html deleted file mode 100644 index 6f028a16..00000000 --- a/dedoc/api/static/html_rus/info.html +++ /dev/null @@ -1,57 +0,0 @@ - - - - - DeDoc| info page - - - - - -
-

Как парсить документы

-

Идём сюда загружаем файл, смотрим результат. 

- -

Сборка и запуск

-

Команды запуска сервиса DeDoc:

-
docker build . -t dedoc_container
-
docker run -p 1231:1231 --rm dedoc_container:latest python3 /dedoc/main.py
- -
-> Сервис поднимется на порту 1231
-

Параметры можно указать в конфигурационном файле (dedoc_project/dedoc/config.py)

-

Конфиг это файл на языке python, поэтому можно пользоваться - всем, что умеет стандартный python, например вычислять максимальный размер файла как 512 * 1024 * 1024

- -

Как использовать

-

Необходимо отправить файл с помощью POST запроса по адресу - host:1231/upload

- Имя загруженного файла должно появиться на форме. - -

Дополнительные опции запроса:

-
    -
  1. language: str - язык распознавания документа. По умолчанию установлено значение "rus+eng". - Доступные значения: "rus+eng", "rus", "eng".
  2. -
  3. with_attachments: boolean - опция включающая анализ вложенных файлов. По-умолчанию установлено - значение False. Доступные значения True, False.
  4. -
  5. return_format: str - опция для возврата ответа в html-виде, в виде дерева или в виде json. - Возможные значения html, tree, json, pretty_json - По-умолчанию установлено значение json, остальные методы стоит использовать только для дебага
    - Предупреждение: html-формат используется исключительно для просмотра результата распознавания (в читабельном виде). - В целях дальнейшего разбора рекомендуем использовать выходной json-формат.
  6. -
  7. structure_type: string - тип выходной структуры ('linear' или 'tree')
  8. -
- -

Другие полезные ссылки

- -
- - - \ No newline at end of file diff --git a/dedoc/api/static/html_rus/metadata_description.html b/dedoc/api/static/html_rus/metadata_description.html deleted file mode 100644 index 10ff929a..00000000 --- a/dedoc/api/static/html_rus/metadata_description.html +++ /dev/null @@ -1,31 +0,0 @@ - - - - - Структура дополнительных метаданных файлов - - - -

Структура дополнительных метаданных для разных типов файлов

- -

Для всех файлов помещенных или загруженных в/из облака

-
    -
  1. bucket_name: str (обязательное поле) - имя bucket-а в котором лежит файл. - Включено при анализе файла из облачной хранилки (пример: "dedoc")
  2. -
  3. cloud_file_path: str (обязательное поле) - абсолютный путь файла в бакете "bucket_name" на облаке.
  4. -
- - -

Docx/doc/odt файлов

-
    -
  1. document_subject: str (необязательное поле) - тема содержимого документа.
  2. -
  3. keywords: str (необязательное поле) - набор ключевых слов с разделителями для поддержки поиска и индексации.
  4. -
  5. category: str (необязательное поле) - категоризация содержимого документа. Примеры значений этого свойства могут включать: резюме, письмо, финансовый прогноз, предложение, техническую презентацию и т.д.
  6. -
  7. author: str (необязательное поле) - имя автора содержимого документа.
  8. -
  9. last_modified_by: str (необязательное поле) - Пользователь, выполнивший последнюю модификацию. Идентификация зависит от среды. Примеры включают имя, адрес электронной почты или идентификатор сотрудника.
  10. -
  11. created_date: str (необязательное поле) - дата создания документа.
  12. -
  13. modified_date: str (необязательное поле) - последняя дата модификации документа.
  14. -
  15. last_printed_date: str (необязательное поле) - дата и время последней печати документа.
  16. -
- - \ No newline at end of file diff --git a/dedoc/api/static/html_rus/supported_formats.html b/dedoc/api/static/html_rus/supported_formats.html deleted file mode 100644 index c7cb4820..00000000 --- a/dedoc/api/static/html_rus/supported_formats.html +++ /dev/null @@ -1,96 +0,0 @@ - - - - - Поддерживаемые форматы - - -

Поддерживаемые форматы

-

Word

-
    -
  1. doc ( - результат, - результат в html - ) -
  2. -
  3. docx ( - результат, - результат в html - ) - -
  4. -
  5. odt ( - результат, - результат в html - ) -
  6. -
-

Excel

-
    -
  1. ods (openoffice excel) ( - результат, - результат в html - ) -
  2. -
  3. xls (old excel format) ( - результат, - результат в html - ) -
  4. -
  5. xlsx (new excel format) ( - результат, - результат в html - ) -
  6. -
-

PowerPoint

-
    -
  1. ppt ( - результат, - результат в html - ) -
  2. -
  3. pptx ( - результат, - результат в html - ) -
  4. -
  5. odp ( - результат, - результат в html - ) -
  6. -
-

CSV и значения разделителя

-

Если вы используете нестандартный разделитель, вы можете задать его значение в параметре "delimiter" вашего POST-запроса. - Например delimiter=";". Примеры использования вы можете найти в dedoc_project/dedoc/tests/test_api_csv.py -

-
    -
  1. - csv ( - результат, - результат в html - ) - -
  2. -
  3. - tsv ( - результат, - результат в html - ) -
  4. -
-

Другие форматы

-
    -
  1. txt file ( - результат, - результат в html - ) -
  2. -
  3. json file ( - результат, - результат в html - ) - Здесь ключи анализируются как типы вершины, списки обрабатываются как списки. -
  4. -
\ No newline at end of file diff --git a/dedoc/api/static/realistic_json.json b/dedoc/api/static/realistic_json.json deleted file mode 100644 index 8dac0b64..00000000 --- a/dedoc/api/static/realistic_json.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "next_page": "Next Page »", - "type": "news", - "platform": "infosecnews", - "start_url": "https://infosecnews.org", - "next_page-href": "https://www.infosecnews.org/page/3/", - "news_link": "Leading privacy and cybersecurity law firm investigates Tandem Diabetes Care data breach", - "news_link-href": "https://www.infosecnews.org/leading-privacy-and-cybersecurity-law-firm-investigates-tandem-diabetes-care-data-breach/", - "title": "Leading privacy and cybersecurity law firm investigates Tandem Diabetes Care data breach", - "publication_date": "April 20, 2020", - "author": "William Knowles", - "text": [ - "We take the privacy and confidentiality of our customers’ information very seriously and apologize for any inconvenience or concern this incident may cause our customers.", - "With the next sentence…", - "Tandem Diabetes Care, Inc. (“Tandem”) is committed to protecting the confidentiality and security of our customers’ information. Regrettably, this notice is to inform our customers of a recent phishing incident that may have involved some customer information.", - "Some customer information is “reputational risk management code” for only 140,781 customers.", - "We are continuing to invest heavily in cyber security and data protection safeguards. 
We are also implementing additional email security controls, strengthening our user authorization and authentication processes, and limiting the types of data permitted to be transferred via email.", - "On January 17, 2020, Tandem Diabetes Care learned that an unauthorized person gained access to a Tandem employee’s email account through a security incident commonly known as “phishing.”", - "Once we learned about the incident, we immediately secured the account and a cyber security firm was engaged to assist in our investigation. Our investigation determined that a limited number of Tandem employee email accounts may have been accessed by an unauthorized user between January 17, 2020 and January 20, 2020.", - "Through the investigation, Tandem Diabetes Care learned that some customers’ information may have been contained in one or more of the Tandem email accounts affected by the incident. The affected email accounts may have contained customer names, contact information, information related to those customers’ use of Tandem’s products or services, clinical data regarding their diabetes therapy, and in a few limited instances, Social Security numbers.", - "On LinkedIn, Tandem Diabetes Care lists some 935 employees, but only three security people (understandably some of the security team might have temporarily pulled their profiles offline) and currently Tandem is looking for a Security Analyst II and a VP, Information Technology but neither of the job descriptions mention having knowing how to perform phishing exercises.", - "While you would think all this bad news is terrible for Tandem Diabetes Care’s stock price, guess again, when the data breach was submitted to the U.S. 
Department of Health and Human Services on March 13, 2020, TNDM – Tandem Diabetes Care, Inc closed at $46.55 a share and closed on Apri 18, 2020 at $72.94 a share.", - "So it should come to no surprised that Stueve Siegel Hanson LLP, a small Kansas City law firm known for their eight-figure legal outcomes would explore legal options for this data breach.", - "KANSAS CITY, Mo., April 1, 2020 /PRNewswire-PRWeb/ — Stueve Siegel Hanson LLP, a national leader in privacy and cybersecurity litigation, is investigating the data breach at Tandem Diabetes Care, Inc. that compromised the sensitive personal information of 140,000 patients, the firm announced today.", - "On January 17, Tandem discovered its email system had been hacked through a “phishing” scheme. An internal investigation showed several employee email accounts were compromised for three days between January 17 and January 20. The compromised information included names, email addresses, contact information, Social Security numbers and a range of patient data, including details related to customers’ use of Tandem products or services, and clinical data about diabetes therapy.", - "Tandem announced the data breach on March 16 and said it would notify affected customers. Individuals who receive these notifications can contact Stueve Siegel Hanson at 816.714.7105 or online to discuss their legal options.", - "Recognized by Law360 as “Cybersecurity & Privacy Group of the Year,” Stueve Siegel Hanson has prosecuted cases involving the largest data breaches in U.S. history, securing billions of dollars for affected customers. 
In 2019, the firm’s work included:" - ], - "url": "https://www.infosecnews.org/leading-privacy-and-cybersecurity-law-firm-investigates-tandem-diabetes-care-data-breach/" -} \ No newline at end of file diff --git a/dedoc/api/static/styles.css b/dedoc/api/static/styles.css deleted file mode 100644 index 0fe1f2f5..00000000 --- a/dedoc/api/static/styles.css +++ /dev/null @@ -1,43 +0,0 @@ -.content { - max-width: 1200px; - margin: 0 auto; -} - -.body-upload { - margin: 5px; - background: #A6A6A6 -} - -.body-2 { - margin: 15px; -} - -/* Tab Navigation */ -.nav-tabs { - margin: 0; - padding: 0; - border: 0; -} -.nav-tabs > li > a { - background: #DADADA; - border-radius: 0; - box-shadow: inset 0 -8px 7px -9px rgba(0,0,0,.4),-2px -2px 5px -2px rgba(0,0,0,.4); -} -.nav-tabs > li.active > a, -.nav-tabs > li.active > a:hover { - background: #F5F5F5; - box-shadow: inset 0 0 0 0 rgba(0,0,0,.4),-2px -3px 5px -2px rgba(0,0,0,.4); -} - -/* Tab Content */ -.tab-pane { - background: #F5F5F5; - box-shadow: 0 0 4px rgba(0,0,0,.4); - border-radius: 0; - text-align: left; - padding: 10px; -} - -.red { - color:#f00 -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/components/annotation.json b/dedoc/api/static/swagger/components/annotation.json deleted file mode 100644 index 239422be..00000000 --- a/dedoc/api/static/swagger/components/annotation.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "Annotation": { - "type": "object", - "description": "Аннотации текста", - "properties": { - "start": { - "type": "integer", - "format": "int32", - "description": "индекс начала аннотации", - "required": true - }, - "end": { - "type": "integer", - "format": "int32", - "description": "индекс конца аннотации", - "required": true - }, - "value": { - "type": "string", - "description": "значение аннотации. 
Может содержать название стиля (тогда начинается со строки \"style:\") или другие значения", - "required": true, - "enum": ["style: имя_шрифта", "bold", "italic", "underground"] - } - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/components/document_content.json b/dedoc/api/static/swagger/components/document_content.json deleted file mode 100644 index 73835129..00000000 --- a/dedoc/api/static/swagger/components/document_content.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "DocumentContent": { - "type": "object", - "description": "Структура содержимого документа", - "properties": { - "tables":{ - "type": "array", - "items": { - "$ref": "table.json#/Table" - } - }, - "structure": { - "$ref": "tree_node.json#/TreeNode" - } - - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/components/document_metadata.json b/dedoc/api/static/swagger/components/document_metadata.json deleted file mode 100644 index 9fa6e885..00000000 --- a/dedoc/api/static/swagger/components/document_metadata.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "DocumentMetadata": { - "type": "object", - "description": "Метаинформация документа", - "properties": { - "file_name": { - "type": "string", - "required": true, - "description": "file name", - "example": "example.odt" - }, - "bucket_name": { - "type": "string", - "required": false, - "description": "bucket name in which the file is located. 
Included when analyzing a file from a cloud storage", - "example": "dedoc" - }, - "size": { - "type": "integer", - "format": "int32", - "required": true, - "description": "file size in bytes", - "example": "20060" - }, - "modified_time": { - "type": "integer", - "format": "int32", - "required": true, - "description": "modification date of the document in the format UnixTime", - "example": "1590579805" - }, - "created_time": { - "type": "integer", - "format": "int32", - "required": true, - "description": "creation date of the document in the format UnixTime", - "example": "1590579805" - }, - "access_time": { - "type": "integer", - "format": "int32", - "required": true, - "description": "file access date in format UnixTime", - "example": "1590579805" - }, - "file_type": { - "type": "string", - "required": false, - "description": "mime-type file", - "example": "application/vnd.oasis.opendocument.text" - } - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/components/paragraph_metadata.json b/dedoc/api/static/swagger/components/paragraph_metadata.json deleted file mode 100644 index 1c5a434f..00000000 --- a/dedoc/api/static/swagger/components/paragraph_metadata.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "ParagraphMetadata": { - "type": "object", - "description": "Метаинформация, относящаяся к параграфу", - "properties": { - "paragraph_type": { - "type": "string", - "required": true, - "description": "тип параграфа (заголовок, элемент списка и так далее).. Возможные значения зависят от типа документа. Значения по умолчанию: ['root', 'paragraph', 'raw_text', 'list', 'list_item', 'named_header'] Значения для типа документа='law': ['root', 'raw_text', 'struct_unit', 'item', 'article', 'subitem', 'footer', 'header', 'title', 'part'] " - }, - "predicted_classes": { - "required": false, - "type": "object", - "description": "результаты работы классификатора, [{ тип параграфа: вероятность }]. 
Вероятность - это вероятность отношения параграфа к такому типу, список типов параграфов зависит от типа документа" - }, - "page_id": { - "type": "integer", - "format": "int32", - "description": "страница, с которой начинается этот параграф", - "required": false - }, - "line_id": { - "type": "integer", - "format": "int32", - "description": "номер строки, с которой начинается этот параграф", - "required": false - } - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/components/parsed_document.json b/dedoc/api/static/swagger/components/parsed_document.json deleted file mode 100644 index 5ef3c76d..00000000 --- a/dedoc/api/static/swagger/components/parsed_document.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "ParsedDocument": { - "type": "object", - "properties": { - "metadata": { - "$ref": "document_metadata.json#/DocumentMetadata" - }, - "content": { - "$ref": "document_content.json#/DocumentContent" - }, - "attachments": { - "type": "array", - "required": false, - "items": { - "$ref": "#/ParsedDocument" - } - } - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/components/table.json b/dedoc/api/static/swagger/components/table.json deleted file mode 100644 index 75138184..00000000 --- a/dedoc/api/static/swagger/components/table.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "Table": { - "type": "object", - "description": "Распознанные таблицы", - "required": true, - "properties": { - "cells": { - "type": "array", - "items": { - "type": "array", - "items": { - "type": "string", - "format": "string", - "description": "Ячейка содрежит текст" - } - }, - "description": "список списков строк таблицы, строка таблицы представляет из себя списко содержимого ячеек. 
Ячейка содрежит текст" - }, - "metadata": { - "$ref": "table_metadata.json#/TableMetadata" - } - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/components/table_metadata.json b/dedoc/api/static/swagger/components/table_metadata.json deleted file mode 100644 index ce575a07..00000000 --- a/dedoc/api/static/swagger/components/table_metadata.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "TableMetadata": { - "type": "object", - "description": "Метаинформация таблицы", - "properties": { - "page_id": { - "type": "integer", - "format": "int32", - "required": false, - "description": "номер страницы на которой начинается таблица" - } - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/components/tree_node.json b/dedoc/api/static/swagger/components/tree_node.json deleted file mode 100644 index ea45e685..00000000 --- a/dedoc/api/static/swagger/components/tree_node.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "TreeNode": { - "type": "object", - "description": "Древовидная структура документа", - "properties": { - "node_id": { - "type": "string", - "description": "идентификатор элемента документа. Уникален в рамках одного дерева (т.е. в этом дереве не будет другого такого node_id, а в attachment может встретиться) Идентификатор имеет вид 0.2.1 где каждое число символизирует порядковый номер на соотвтетствующем уровне иерархии.", - "required": true, - "example": "0.2.1" - }, - "text": { - "type": "string", - "description": "текст элемента", - "required": true - }, - "annotations": { - "type": "array", - "items": { - "$ref": "annotation.json#/Annotation" - }, - "description": "описывает какие-либо свойства текста, например жирность, размер шрифта и т.д.", - "required": true - }, - "metadata": { - "$ref": "paragraph_metadata.json#/ParagraphMetadata" - }, - "subparagraphs": { - "type": "array", - "items": { - "$ref": "#/TreeNode" - }, - "description": "\"дети\" типа \"TreeNode\" текущего элемента (например подглавы для главы). 
Структура \"детей\" аналогична текущей.", - "required": true - } - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/swagger/swagger.json b/dedoc/api/static/swagger/swagger.json deleted file mode 100644 index 9c8b143b..00000000 --- a/dedoc/api/static/swagger/swagger.json +++ /dev/null @@ -1,177 +0,0 @@ -{ - "openapi": "3.0.1", - "info": { - "description": "API description", - "version": "1.0.0", - "title": "Automatic structure document extractor", - "license": { - "name": "MIT", - "url": "https://opensource.org/licenses/MIT" - } - }, - "servers": [ - { - "url": "/" - } - ], - "tags": [ - { - "name": "Upload File", - "description": "Example API for requesting and return document structure requests" - } - ], - "paths": { - "/upload": { - "post": { - "tags": [ - "Upload File" - ], - "summary": "Returns reconized document structure", - "requestBody": { - "content": { - "multipart/form-data": { - "schema": { - "type": "object", - "properties": { - "file": { - "type": "string", - "format": "binary" - } - } - } - } - } - }, - "parameters": [ - { - "in": "query", - "name": "language", - "required": false, - "description": "язык рапознавания документа.", - "schema": { - "type": "string", - "default": "rus+eng", - "enum": ["rus+eng", "rus", "eng"] - } - }, - { - "in": "query", - "name": "with_attachments", - "required": false, - "description": "опция включающая анализ вложенных файлов.", - "schema": { - "type": "boolean", - "default": false - } - }, - { - "in": "query", - "name": "return_html", - "required": false, - "description": "опция для возврата ответа в html-виде.", - "schema": { - "type": "boolean", - "default": false - } - }, - { - "in": "query", - "name": "document_type", - "required": false, - "description": "тип документа. 
", - "schema": { - "type": "string", - "enum": ["", "law", "article"], - "default": "" - } - }, - { - "in": "query", - "name": "structure_type", - "required": false, - "description": "тип выходной структуры (линейная или древовидная)", - "schema": { - "type": "string", - "enum": ["linear", "tree"], - "default": "linear" - } - }, - { - "in": "query", - "name": "pdf_with_text_layer", - "required": false, - "description": "опция плагина Docreader для извлечения текста из текстового слоя в PDF или с помощью OCR-методов из изображения документов.", - "schema": { - "type": "boolean", - "default": false - } - }, - { - "in": "query", - "name": "orient_analysis_cells", - "required": false, - "description": "опция плагина Docreader модуля распознавания таблиц включение анализа повернутых ячеек в заголовках таблиц.", - "schema": { - "type": "boolean", - "default": false - } - }, - { - "in": "query", - "name": "orient_cell_angle", - "required": false, - "description": "опция плагина Docreader для установки ориентации ячеек в заголовках таблиц. 
\"90\" - ячейки повернуты на 90 градусов по часовой стрелке, \"270\" - ячейки повернуты на 90 градусов против часовой стрелки (или 270 по часовой)", - "schema": { - "type": "string", - "enum": [ - "90", - "270" - ], - "default": "270" - } - } - ], - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ParsedDocument" - } - } - } - } - } - } - } - }, - "components": { - "schemas": { - "ParsedDocument": { - "$ref": "components/parsed_document.json#/ParsedDocument" - }, - "DocumentMetadata": { - "$ref": "components/document_metadata.json#/DocumentMetadata" - }, - "DocumentContent": { - "$ref": "components/document_content.json#/DocumentContent" - }, - "Table": { - "$ref": "components/table.json#/Table" - }, - "TableMetadata": { - "$ref": "components/table_metadata.json#/TableMetadata" - }, - "TreeNode": { - "$ref": "components/tree_node.json#/TreeNode" - }, - "Annotation": { - "$ref": "components/annotation.json#/Annotation" - }, - "ParagraphMetadata": { - "$ref": "components/paragraph_metadata.json#/ParagraphMetadata" - } - } - } -} \ No newline at end of file diff --git a/dedoc/api/static/train_dataset/refit_classifier.html b/dedoc/api/static/train_dataset/refit_classifier.html deleted file mode 100644 index f59f2642..00000000 --- a/dedoc/api/static/train_dataset/refit_classifier.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - Dedoc | Обучение классификаторов - - - - - -
-

Дообучение классификаторов

- -

Классификатор типа строк

-

- Переобучение классификаторов происходит с помощью скриптов dedoc/scripts/train_clf_type_line_classifier.py, - где clf_type - тип классификатора (tz, law, diploma). -

- -

Классификатор ориентации изображения документа

-

- Переобучение классификатора ориентации происходит с помощью скрипта dedoc/scripts/train_acc_orientation_classifier.py -

- -

Использование:

- - train_acc_orientation_classifier.py [-h] -t TRAIN [-s CHECKPOINT_SAVE][-l CHECKPOINT_LOAD][-f FROM_CHECKPOINT] -d INPUT_DATA_FOLDER - - -

Параметры:

- -

- - -h, --help информация по аргументам - -

- -

- - -t TRAIN, --train TRAIN (--train True) или проверка модели (--train False) - -

- -

- - -s CHECKPOINT_SAVE, --checkpoint_save CHECKPOINT_SAVE путь до весов checkpoint.pth для сохранения (используется только для этапа тренировки) - -

- -

- - -l CHECKPOINT_LOAD, --checkpoint_load CHECKPOINT_LOAD путь до весов checkpoint.pth, которые будут загружены в модель - -

- -

- - -f FROM_CHECKPOINT, --from_checkpoint FROM_CHECKPOINT флаг устанавливается, если обучение нужно продолжить из чекпоинта, указанного в CHECKPOINT_LOAD - -

- -

- - -d INPUT_DATA_FOLDER, --input_data_folder INPUT_DATA_FOLDER путь до папки с тренировочными (подпапка train) и тестовыми данными (подпапка test). - -

-
\ No newline at end of file diff --git a/dedoc/api/train_dataset/api_args.py b/dedoc/api/train_dataset/api_args.py deleted file mode 100644 index 53eb0d2c..00000000 --- a/dedoc/api/train_dataset/api_args.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Optional - -from fastapi import Body - -from dedoc.api.api_args import QueryParameters - - -class TrainDatasetParameters(QueryParameters): - type_of_task: Optional[str] - task_size: Optional[str] - - def __init__(self, - type_of_task: Optional[str] = Body(description="Type of the task to create", default=None), # noqa - task_size: Optional[str] = Body(description="Maximum number of images in one task", default=None), # noqa - - document_type: Optional[str] = Body(default=None), # noqa - pdf_with_text_layer: Optional[str] = Body(default=None), # noqa - language: Optional[str] = Body(default=None), # noqa - need_header_footer_analysis: Optional[str] = Body(default=None), # noqa - - **data: dict) -> None: - - super().__init__(**data) - self.type_of_task: str = type_of_task or "" - self.task_size: str = task_size or "250" - - self.document_type = document_type or "" - self.pdf_with_text_layer = pdf_with_text_layer or "auto" - self.language = language or "rus+eng" - self.need_header_footer_analysis = need_header_footer_analysis or "false" diff --git a/dedoc/api/train_dataset/api_collect_train_dataset.py b/dedoc/api/train_dataset/train_dataset_api.py similarity index 89% rename from dedoc/api/train_dataset/api_collect_train_dataset.py rename to dedoc/api/train_dataset/train_dataset_api.py index 047edfe7..4493b51d 100644 --- a/dedoc/api/train_dataset/api_collect_train_dataset.py +++ b/dedoc/api/train_dataset/train_dataset_api.py @@ -1,15 +1,18 @@ +import dataclasses import logging import os import shutil +from dataclasses import dataclass +from typing import Optional import uvicorn -from fastapi import Depends, FastAPI, File, Request, Response, UploadFile +from fastapi import Depends, FastAPI, File, Form, Request, 
Response, UploadFile from fastapi.staticfiles import StaticFiles from starlette.responses import FileResponse, HTMLResponse from starlette.templating import Jinja2Templates +from dedoc.api.api_args import QueryParameters from dedoc.api.dedoc_api import _get_static_file_path -from dedoc.api.train_dataset.api_args import TrainDatasetParameters from dedoc.api.train_dataset.async_archive_handler import AsyncHandler from dedoc.config import get_config from dedoc.dedoc_manager import DedocManager @@ -21,16 +24,28 @@ from dedoc.train_dataset.train_dataset_utils import get_path_original_documents from dedoc.utils.utils import calculate_file_hash + +@dataclass +class TrainDatasetParameters(QueryParameters): + type_of_task: Optional[str] = Form("law_classifier", + enum=[ + "law_classifier", "tz_classifier", "diploma_classifier", "header_classifier", "paragraph_classifier", + "tables_classifier" + ], + description="Type of the task to create") + task_size: Optional[str] = Form("250", description="Maximum number of images in one task") + + config = get_config() PORT = config["api_port"] -static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "static") +static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "web") static_files_dirs = config.get("static_files_dirs") logger = config.get("logger", logging.getLogger()) app = FastAPI() -app.mount("/static", StaticFiles(directory=static_path), name="static") +app.mount("/web", StaticFiles(directory=static_path), name="web") templates = Jinja2Templates(directory=os.path.join(static_path, "train_dataset")) manager = DedocManager(config=config) @@ -161,7 +176,7 @@ def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParam Run the whole pipeline of task making. """ clear() - parameters = query_params.dict(by_alias=True) + parameters = dataclasses.asdict(query_params) uid = handler.handle(file=file, parameters=parameters) return HTMLResponse(f'Successfully handle file. UID=

get_result_archive/?uid={uid}

', status_code=201) @@ -186,11 +201,6 @@ def get_result_archive(request: Request, uid: str) -> Response: return HTMLResponse(response, status_code=202) -@app.get("/info_classifiers") -def get_classifiers_info() -> Response: - return FileResponse(os.path.join(static_path, "train_dataset/refit_classifier.html")) - - @app.get("/static_file") def get_static_file(request: Request) -> Response: path = _get_static_file_path(request) diff --git a/dedoc/api/static/example.bmp b/dedoc/api/web/examples/example.bmp similarity index 100% rename from dedoc/api/static/example.bmp rename to dedoc/api/web/examples/example.bmp diff --git a/dedoc/api/static/csv_coma.csv b/dedoc/api/web/examples/example.csv similarity index 100% rename from dedoc/api/static/csv_coma.csv rename to dedoc/api/web/examples/example.csv diff --git a/dedoc/api/web/examples/example.djvu b/dedoc/api/web/examples/example.djvu new file mode 100644 index 00000000..fad4e491 Binary files /dev/null and b/dedoc/api/web/examples/example.djvu differ diff --git a/dedoc/api/static/example.doc b/dedoc/api/web/examples/example.doc similarity index 100% rename from dedoc/api/static/example.doc rename to dedoc/api/web/examples/example.doc diff --git a/dedoc/api/static/example.docx b/dedoc/api/web/examples/example.docx similarity index 100% rename from dedoc/api/static/example.docx rename to dedoc/api/web/examples/example.docx diff --git a/dedoc/api/web/examples/example.eml b/dedoc/api/web/examples/example.eml new file mode 100644 index 00000000..38974420 --- /dev/null +++ b/dedoc/api/web/examples/example.eml @@ -0,0 +1,193 @@ +Delivered-To: example-to@mail.ru +Return-path: example-from@mail.ru +Received: by f165.i.mail.ru with local (envelope-from ) + id 1qnJkQ-00076T-JV + for example-from@mail.ru; Mon, 02 Oct 2023 17:18:11 +0300 +Received: by e.mail.ru with HTTP; + Mon, 02 Oct 2023 17:18:10 +0300 +From: example-from@mail.ru +To: example-from@mail.ru +Subject: =?UTF-8?B?0J/RgNC40LzQtdGAINC00L7QutGD0LzQtdC90YLQsA==?= 
+MIME-Version: 1.0 +X-Mailer: Mail.Ru Mailer 1.0 +X-SenderField-Remind: 0 +Date: Mon, 02 Oct 2023 17:18:10 +0300 +X-Priority: 3 (Normal) +Message-ID: <1696256290.920042842@f702.i.mail.ru> +Reply-To: example-from@mail.ru +Content-Type: multipart/alternative; + boundary="--ALT--fE8cBf3B888BF99ccBe7796dD7741dBb1696256290" +Authentication-Results: f165.i.mail.ru; auth=pass smtp.auth=example-from@mail.ru smtp.mailfrom=example-from@mail.ru +X-7564579A: 646B95376F6C166E +X-77F55803: 119C1F4DF6A9251C576A02CBD53F3616E4335C139B4C675ECE87FDE172BB3E448FD872164937FA4C432AC4BF9B9FFA550404E91E000CFA55A3D6A2C43D240DA1314E81A09D0172C5 +X-7FA49CB5: 70AAF3C13DB70168C09775C1D3CA48CF1CFE956E66B77AE0B2086D80B0504778CF19DD082D7633A0ACBFF42033827DA764CD17681C2FEB7A7680F9384605B90310F4037EAD50AE02C4224003CC836476ABE134FDCE4E2725BFD28B28ED4578739E625A9149C048EEB1593CA6EC85F86D36C75B72B9FDC350B287FD4696A6DC2FA8DF7F3B2552694A4E2F5AFA99E116B42401471946AA11AF1661749BA6B977356CEA61CADDE926D952120BFB3F63BC185F65E78799B30205C33C3ADAEA971F8E611E41BBFE2FEB2B210D8477AC7857873939F2FE46A7CD00815222C924CCE7D18EEF46B7454FC60B9742502CCDD46D0D9E541A154B51D14BF6B57BC7E64490618DEB871D839B73339E8FC8737B5C22498424CA1AAF98A6958941B15DA834481FCF19DD082D7633A0EF3E4896CB9E6436389733CBF5DBD5E9D5E8D9A59859A8B68424CA1AAF98A6958941B15DA834481F9449624AB7ADAF372E808ACE2090B5E14AD6D5ED66289B5259CC434672EE63711DD303D21008E298D5E8D9A59859A8B6B372FE9A2E580EFC725E5C173C3A84C3FF9565596324FD1A35872C767BF85DA2F004C90652538430E4A6367B16DE6309 +X-C1DE0DAB: 0D63561A33F958A5FDEA6B56F8F2A28121A36F7F1A8E37510FB76DD771BE18D0F87CCE6106E1FC07E67D4AC08A07B9B05DBE5CE84B47B0E4BDAD6C7F3747799A +X-C8649E89: 
1C3962B70DF3F0ADE2815F1F17DA719077DD89D51EBB77422CCB5A6D6581D03D0776B5B2C279835F17BCBE6708A5A68D02015372BE9702A28680ACF6754497CA6B69A086469952FD198833F085E5D146262D039804A19DA37DF08FB5148B8A039B815B7972F6FBBE8B71F33FBE42112FD420D76BB3D91AA152EE4E5D9E54FDA44C41F94D744909CE6757CB019CE3CF4AEDC07D886187A58A21243E0B3CBCF429720F7641D4628F22CC2E138FFB4ACBED +X-D57D3AED: 3ZO7eAau8CL7WIMRKs4sN3D3tLDjz0dLbV79QFUyzQ2Ujvy7cMT6pYYqY16iZVKkSc3dCLJ7zSJH7+u4VD18S7Vl4ZUrpaVfd2+vE6kuoey4m4VkSEu530nj6fImhcD4MUrOEAnl0W826KZ9Q+tr5+wYjsrrSY/u8Y3PrTqANeitKFiSd6Yd7yPpbiiZ/d5BsxIjK0jGQgCHUM3Ry2Lt2G3MDkMauH3h0dBdQGj+BB/iPzQYh7XS329fgu+/vnDh+nImolR1znLvBa2J/FN+MA== +X-F696D7D5: rbLgF9MhFk2XOxPPJRpEg7zhFriRLlmUK3clX68L3zdYDGS3dWy+KvpyJqJUdc5y7wWtifxTfjA= +X-Mailru-Sender: 583F1D7ACE8F49BD906739E5DAD9EEDC2B0103DF9737AD9E2984A434242ABEC421603B1BB836D9433DB0444B1FEC91ABA75367E14081DD76BE5713C6538D88B35A1FDCB81296DE7A211C1FB5AE3B037A4C9521D7C2F46DACC2C3514047B3292006380BDB115D312DB4A721A3011E896F +X-Mras: Ok +X-Spam: undefined +X-Mailru-Intl-Transport: d,5bf591a + + +----ALT--fE8cBf3B888BF99ccBe7796dD7741dBb1696256290 +Content-Type: text/plain; charset=utf-8 +Content-Transfer-Encoding: base64 + +CsKgCtCf0YDQuNC80LXRgCDQtNC+0LrRg9C80LXQvdGC0LAK0JPQu9Cw0LLQsCAxCtCa0LDQutC4 +0LUg0YLQviDQvtC/0YDQtdC00LXQu9C10L3QuNGPCtCh0YLQsNGC0YzRjyAxCtCe0L/RgNC10LTQ +tdC70LjQvCDQvtC/0YDQtNC10LvQtdC90LjRjwrQodGC0LDRgtGM0Y8gMgrQlNCw0LTQuNC8INC/ +0L7Rj9GB0L3QtdC90LjRjwoxLjIuMSDQn9C+0Y/RgdC90LjQvCDQt9CwINC90LXQv9C+0L3Rj9GC +0L3QvtC1CjEuMi4yLiDQn9C+0Y/RgdC90LjQvCDQt9CwINC/0L7QvdGP0YLQvdC+0LUK0LApINGN +0YLQviDQtNCw0LbQtSDQtdC20YMg0L/QvtC90Y/RgtC90L4K0LEpINGN0YLQviDQtdC20YMg0L3Q +tSDQv9C+0L3Rj9GC0L3QvgoxLjIuMwrCoApOCtCk0LDQvNC40LvQuNGPCtCY0LzRjwrQntGA0LPQ +sNC90LjQt9Cw0YbQuNGPCtCi0LXQu9C10YTQvtC9CtCf0YDQuNC80LXRh9Cw0L3QuNGPCjEK0JjQ +stCw0L3QvtCyCtCY0LLQsNC9CtCY0KHQnwo4LTgwMArCoArCoArCoArQpNCw0LzQuNC70LjRjwrQ +mNC80Y8K0J7RgtGH0LXRgdGC0LLQvgrQmNCy0LDQvdC+0LIK0JjQstCw0L0K0JjQstCw0L3QvtCy 
+0LjRhwrQn9C10YLRgNC+0LIK0J/RkdGC0YAK0J/QtdGC0YDQvtCy0LjRhwrQodC40LTQvtGA0L7Q +sgrQodC40LTQvtGACtCh0LjQtNC+0YDQvtCy0LjRhwrCoA== + +----ALT--fE8cBf3B888BF99ccBe7796dD7741dBb1696256290 +Content-Type: text/html; charset=utf-8 +Content-Transfer-Encoding: base64 + +CjxIVE1MPjxCT0RZPjxkaXYgY2xhc3M9ImpzLWhlbHBlciBqcy1yZWFkbXNnLW1zZyI+PGRpdiBp +ZD0ic3R5bGVfMTY5NjI1NjIzODE2NjA4OTc0NzEiPjxkaXYgaWQ9InN0eWxlXzE2OTYyNTYyMzgx +NjYwODk3NDcxX0JPRFkiPjxkaXYgY2xhc3M9ImNsXzMxOTE0MCI+PGRpdj4mbmJzcDs8L2Rpdj48 +ZGl2PjxwIHN0eWxlPSJjb2xvcjojMDAwMDAwOyBmb250OjE4LjBweCBUaW1lcyI+PGI+0J/RgNC4 +0LzQtdGAINC00L7QutGD0LzQtdC90YLQsDwvYj48L3A+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7 +IGZvbnQ6MTYuMHB4IFRpbWVzIj48Yj7Qk9C70LDQstCwIDE8L2I+PC9wPjxwIHN0eWxlPSJjb2xv +cjojMDAwMDAwOyBmb250OjEyLjBweCBUaW1lcyI+0JrQsNC60LjQtSDRgtC+INC+0L/RgNC10LTQ +tdC70LXQvdC40Y88L3A+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVz +Ij48Yj7QodGC0LDRgtGM0Y8gMTwvYj48L3A+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6 +MTIuMHB4IFRpbWVzIj7QntC/0YDQtdC00LXQu9C40Lwg0L7Qv9GA0LTQtdC70LXQvdC40Y88L3A+ +PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzIj48Yj7QodGC0LDRgtGM +0Y8gMjwvYj48L3A+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzIj7Q +lNCw0LTQuNC8INC/0L7Rj9GB0L3QtdC90LjRjzwvcD48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsg +Zm9udDoxMi4wcHggVGltZXMiPjEuMi4xINCf0L7Rj9GB0L3QuNC8INC30LAg0L3QtdC/0L7QvdGP +0YLQvdC+0LU8L3A+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzIj4x +LjIuMi4g0J/QvtGP0YHQvdC40Lwg0LfQsCDQv9C+0L3Rj9GC0L3QvtC1PC9wPjxwIHN0eWxlPSJj +b2xvcjojMDAwMDAwOyBmb250OjEyLjBweCBUaW1lcyI+0LApINGN0YLQviDQtNCw0LbQtSDQtdC2 +0YMg0L/QvtC90Y/RgtC90L48L3A+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4 +IFRpbWVzIj7QsSkg0Y3RgtC+INC10LbRgyDQvdC1INC/0L7QvdGP0YLQvdC+PC9wPjxwIHN0eWxl +PSJjb2xvcjojMDAwMDAwOyBmb250OjEyLjBweCBUaW1lcyI+MS4yLjM8L3A+PHAgc3R5bGU9ImNv +bG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzOyBtaW4taGVpZ2h0OjE0LjBweCI+Jm5ic3A7 +PC9wPjx0YWJsZSBzdHlsZT0iYm9yZGVyLWNvbGxhcHNlOiBjb2xsYXBzZSIgY2VsbHNwYWNpbmc9 
+IjAiIGNlbGxwYWRkaW5nPSIwIj48dGJvZHk+PHRyPjx0ZCB2YWxpZ249InRvcCIgc3R5bGU9Indp +ZHRoOiA3MS4wcHg7aGVpZ2h0OiAxMy4wcHg7Ym9yZGVyLXN0eWxlOiBzb2xpZDtib3JkZXItd2lk +dGg6IDEuMHB4IDEuMHB4IDEuMHB4IDEuMHB4O2JvcmRlci1jb2xvcjogIzAwMDAwMCAjMDAwMDAw +ICMwMDAwMDAgIzAwMDAwMDtwYWRkaW5nOiA0LjBweCA0LjBweCA0LjBweCA0LjBweCI+PHAgc3R5 +bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzIj5OPC9wPjwvdGQ+PHRkIHZhbGln +bj0idG9wIiBzdHlsZT0id2lkdGg6IDcyLjBweDtoZWlnaHQ6IDEzLjBweDtib3JkZXItc3R5bGU6 +IHNvbGlkO2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9y +OiAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQu +MHB4IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPtCk +0LDQvNC40LvQuNGPPC9wPjwvdGQ+PHRkIHZhbGlnbj0idG9wIiBzdHlsZT0id2lkdGg6IDcxLjBw +eDtoZWlnaHQ6IDEzLjBweDtib3JkZXItc3R5bGU6IHNvbGlkO2JvcmRlci13aWR0aDogMS4wcHgg +MS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9yOiAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMCAj +MDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQuMHB4IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6 +IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPtCY0LzRjzwvcD48L3RkPjx0ZCB2YWxpZ249InRv +cCIgc3R5bGU9IndpZHRoOiA3MS4wcHg7aGVpZ2h0OiAxMy4wcHg7Ym9yZGVyLXN0eWxlOiBzb2xp +ZDtib3JkZXItd2lkdGg6IDEuMHB4IDEuMHB4IDEuMHB4IDEuMHB4O2JvcmRlci1jb2xvcjogIzAw +MDAwMCAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMDtwYWRkaW5nOiA0LjBweCA0LjBweCA0LjBweCA0 +LjBweCI+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzIj7QntGA0LPQ +sNC90LjQt9Cw0YbQuNGPPC9wPjwvdGQ+PHRkIHZhbGlnbj0idG9wIiBzdHlsZT0id2lkdGg6IDcy +LjBweDtoZWlnaHQ6IDEzLjBweDtib3JkZXItc3R5bGU6IHNvbGlkO2JvcmRlci13aWR0aDogMS4w +cHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9yOiAjMDAwMDAwICMwMDAwMDAgIzAwMDAw +MCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQuMHB4IDQuMHB4Ij48cCBzdHlsZT0iY29s +b3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPtCi0LXQu9C10YTQvtC9PC9wPjwvdGQ+PHRk +IHZhbGlnbj0idG9wIiBzdHlsZT0id2lkdGg6IDcxLjBweDtoZWlnaHQ6IDEzLjBweDtib3JkZXIt +c3R5bGU6IHNvbGlkO2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVy 
+LWNvbG9yOiAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQu +MHB4IDQuMHB4IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGlt +ZXMiPtCf0YDQuNC80LXRh9Cw0L3QuNGPPC9wPjwvdGQ+PC90cj48dHI+PHRkIHZhbGlnbj0idG9w +IiBzdHlsZT0id2lkdGg6IDcxLjBweDtoZWlnaHQ6IDE0LjBweDtib3JkZXItc3R5bGU6IHNvbGlk +O2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9yOiAjMDAw +MDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQuMHB4IDQu +MHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPjE8L3A+PC90 +ZD48dGQgdmFsaWduPSJ0b3AiIHN0eWxlPSJ3aWR0aDogNzIuMHB4O2hlaWdodDogMTQuMHB4O2Jv +cmRlci1zdHlsZTogc29saWQ7Ym9yZGVyLXdpZHRoOiAxLjBweCAxLjBweCAxLjBweCAxLjBweDti +b3JkZXItY29sb3I6ICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwICMwMDAwMDA7cGFkZGluZzogNC4w +cHggNC4wcHggNC4wcHggNC4wcHgiPjxwIHN0eWxlPSJjb2xvcjojMDAwMDAwOyBmb250OjEyLjBw +eCBUaW1lcyI+0JjQstCw0L3QvtCyPC9wPjwvdGQ+PHRkIHZhbGlnbj0idG9wIiBzdHlsZT0id2lk +dGg6IDcxLjBweDtoZWlnaHQ6IDE0LjBweDtib3JkZXItc3R5bGU6IHNvbGlkO2JvcmRlci13aWR0 +aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9yOiAjMDAwMDAwICMwMDAwMDAg +IzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQuMHB4IDQuMHB4Ij48cCBzdHls +ZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPtCY0LLQsNC9PC9wPjwvdGQ+PHRk +IHZhbGlnbj0idG9wIiBzdHlsZT0id2lkdGg6IDcxLjBweDtoZWlnaHQ6IDE0LjBweDtib3JkZXIt +c3R5bGU6IHNvbGlkO2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVy +LWNvbG9yOiAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQu +MHB4IDQuMHB4IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGlt +ZXMiPtCY0KHQnzwvcD48L3RkPjx0ZCB2YWxpZ249InRvcCIgc3R5bGU9IndpZHRoOiA3Mi4wcHg7 +aGVpZ2h0OiAxNC4wcHg7Ym9yZGVyLXN0eWxlOiBzb2xpZDtib3JkZXItd2lkdGg6IDEuMHB4IDEu +MHB4IDEuMHB4IDEuMHB4O2JvcmRlci1jb2xvcjogIzAwMDAwMCAjMDAwMDAwICMwMDAwMDAgIzAw +MDAwMDtwYWRkaW5nOiA0LjBweCA0LjBweCA0LjBweCA0LjBweCI+PHAgc3R5bGU9ImNvbG9yOiMw +MDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzIj44LTgwMDwvcD48L3RkPjx0ZCB2YWxpZ249InRvcCIg 
+c3R5bGU9IndpZHRoOiA3MS4wcHg7aGVpZ2h0OiAxNC4wcHg7Ym9yZGVyLXN0eWxlOiBzb2xpZDti +b3JkZXItd2lkdGg6IDEuMHB4IDEuMHB4IDEuMHB4IDEuMHB4O2JvcmRlci1jb2xvcjogIzAwMDAw +MCAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMDtwYWRkaW5nOiA0LjBweCA0LjBweCA0LjBweCA0LjBw +eCI+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzOyBtaW4taGVpZ2h0 +OjE0LjBweCI+Jm5ic3A7PC9wPjwvdGQ+PC90cj48L3Rib2R5PjwvdGFibGU+PHAgc3R5bGU9ImNv +bG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzOyBtaW4taGVpZ2h0OjE0LjBweCI+Jm5ic3A7 +PC9wPjxwIHN0eWxlPSJjb2xvcjojMDAwMDAwOyBmb250OjEyLjBweCBUaW1lczsgbWluLWhlaWdo +dDoxNC4wcHgiPiZuYnNwOzwvcD48dGFibGUgc3R5bGU9ImJvcmRlci1jb2xsYXBzZTogY29sbGFw +c2UiIGNlbGxzcGFjaW5nPSIwIiBjZWxscGFkZGluZz0iMCI+PHRib2R5Pjx0cj48dGQgdmFsaWdu +PSJ0b3AiIHN0eWxlPSJ3aWR0aDogMTUyLjBweDtoZWlnaHQ6IDEzLjBweDtib3JkZXItc3R5bGU6 +IHNvbGlkO2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9y +OiAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQu +MHB4IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPtCk +0LDQvNC40LvQuNGPPC9wPjwvdGQ+PHRkIHZhbGlnbj0idG9wIiBzdHlsZT0id2lkdGg6IDE1MS4w +cHg7aGVpZ2h0OiAxMy4wcHg7Ym9yZGVyLXN0eWxlOiBzb2xpZDtib3JkZXItd2lkdGg6IDEuMHB4 +IDEuMHB4IDEuMHB4IDEuMHB4O2JvcmRlci1jb2xvcjogIzAwMDAwMCAjMDAwMDAwICMwMDAwMDAg +IzAwMDAwMDtwYWRkaW5nOiA0LjBweCA0LjBweCA0LjBweCA0LjBweCI+PHAgc3R5bGU9ImNvbG9y +OiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzIj7QmNC80Y88L3A+PC90ZD48dGQgdmFsaWduPSJ0 +b3AiIHN0eWxlPSJ3aWR0aDogMTUyLjBweDtoZWlnaHQ6IDEzLjBweDtib3JkZXItc3R5bGU6IHNv +bGlkO2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9yOiAj +MDAwMDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQuMHB4 +IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPtCe0YLR +h9C10YHRgtCy0L48L3A+PC90ZD48L3RyPjx0cj48dGQgdmFsaWduPSJ0b3AiIHN0eWxlPSJ3aWR0 +aDogMTUyLjBweDtoZWlnaHQ6IDE0LjBweDtib3JkZXItc3R5bGU6IHNvbGlkO2JvcmRlci13aWR0 +aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9yOiAjMDAwMDAwICMwMDAwMDAg 
+IzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQuMHB4IDQuMHB4Ij48cCBzdHls +ZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPtCY0LLQsNC90L7QsjwvcD48L3Rk +Pjx0ZCB2YWxpZ249InRvcCIgc3R5bGU9IndpZHRoOiAxNTEuMHB4O2hlaWdodDogMTQuMHB4O2Jv +cmRlci1zdHlsZTogc29saWQ7Ym9yZGVyLXdpZHRoOiAxLjBweCAxLjBweCAxLjBweCAxLjBweDti +b3JkZXItY29sb3I6ICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwICMwMDAwMDA7cGFkZGluZzogNC4w +cHggNC4wcHggNC4wcHggNC4wcHgiPjxwIHN0eWxlPSJjb2xvcjojMDAwMDAwOyBmb250OjEyLjBw +eCBUaW1lcyI+0JjQstCw0L08L3A+PC90ZD48dGQgdmFsaWduPSJ0b3AiIHN0eWxlPSJ3aWR0aDog +MTUyLjBweDtoZWlnaHQ6IDE0LjBweDtib3JkZXItc3R5bGU6IHNvbGlkO2JvcmRlci13aWR0aDog +MS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNvbG9yOiAjMDAwMDAwICMwMDAwMDAgIzAw +MDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4IDQuMHB4IDQuMHB4Ij48cCBzdHlsZT0i +Y29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMiPtCY0LLQsNC90L7QstC40Yc8L3A+PC90 +ZD48L3RyPjx0cj48dGQgdmFsaWduPSJ0b3AiIHN0eWxlPSJ3aWR0aDogMTUyLjBweDtoZWlnaHQ6 +IDEzLjBweDtib3JkZXItc3R5bGU6IHNvbGlkO2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4w +cHggMS4wcHg7Ym9yZGVyLWNvbG9yOiAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3Bh +ZGRpbmc6IDQuMHB4IDQuMHB4IDQuMHB4IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsg +Zm9udDoxMi4wcHggVGltZXMiPtCf0LXRgtGA0L7QsjwvcD48L3RkPjx0ZCB2YWxpZ249InRvcCIg +c3R5bGU9IndpZHRoOiAxNTEuMHB4O2hlaWdodDogMTMuMHB4O2JvcmRlci1zdHlsZTogc29saWQ7 +Ym9yZGVyLXdpZHRoOiAxLjBweCAxLjBweCAxLjBweCAxLjBweDtib3JkZXItY29sb3I6ICMwMDAw +MDAgIzAwMDAwMCAjMDAwMDAwICMwMDAwMDA7cGFkZGluZzogNC4wcHggNC4wcHggNC4wcHggNC4w +cHgiPjxwIHN0eWxlPSJjb2xvcjojMDAwMDAwOyBmb250OjEyLjBweCBUaW1lcyI+0J/RkdGC0YA8 +L3A+PC90ZD48dGQgdmFsaWduPSJ0b3AiIHN0eWxlPSJ3aWR0aDogMTUyLjBweDtoZWlnaHQ6IDEz +LjBweDtib3JkZXItc3R5bGU6IHNvbGlkO2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4wcHgg +MS4wcHg7Ym9yZGVyLWNvbG9yOiAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3BhZGRp +bmc6IDQuMHB4IDQuMHB4IDQuMHB4IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsgZm9u +dDoxMi4wcHggVGltZXMiPtCf0LXRgtGA0L7QstC40Yc8L3A+PC90ZD48L3RyPjx0cj48dGQgdmFs 
+aWduPSJ0b3AiIHN0eWxlPSJ3aWR0aDogMTUyLjBweDtoZWlnaHQ6IDEzLjBweDtib3JkZXItc3R5 +bGU6IHNvbGlkO2JvcmRlci13aWR0aDogMS4wcHggMS4wcHggMS4wcHggMS4wcHg7Ym9yZGVyLWNv +bG9yOiAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMCAjMDAwMDAwO3BhZGRpbmc6IDQuMHB4IDQuMHB4 +IDQuMHB4IDQuMHB4Ij48cCBzdHlsZT0iY29sb3I6IzAwMDAwMDsgZm9udDoxMi4wcHggVGltZXMi +PtCh0LjQtNC+0YDQvtCyPC9wPjwvdGQ+PHRkIHZhbGlnbj0idG9wIiBzdHlsZT0id2lkdGg6IDE1 +MS4wcHg7aGVpZ2h0OiAxMy4wcHg7Ym9yZGVyLXN0eWxlOiBzb2xpZDtib3JkZXItd2lkdGg6IDEu +MHB4IDEuMHB4IDEuMHB4IDEuMHB4O2JvcmRlci1jb2xvcjogIzAwMDAwMCAjMDAwMDAwICMwMDAw +MDAgIzAwMDAwMDtwYWRkaW5nOiA0LjBweCA0LjBweCA0LjBweCA0LjBweCI+PHAgc3R5bGU9ImNv +bG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVzIj7QodC40LTQvtGAPC9wPjwvdGQ+PHRkIHZh +bGlnbj0idG9wIiBzdHlsZT0id2lkdGg6IDE1Mi4wcHg7aGVpZ2h0OiAxMy4wcHg7Ym9yZGVyLXN0 +eWxlOiBzb2xpZDtib3JkZXItd2lkdGg6IDEuMHB4IDEuMHB4IDEuMHB4IDEuMHB4O2JvcmRlci1j +b2xvcjogIzAwMDAwMCAjMDAwMDAwICMwMDAwMDAgIzAwMDAwMDtwYWRkaW5nOiA0LjBweCA0LjBw +eCA0LjBweCA0LjBweCI+PHAgc3R5bGU9ImNvbG9yOiMwMDAwMDA7IGZvbnQ6MTIuMHB4IFRpbWVz +Ij7QodC40LTQvtGA0L7QstC40Yc8L3A+PC90ZD48L3RyPjwvdGJvZHk+PC90YWJsZT48L2Rpdj48 +ZGl2PiZuYnNwOzwvZGl2PjwvZGl2PjwvZGl2PjwvZGl2PjwvZGl2PjwvQk9EWT48L0hUTUw+Cg== + +----ALT--fE8cBf3B888BF99ccBe7796dD7741dBb1696256290-- diff --git a/dedoc/api/web/examples/example.html b/dedoc/api/web/examples/example.html new file mode 100644 index 00000000..0a6238e5 --- /dev/null +++ b/dedoc/api/web/examples/example.html @@ -0,0 +1,150 @@ + + + + + + + +

Пример документа

+ + +

Глава 1

+

Какие то определения

+ +

Статья 1

+ Определим определения + +

Текст бывает разных стилей

+ BOGUS TEXT + +
+

Статья 2

+

Дадим пояснения

+ + +
+ + + + + + + + + + + + + + + + + + +
NФамилияИмяОрганизацияТелефонПримечания
1ИвановИванИСП8-800
+ + + + + + + + + + + + + + + + + + + + + + +
ФамилияИмяОтчество
ИвановИванИванович
ПетровПётрПетрович
СидоровСидорСидорович
+ + \ No newline at end of file diff --git a/dedoc/api/static/example.json b/dedoc/api/web/examples/example.json similarity index 100% rename from dedoc/api/static/example.json rename to dedoc/api/web/examples/example.json diff --git a/dedoc/api/web/examples/example.mhtml b/dedoc/api/web/examples/example.mhtml new file mode 100644 index 00000000..5a70113c --- /dev/null +++ b/dedoc/api/web/examples/example.mhtml @@ -0,0 +1,129 @@ +MIME-Version: 1.0 +Content-Type: Multipart/related; boundary="boundary";type=Text/HTML + +--boundary +Content-Type: text/html; charset=utf-8 +Content-Transfer-Encoding: quoted-printable +Content-ID: <1> +Content-Location: file:///c:/temp/3e3c73a2-ec23-476e-bc5f-15640dc10796/input/example.html + + + + + + + + +

=D0=9F=D1=80=D0=B8=D0=BC=D0=B5=D1=80 =D0=B4=D0=BE=D0=BA=D1=83=D0=BC=D0= +=B5=D0=BD=D1=82=D0=B0

+ + +

=D0=93=D0=BB=D0=B0=D0=B2=D0=B0 1

+

=D0=9A=D0=B0=D0=BA=D0=B8=D0=B5 =D1=82=D0=BE =D0=BE=D0=BF=D1=80=D0=B5=D0=B4=D0=B5=D0=BB=D0=B5=D0=BD=D0=B8= +=D1=8F

+ +

=D0=A1=D1=82=D0=B0=D1=82=D1=8C=D1=8F 1

+ =D0=9E=D0=BF=D1=80=D0=B5=D0=B4=D0=B5=D0=BB=D0=B8=D0=BC =D0= +=BE=D0=BF=D1=80=D0=B5=D0=B4=D0=B5=D0=BB=D0=B5=D0=BD=D0=B8=D1=8F <= +/span> + +

=D0=A2=D0=B5=D0=BA=D1=81=D1=82 =D0=B1=D1=8B=D0=B2= +=D0=B0=D0=B5=D1=82 =D1=80=D0=B0=D0=B7=D0=BD=D1=8B=D1=85 = +=D1=81=D1=82=D0=B8=D0=BB=D0=B5=D0=B9

+ BOGUS TEXT + +
+

=D0=A1=D1=82=D0=B0=D1=82=D1=8C=D1=8F 2

+

=D0=94=D0=B0=D0=B4=D0=B8=D0=BC =D0=BF=D0=BE= +=D1=8F=D1=81=D0=BD=D0=B5=D0=BD=D0=B8=D1=8F

+ +
    +
  • 1.2.1. =D0=9F=D0=BE=D1=8F=D1=81=D0=BD=D0=B8=D0=BC =D0=B7=D0=B0 =D0=BD=D0=B5=D0=BF=D0=BE=D0=BD=D1=8F= +=D1=82=D0=BD=D0=BE=D0=B5
  • +
  • 1.2.2. =D0=9F=D0=BE=D1=8F=D1=81=D0=BD=D0=B8=D0= +=BC =D0=B7=D0=B0 =D0=BF=D0=BE=D0=BD=D1=8F=D1=82=D0=BD=D0=BE=D0=B5 +
      +
    1. =D1=8D=D1=82=D0=BE =D0=B4= +=D0=B0=D0=B6=D0=B5 =D0=B5=D0=B6=D1=83 =D0=BF=D0=BE=D0=BD=D1=8F=D1=82=D0=BD= +=D0=BE
    2. +
    3. =D1=8D=D1=82=D0=BE =D0=B5=D0=B6=D1=83 =D0=BD=D0=B5 =D0= +=BF=D0=BE=D0=BD=D1=8F=D1=82=D0=BD=D0=BE
    4. +
    +
  • +
  • 1.2.3.
  • +
+
+ + + + + + + + + + + + + + + + + +
N=D0=A4=D0=B0=D0=BC=D0=B8=D0=BB=D0=B8=D1=8F=D0=98=D0=BC=D1=8F=D0=9E=D1=80=D0=B3=D0=B0=D0=BD=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D1= +=8F=D0=A2=D0=B5=D0=BB=D0=B5=D1=84=D0=BE=D0=BD=D0=9F=D1=80=D0=B8=D0=BC=D0=B5=D1=87=D0=B0=D0=BD=D0=B8=D1=8F +
1=D0=98=D0=B2=D0=B0=D0=BD=D0=BE=D0=B2=D0=98=D0=B2=D0=B0=D0=BD=D0=98=D0=A1=D0=9F8-800
+ + + + + + + + + + + + + + + + + + + + + +
=D0=A4=D0=B0=D0=BC=D0=B8=D0=BB=D0=B8=D1=8F + =D0=98=D0=BC=D1=8F=D0=9E=D1=82=D1=87=D0=B5=D1=81=D1=82=D0=B2=D0=BE
=D0=98=D0=B2=D0=B0=D0=BD=D0=BE=D0=B2=D0=98=D0=B2=D0=B0=D0=BD=D0=98=D0=B2=D0=B0=D0=BD=D0=BE=D0=B2=D0=B8=D1=87
=D0=9F=D0=B5=D1=82=D1=80=D0=BE=D0=B2=D0=9F=D1=91=D1=82=D1=80=D0=9F=D0=B5=D1=82=D1=80=D0=BE=D0=B2=D0=B8=D1=87
=D0=A1=D0=B8=D0=B4=D0=BE=D1=80=D0=BE=D0=B2=D0=A1=D0=B8=D0=B4=D0=BE=D1=80=D0=A1=D0=B8=D0=B4=D0=BE=D1=80=D0=BE=D0=B2=D0=B8=D1=87
+ + + +--boundary-- \ No newline at end of file diff --git a/dedoc/api/static/example.odp b/dedoc/api/web/examples/example.odp similarity index 100% rename from dedoc/api/static/example.odp rename to dedoc/api/web/examples/example.odp diff --git a/dedoc/api/static/example.ods b/dedoc/api/web/examples/example.ods similarity index 100% rename from dedoc/api/static/example.ods rename to dedoc/api/web/examples/example.ods diff --git a/dedoc/api/static/example.odt b/dedoc/api/web/examples/example.odt similarity index 100% rename from dedoc/api/static/example.odt rename to dedoc/api/web/examples/example.odt diff --git a/dedoc/api/web/examples/example.pdf b/dedoc/api/web/examples/example.pdf new file mode 100644 index 00000000..a6aa8711 Binary files /dev/null and b/dedoc/api/web/examples/example.pdf differ diff --git a/dedoc/api/web/examples/example.png b/dedoc/api/web/examples/example.png new file mode 100644 index 00000000..322f7a60 Binary files /dev/null and b/dedoc/api/web/examples/example.png differ diff --git a/dedoc/api/static/example.ppt b/dedoc/api/web/examples/example.ppt similarity index 100% rename from dedoc/api/static/example.ppt rename to dedoc/api/web/examples/example.ppt diff --git a/dedoc/api/static/example.pptx b/dedoc/api/web/examples/example.pptx similarity index 100% rename from dedoc/api/static/example.pptx rename to dedoc/api/web/examples/example.pptx diff --git a/dedoc/api/static/csv_tab.tsv b/dedoc/api/web/examples/example.tsv similarity index 100% rename from dedoc/api/static/csv_tab.tsv rename to dedoc/api/web/examples/example.tsv diff --git a/dedoc/api/web/examples/example.txt b/dedoc/api/web/examples/example.txt new file mode 100644 index 00000000..fda72a0c --- /dev/null +++ b/dedoc/api/web/examples/example.txt @@ -0,0 +1,16 @@ +Пример документа + +Глава 1 +Какие то определения + +Статья 1 +Определим опрделения + +Статья 2 +Дадим пояснения + +1.2.1 Поясним за непонятное +1.2.2. 
Поясним за понятное + а) это даже ежу понятно + б) это ежу не понятно +1.2.3 diff --git a/dedoc/api/static/example.xls b/dedoc/api/web/examples/example.xls similarity index 100% rename from dedoc/api/static/example.xls rename to dedoc/api/web/examples/example.xls diff --git a/dedoc/api/static/example.xlsx b/dedoc/api/web/examples/example.xlsx similarity index 100% rename from dedoc/api/static/example.xlsx rename to dedoc/api/web/examples/example.xlsx diff --git a/dedoc/api/static/favicon.ico b/dedoc/api/web/favicon.ico similarity index 100% rename from dedoc/api/static/favicon.ico rename to dedoc/api/web/favicon.ico diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html new file mode 100644 index 00000000..27b72fdb --- /dev/null +++ b/dedoc/api/web/index.html @@ -0,0 +1,207 @@ + + + + + Dedoc | info page + + + + + + + + +

Dedoc

+ +

Dedoc is an open universal system for converting textual documents of different formats to a unified output representation.

+

See dedoc documentation to get more information about dedoc and its API parameters.

+ +

Parameters configuration

+ +
+ +
+

Type of document structure parsing

+
document_type, structure_type, return_format +
+

+ +

+ +

+ +

+ +

+ +

+
+
+ + +
+

Attachments handling

+
with_attachments, need_content_analysis, recursion_deep_attachments, return_base64, attachments_dir +
+

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+
+
+ + +
+

Tables handling

+
need_pdf_table_analysis, orient_analysis_cells, orient_cell_angle +
+

+ +

+ +

+ +

+ +

+ +

+
+
+ + +
+

PDF handling

+
pdf_with_text_layer, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization +
+

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+ +

+
+
+ +
+

Other formats handling

+
delimiter, encoding, handle_invisible_table +
+

+ +

+ +

+ +

+ +

+ +

+
+
+ +
+
+
+
+
+ +
+ + +

Useful links

+ + + + + \ No newline at end of file diff --git a/dedoc/api/web/supported_formats.html b/dedoc/api/web/supported_formats.html new file mode 100644 index 00000000..872d1b25 --- /dev/null +++ b/dedoc/api/web/supported_formats.html @@ -0,0 +1,160 @@ + + + + + Supported Formats + + + + + + + +

Supported Formats

+ +

PDF and image-like formats

+
    +
  1. + pdf ( + result, + result in html ) +
  2. + +
  3. + djvu ( + result, + result in html ) +
  4. + +
  5. + png ( + result, + result in html ) +
  6. + +
  7. + bmp ( + result, + result in html ) +
  8. +
+ + +

Rich text formats

+
    +
  1. + doc ( + result, + result in html ) +
  2. + +
  3. + docx ( + result, + result in html ) +
  4. + +
  5. + odt ( + result, + result in html ) +
  6. +
+ + +

Table formats

+
    +
  1. + xls ( + result, + result in html ) +
  2. + +
  3. + xlsx ( + result, + result in html ) +
  4. + +
  5. + ods ( + result, + result in html ) +
  6. + +
  7. + csv ( + result, + result in html ) +
  8. + +
  9. + tsv ( + result, + result in html ) +
  10. +
+ + +

Presentation formats

+
    +
  1. + ppt ( + result, + result in html ) +
  2. + +
  3. + pptx ( + result, + result in html ) +
  4. + +
  5. + odp ( + result, + result in html ) +
  6. +
+ + +

HTML-based formats

+
    +
  1. + html ( + result, + result in html ) +
  2. + +
  3. + mhtml ( + result, + result in html ) +
  4. + +
  5. + eml ( + result, + result in html ) +
  6. +
+ + +

Other formats

+
    +
  1. + txt ( + result, + result in html ) +
  2. + +
  3. + json ( + result, + result in html ) +
  4. +
+ +

<- go to the main page

+ + \ No newline at end of file diff --git a/dedoc/api/static/train_dataset/download.html b/dedoc/api/web/train_dataset/download.html similarity index 100% rename from dedoc/api/static/train_dataset/download.html rename to dedoc/api/web/train_dataset/download.html diff --git a/dedoc/api/static/train_dataset/form_input_archive.html b/dedoc/api/web/train_dataset/form_input_archive.html similarity index 68% rename from dedoc/api/static/train_dataset/form_input_archive.html rename to dedoc/api/web/train_dataset/form_input_archive.html index baa474b7..d50f5b73 100644 --- a/dedoc/api/static/train_dataset/form_input_archive.html +++ b/dedoc/api/web/train_dataset/form_input_archive.html @@ -3,15 +3,10 @@ Загрузка файла - - - + + - -
@@ -29,7 +24,9 @@

Распознавание структуры документа

pdf_with_text_layer

@@ -37,18 +34,15 @@

Распознавание структуры документа

- +

@@ -60,7 +54,8 @@

Распознавание структуры документа

Распознавание структуры документа

- +
@@ -95,19 +90,5 @@

Распознавание структуры документа

- - diff --git a/dedoc/api/static/train_dataset/info_labeling_mode.html b/dedoc/api/web/train_dataset/info_labeling_mode.html similarity index 72% rename from dedoc/api/static/train_dataset/info_labeling_mode.html rename to dedoc/api/web/train_dataset/info_labeling_mode.html index 9ef6caa6..1fe3ddfb 100644 --- a/dedoc/api/static/train_dataset/info_labeling_mode.html +++ b/dedoc/api/web/train_dataset/info_labeling_mode.html @@ -3,7 +3,6 @@ Dedoc | prepare label data - @@ -13,7 +12,7 @@

Процесс создания датасетов и обучения классификаторов

Шаг 1 - Формирование заданий для системы разметки

-
    +
    1. Запустите dedoc в режиме разметки путем включения строки labeling_mode=True в ваш конфиг файл config.py.
    2. @@ -23,23 +22,16 @@

      Шаг 1 - Формирование заданий для системы р
    3. Для формирования заданий для внешней системы разметки идем сюда и загружаем подготовленный архив с необходимыми параметрами.
    4. -

+

Шаг 2 - Разметка данных

Разметка подготовленных данных осуществляется с помощью внешней системы разметки

-

Шаг 3 - Обучение классификаторов дедка

+

Шаг 3 - Удаление данных для разметки

- Информация по обучению классификаторов строк и классификатора ориентации изображений - находится тут. -

- -

Шаг 4 - Удаление данных для разметки

-

- Тут можно удалить промежуточные данные, - используемые при создании заданий на разметку. + Тут можно удалить промежуточные данные, используемые при создании заданий на разметку.

diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index 1392277a..37f4f98c 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -62,7 +62,6 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct parameters_copy = copy.deepcopy(parameters) parameters_copy["is_attached"] = True - parameters_copy["attachment"] = attachment parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments) try: diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py index 0079d5c6..ee308f1a 100644 --- a/dedoc/dedoc_manager.py +++ b/dedoc/dedoc_manager.py @@ -4,6 +4,7 @@ import tempfile from typing import Dict, Optional +from dedoc.api.api_args import QueryParameters from dedoc.common.exceptions.dedoc_error import DedocError from dedoc.config import get_config from dedoc.data_structures import ParsedDocument, UnstructuredDocument @@ -54,6 +55,8 @@ def __init__(self, config: Optional[dict] = None, manager_config: Optional[dict] self.attachments_handler = manager_config.get("attachments_handler", None) assert self.attachments_handler is not None, "Attachments handler shouldn't be None" + self.default_parameters = QueryParameters().to_dict() + def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> ParsedDocument: """ Run the whole pipeline of the document processing. 
@@ -63,7 +66,8 @@ def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> :param parameters: any parameters, specify how to parse file (see API parameters documentation for more details) :return: parsed document """ - parameters = {} if parameters is None else parameters + parameters = self.__init_parameters(parameters) + self.logger.info(f"Get file {os.path.basename(file_path)} with parameters {parameters}") try: return self.__parse_no_error_handling(file_path=file_path, parameters=parameters) @@ -129,6 +133,15 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) self.logger.info(f"Finish handle {file_name}") return parsed_document + def __init_parameters(self, parameters: Optional[dict]) -> dict: + parameters = {} if parameters is None else parameters + result_parameters = {} + + for parameter_name, parameter_value in self.default_parameters.items(): + result_parameters[parameter_name] = parameters.get(parameter_name, parameter_value) + + return result_parameters + def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None: save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=os.path.basename(file_path)) shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), os.path.basename(file_path))) diff --git a/dedoc/main.py b/dedoc/main.py index e80ef2e4..30b7f1f1 100644 --- a/dedoc/main.py +++ b/dedoc/main.py @@ -20,7 +20,7 @@ def main() -> None: config = get_config() if config.get("labeling_mode", False): - from api.train_dataset.api_collect_train_dataset import run_special_api # noqa + from api.train_dataset.train_dataset_api import run_special_api # noqa run_special_api() else: main() diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index fa43f173..28ee7775 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ 
b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -47,9 +47,7 @@ def __init__(self, *, config: dict) -> None: """ super().__init__(config=config) self.scew_corrector = SkewCorrector() - self.column_orientation_classifier = ColumnsOrientationClassifier(on_gpu=False, - checkpoint_path=get_config()["resources_path"], - config=config) + self.column_orientation_classifier = ColumnsOrientationClassifier(on_gpu=False, checkpoint_path=get_config()["resources_path"], config=config) self.binarizer = AdaptiveBinarizer() self.ocr = OCRLineExtractor(config=config) self.logger = config.get("logger", logging.getLogger()) diff --git a/tests/unit_tests/test_misc_dedoc_manager.py b/tests/unit_tests/test_misc_dedoc_manager.py index 536d2eaf..d13f2a50 100644 --- a/tests/unit_tests/test_misc_dedoc_manager.py +++ b/tests/unit_tests/test_misc_dedoc_manager.py @@ -14,10 +14,9 @@ class TestDedocManager(TestCase): def test_parse_file(self) -> None: filename = "csv_tab.tsv" - result = self.dedoc_manager.parse(os.path.join(self.path, "csv_tab.tsv")) + result = self.dedoc_manager.parse(os.path.join(self.path, filename)) cells = result.content.tables[0].cells self.assertEqual(filename, result.metadata.file_name) - self.assertEqual(filename, result.metadata.file_name) self.assertLessEqual(["1", "2", "3"], [cell.get_text() for cell in cells[0]]) self.assertLessEqual(["2", "1", "5"], [cell.get_text() for cell in cells[1]]) self.assertLessEqual(["5", "3", "1"], [cell.get_text() for cell in cells[2]])