Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-481 html refactoring #344

Merged
merged 8 commits into from
Oct 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test_on_push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: '3.8'
python-version: '3.9'
- name: Run lint
run: |
python3 -m pip install --upgrade pip
Expand Down
154 changes: 54 additions & 100 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
@@ -1,100 +1,54 @@
from typing import Any, Optional

from fastapi import Body
from pydantic import BaseModel


class QueryParameters(BaseModel):
    """
    Parsing parameters accepted by the API, all of them are transferred as optional strings.

    Every parameter is declared as a request body field.
    If a parameter is missing (or empty) in the request, `__init__` replaces it with its default value,
    except for `attachments_dir`, `delimiter` and `encoding`, which are stored as they were received.
    """
    document_type: Optional[str]
    structure_type: Optional[str]
    return_format: Optional[str]

    with_attachments: Optional[str]
    need_content_analysis: Optional[str]
    recursion_deep_attachments: Optional[str]
    return_base64: Optional[str]
    attachments_dir: Optional[str]

    need_pdf_table_analysis: Optional[str]
    table_type: Optional[str]
    orient_analysis_cells: Optional[str]
    orient_cell_angle: Optional[str]

    pdf_with_text_layer: Optional[str]
    language: Optional[str]
    pages: Optional[str]
    is_one_column_document: Optional[str]
    document_orientation: Optional[str]
    need_header_footer_analysis: Optional[str]
    need_binarization: Optional[str]

    delimiter: Optional[str]
    encoding: Optional[str]
    html_fields: Optional[str]
    handle_invisible_table: Optional[str]

    def __init__(self,
                 # type of document structure parsing
                 document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),  # noqa
                 structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),  # noqa
                 return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug purposes only. Default: 'json'", default=None),  # noqa

                 # attachments handling
                 with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None),  # noqa
                 need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),  # noqa
                 recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),  # noqa
                 return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),  # noqa
                 attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),  # noqa

                 # tables handling
                 need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),  # noqa
                 table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),  # noqa
                 orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None),  # noqa
                 orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise). Default: '90'", default=None),  # noqa

                 # pdf handling
                 pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),  # noqa
                 language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),  # noqa
                 pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),  # noqa
                 is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None),  # noqa
                 document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),  # noqa
                 need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None),  # noqa
                 need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None),  # noqa

                 # other formats handling
                 delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),  # noqa
                 encoding: Optional[str] = Body(description="a document encoding", default=None),  # noqa
                 html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),  # noqa
                 handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None),  # noqa

                 **data: Any) -> None:  # noqa
        """
        Store the received parameters, falling back to the documented default for every falsy value.

        :param data: extra keyword arguments, forwarded unchanged to the pydantic `BaseModel` constructor
        """
        super().__init__(**data)

        # `value or default` also replaces an explicitly sent empty string with the default
        self.document_type: str = document_type or ""
        self.structure_type: str = structure_type or "tree"
        self.return_format: str = return_format or "json"

        self.with_attachments: str = with_attachments or "false"
        self.need_content_analysis: str = need_content_analysis or "false"
        self.recursion_deep_attachments: str = recursion_deep_attachments or "10"
        self.return_base64: str = return_base64 or "false"
        # no default: stays None when the client did not send it
        self.attachments_dir: str = attachments_dir

        self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true"
        self.table_type: str = table_type or ""
        self.orient_analysis_cells: str = orient_analysis_cells or "false"
        self.orient_cell_angle: str = orient_cell_angle or "90"

        self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby"
        self.language: str = language or "rus+eng"
        self.pages: str = pages or ":"
        self.is_one_column_document: str = is_one_column_document or "auto"
        self.document_orientation: str = document_orientation or "auto"
        self.need_header_footer_analysis: str = need_header_footer_analysis or "false"
        self.need_binarization: str = need_binarization or "false"

        # no defaults: both stay None when the client did not send them
        self.delimiter: str = delimiter
        self.encoding: str = encoding
        self.html_fields: str = html_fields or ""
        self.handle_invisible_table: str = handle_invisible_table or "false"
from dataclasses import asdict, dataclass
from typing import Optional

from fastapi import Form


@dataclass
class QueryParameters:
    """
    Parsing parameters of the API, each one is declared as a form field with a string default.

    The `enum` and `description` arguments are passed to `Form` as given below.
    """

    # --- document structure parsing ---
    document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
    structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
    return_format: str = Form(
        "json",
        enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
        description="Response representation, most types (except json) are used for debug purposes only",
    )

    # --- attached files ---
    with_attachments: str = Form("false", enum=["true", "false"], description="Enable attached files extraction")
    need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
    recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
    return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
    attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")

    # --- tables ---
    need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
    table_type: str = Form("", description="Pipeline mode for table recognition")
    orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
    orient_cell_angle: str = Form(
        "90",
        enum=["90", "270"],
        description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation',
    )

    # --- PDF and images ---
    pdf_with_text_layer: str = Form(
        "auto_tabby",
        enum=["true", "false", "auto", "auto_tabby", "tabby"],
        description="Extract text from a text layer of PDF or using OCR methods for image-like documents",
    )
    language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
    pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
    is_one_column_document: str = Form(
        "auto",
        enum=["auto", "true", "false"],
        description='One or multiple column document, "auto" - predict number of page columns automatically',
    )
    document_orientation: str = Form(
        "auto",
        enum=["auto", "no_change"],
        description='Orientation of the document pages, "auto" - predict orientation (0, 90, 180, 270 degrees), '
                    '"no_change" - set vertical orientation of the document without using an orientation classifier',
    )
    need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
    need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")

    # --- other formats ---
    delimiter: Optional[str] = Form(None, description="Column separator for CSV files")
    encoding: Optional[str] = Form(None, description="Document encoding")
    html_fields: str = Form("", description="List of fields for JSON documents to be parsed as HTML documents")
    handle_invisible_table: str = Form("false", enum=["true", "false"], description="Handle tables without visible borders as tables in HTML")

    def to_dict(self) -> dict:
        """
        Convert the parameters to a plain dictionary `{parameter name: value}`.

        A value that has a `default` attribute is replaced by that attribute, other values are kept as is.
        NOTE(review): presumably this unwraps the `Form(...)` objects left in the fields when the class
        is instantiated directly instead of being filled in by FastAPI - confirm.
        """
        return {name: getattr(value, "default", value) for name, value in asdict(self).items()}
43 changes: 27 additions & 16 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import dataclasses
import importlib
import json
import os
import tempfile
from typing import Optional

import uvicorn
from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
Expand All @@ -19,11 +22,11 @@

config = get_config()
PORT = config["api_port"]
static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static/")
static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "web")
static_files_dirs = config.get("static_files_dirs")

app = FastAPI()
app.mount("/static", StaticFiles(directory=config.get("static_path", static_path)), name="static")
app.mount("/web", StaticFiles(directory=config.get("static_path", static_path)), name="web")

module_api_args = importlib.import_module(config["import_path_init_api_args"])
logger = config["logger"]
Expand All @@ -36,14 +39,12 @@ def get_info() -> Response:
Root URL "/" is need start with simple Flask before rest-plus. API otherwise you will get 404 Error.
It is bug of rest-plus lib.
"""
return FileResponse(os.path.join(static_path, "html_eng/info.html"))
return FileResponse(os.path.join(static_path, "index.html"))


@app.get("/static_file")
def get_static_file(request: Request) -> Response:
path = _get_static_file_path(request)
# TODO check as_attachment
as_attachment = request.query_params.get("as_attachment") == "true" # noqa
return FileResponse(path)


Expand All @@ -61,35 +62,45 @@ def _get_static_file_path(request: Request) -> str:

@app.post("/upload")
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa
parameters = query_params.dict(by_alias=True)

parameters = dataclasses.asdict(query_params)
dronperminov marked this conversation as resolved.
Show resolved Hide resolved
if not file or file.filename == "":
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__)
# check if the post request_post has the file part

logger.info(f"Get file {file.filename} with parameters {parameters}")
with tempfile.TemporaryDirectory() as tmpdir:
file_path = save_upload_file(file, tmpdir)
document_tree = manager.parse(file_path, parameters=dict(parameters))

return_format = str(parameters.get("return_format", "json")).lower()
if return_format == "html":
html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)
return HTMLResponse(content=html_content, status_code=200)
return HTMLResponse(content=html_content)
elif return_format == "plain_text":
txt_content = json2txt(paragraph=document_tree.content.structure)
return PlainTextResponse(content=txt_content, status_code=200)
return PlainTextResponse(content=txt_content)
elif return_format == "tree":
html_content = json2tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content, status_code=200)
return HTMLResponse(content=html_content)
elif return_format == "ujson":
return UJSONResponse(content=document_tree.to_dict(), status_code=200)
elif str(parameters.get("return_format", "json")).lower() == "collapsed_tree":
return UJSONResponse(content=document_tree.to_dict())
elif return_format == "collapsed_tree":
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content, status_code=200)
return HTMLResponse(content=html_content)
elif return_format == "pretty_json":
return PlainTextResponse(content=json.dumps(document_tree.to_dict(), ensure_ascii=False, indent=2))
else:
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
return ORJSONResponse(content=document_tree.to_dict(), status_code=200)
return ORJSONResponse(content=document_tree.to_dict())


@app.get("/upload_example")
async def upload_example(file_name: str, return_format: Optional[str] = None) -> Response:
    """
    Parse one of the example documents stored in the "examples" subdirectory of the static directory.

    :param file_name: name of the example file, relative to the examples directory
    :param return_format: "html" to get the HTML representation, any other value (or None) gives JSON
    :return: HTMLResponse when return_format is "html", ORJSONResponse with the document dictionary otherwise
    :raises MissingFileError: if file_name resolves to a path outside the examples directory
    """
    examples_dir = os.path.join(static_path, "examples")
    file_path = os.path.join(examples_dir, file_name)

    # file_name comes from the query string: "../" segments or an absolute path would make os.path.join
    # point at an arbitrary file on the server, so compare the resolved paths before parsing anything
    examples_root = os.path.join(os.path.realpath(examples_dir), "")  # the trailing separator prevents prefix collisions
    if not os.path.realpath(file_path).startswith(examples_root):
        # NOTE(review): raised the same way as in upload(); presumably handled by the DedocError exception handler - confirm
        raise MissingFileError(f"Error: example file {file_name} not found", version=dedoc.__version__)

    parameters = {} if return_format is None else {"return_format": return_format}
    document_tree = manager.parse(file_path, parameters=parameters)

    if return_format == "html":
        return HTMLResponse(content=json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0))
    return ORJSONResponse(content=document_tree.to_dict())


@app.exception_handler(DedocError)
Expand Down
11 changes: 0 additions & 11 deletions dedoc/api/static/books_2.csv

This file was deleted.

11 changes: 0 additions & 11 deletions dedoc/api/static/components.js

This file was deleted.

3 changes: 0 additions & 3 deletions dedoc/api/static/csv_semicolon.csv

This file was deleted.

Binary file removed dedoc/api/static/example.zip
Binary file not shown.
Binary file removed dedoc/api/static/example_with_images.xls
Binary file not shown.
Binary file removed dedoc/api/static/example_with_images.xlsx
Binary file not shown.
37 changes: 0 additions & 37 deletions dedoc/api/static/exampletable.json

This file was deleted.

11 changes: 0 additions & 11 deletions dedoc/api/static/html_eng/code_example.html

This file was deleted.

13 changes: 0 additions & 13 deletions dedoc/api/static/html_eng/errors.html

This file was deleted.

Loading
Loading