-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* delete unused files * Delete unused files, refactor html * Refactor query parameters * Fix tests * Refactor train dataset api * Fix style * Change python version in tests * Review fixes
- Loading branch information
1 parent
896a31a
commit 18d80b0
Showing
69 changed files
with
989 additions
and
1,772 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,100 +1,54 @@ | ||
from typing import Any, Optional | ||
|
||
from fastapi import Body | ||
from pydantic import BaseModel | ||
|
||
|
||
class QueryParameters(BaseModel):
    # Request parameters, all carried as optional strings.
    # Every field is declared as Optional[str]; __init__ below replaces missing
    # (falsy) values with string defaults for most fields.
    document_type: Optional[str]
    structure_type: Optional[str]
    return_format: Optional[str]

    with_attachments: Optional[str]
    need_content_analysis: Optional[str]
    recursion_deep_attachments: Optional[str]
    return_base64: Optional[str]
    attachments_dir: Optional[str]

    need_pdf_table_analysis: Optional[str]
    table_type: Optional[str]
    orient_analysis_cells: Optional[str]
    orient_cell_angle: Optional[str]

    pdf_with_text_layer: Optional[str]
    language: Optional[str]
    pages: Optional[str]
    is_one_column_document: Optional[str]
    document_orientation: Optional[str]
    need_header_footer_analysis: Optional[str]
    need_binarization: Optional[str]

    delimiter: Optional[str]
    encoding: Optional[str]
    html_fields: Optional[str]
    handle_invisible_table: Optional[str]

    def __init__(self,
                 # type of document structure parsing
                 document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),  # noqa
                 structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),  # noqa
                 return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug purposes only. Default: 'json'", default=None),  # noqa

                 # attachments handling
                 with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None),  # noqa
                 need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),  # noqa
                 recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),  # noqa
                 return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),  # noqa
                 attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),  # noqa

                 # tables handling
                 need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),  # noqa
                 table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),  # noqa
                 orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None),  # noqa
                 orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None),  # noqa

                 # pdf handling
                 pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),  # noqa
                 language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),  # noqa
                 pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),  # noqa
                 is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None),  # noqa
                 document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),  # noqa
                 need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None),  # noqa
                 need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None),  # noqa

                 # other formats handling
                 delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),  # noqa
                 encoding: Optional[str] = Body(description="a document encoding", default=None),  # noqa
                 html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),  # noqa
                 handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None),  # noqa

                 **data: Any) -> None:  # noqa
        """
        Store the request parameters, substituting string defaults for falsy values.

        Each named argument is declared with a ``Body(..., default=None)`` placeholder.
        After the base model is initialised from ``**data``, every argument is assigned
        to the attribute of the same name; a falsy value (``None`` or ``""``) is replaced
        by the fallback written next to it below. ``attachments_dir``, ``delimiter`` and
        ``encoding`` have no fallback and are stored exactly as received (possibly ``None``).

        :param data: extra keyword arguments forwarded unchanged to ``BaseModel.__init__``
        """
        # NOTE(review): the `or` fallbacks only trigger for falsy values; if this class is
        # instantiated directly (without the framework resolving the Body(...) placeholders)
        # the placeholders themselves would presumably be stored - confirm against callers.
        super().__init__(**data)
        self.document_type: str = document_type or ""
        self.structure_type: str = structure_type or "tree"
        self.return_format: str = return_format or "json"

        self.with_attachments: str = with_attachments or "false"
        self.need_content_analysis: str = need_content_analysis or "false"
        self.recursion_deep_attachments: str = recursion_deep_attachments or "10"
        self.return_base64: str = return_base64 or "false"
        # no fallback: may stay None
        self.attachments_dir: Optional[str] = attachments_dir

        self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true"
        self.table_type: str = table_type or ""
        self.orient_analysis_cells: str = orient_analysis_cells or "false"
        self.orient_cell_angle: str = orient_cell_angle or "90"

        self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby"
        self.language: str = language or "rus+eng"
        self.pages: str = pages or ":"
        self.is_one_column_document: str = is_one_column_document or "auto"
        self.document_orientation: str = document_orientation or "auto"
        self.need_header_footer_analysis: str = need_header_footer_analysis or "false"
        self.need_binarization: str = need_binarization or "false"

        # no fallback: may stay None
        self.delimiter: Optional[str] = delimiter
        self.encoding: Optional[str] = encoding
        self.html_fields: str = html_fields or ""
        self.handle_invisible_table: str = handle_invisible_table or "false"
from dataclasses import asdict, dataclass | ||
from typing import Optional | ||
|
||
from fastapi import Form | ||
|
||
|
||
@dataclass
class QueryParameters:
    # Request parameters declared as form fields; every value is carried as a string.
    # Use to_dict() to obtain them as a plain dictionary.

    # --- type of document structure parsing ---
    document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
    structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
    return_format: str = Form(
        "json",
        enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
        description="Response representation, most types (except json) are used for debug purposes only",
    )

    # --- attachments handling ---
    with_attachments: str = Form("false", enum=["true", "false"], description="Enable attached files extraction")
    need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
    recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
    return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
    attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")

    # --- tables handling ---
    need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
    table_type: str = Form("", description="Pipeline mode for table recognition")
    orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
    orient_cell_angle: str = Form(
        "90",
        enum=["90", "270"],
        description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation',
    )

    # --- pdf handling ---
    pdf_with_text_layer: str = Form(
        "auto_tabby",
        enum=["true", "false", "auto", "auto_tabby", "tabby"],
        description="Extract text from a text layer of PDF or using OCR methods for image-like documents",
    )
    language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
    pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
    is_one_column_document: str = Form(
        "auto",
        enum=["auto", "true", "false"],
        description='One or multiple column document, "auto" - predict number of page columns automatically',
    )
    document_orientation: str = Form(
        "auto",
        enum=["auto", "no_change"],
        description='Orientation of the document pages, "auto" - predict orientation (0, 90, 180, 270 degrees), '
                    '"no_change" - set vertical orientation of the document without using an orientation classifier',
    )
    need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
    need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")

    # --- other formats handling ---
    delimiter: Optional[str] = Form(None, description="Column separator for CSV files")
    encoding: Optional[str] = Form(None, description="Document encoding")
    html_fields: str = Form("", description="List of fields for JSON documents to be parsed as HTML documents")
    handle_invisible_table: str = Form("false", enum=["true", "false"], description="Handle tables without visible borders as tables in HTML")

    def to_dict(self) -> dict:
        """
        Return all parameters as a ``{field name: value}`` dictionary.

        Any value that exposes a ``default`` attribute is replaced by that attribute;
        every other value is returned as is. Presumably this unwraps ``Form(...)``
        placeholders left in place when the class is created without arguments - confirm.
        """
        return {name: getattr(value, "default", value) for name, value in asdict(self).items()}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.