Skip to content

Commit

Permalink
Refactor train dataset api
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Oct 4, 2023
1 parent 8a99846 commit f020935
Show file tree
Hide file tree
Showing 9 changed files with 63 additions and 171 deletions.
49 changes: 24 additions & 25 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,44 +7,43 @@
@dataclass
class QueryParameters:
# type of document structure parsing
document_type: Optional[str] = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
structure_type: Optional[str] = Form("tree", enum=["linear", "tree"], description="Output structure type")
document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
description="Response representation, most types (except json) are used for debug purposes only")

# attachments handling
with_attachments: Optional[str] = Form("false", enum=["true", "false"], description="Enable attached files extraction")
need_content_analysis: Optional[str] = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
recursion_deep_attachments: Optional[str] = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
return_base64: Optional[str] = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
with_attachments: str = Form("false", enum=["true", "false"], description="Enable attached files extraction")
need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")

# tables handling
need_pdf_table_analysis: Optional[str] = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
table_type: Optional[str] = Form("", description="Pipeline mode for table recognition")
orient_analysis_cells: Optional[str] = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
orient_cell_angle: Optional[str] = Form("90", enum=["90", "270"],
description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
table_type: str = Form("", description="Pipeline mode for table recognition")
orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
orient_cell_angle: str = Form("90", enum=["90", "270"],
description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')

# pdf handling
pdf_with_text_layer: Optional[str] = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
language: Optional[str] = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
pages: Optional[str] = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: Optional[str] = Form("auto", enum=["auto", "true", "false"],
description='One or multiple column document, "auto" - predict number of page columns automatically')
document_orientation: Optional[str] = Form("auto", enum=["auto", "no_change"],
description='Orientation of the document pages, "auto" - predict orientation (0, 90, 180, 270 degrees), '
'"no_change" - set vertical orientation of the document without using an orientation classifier')
need_header_footer_analysis: Optional[str] = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
need_binarization: Optional[str] = Form("false", enum=["true", "false"],
description="Binarize document pages (for images or PDF without a textual layer)")
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
description='One or multiple column document, "auto" - predict number of page columns automatically')
document_orientation: str = Form("auto", enum=["auto", "no_change"],
description='Orientation of the document pages, "auto" - predict orientation (0, 90, 180, 270 degrees), '
'"no_change" - set vertical orientation of the document without using an orientation classifier')
need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")

# other formats handling
delimiter: Optional[str] = Form(None, description="Column separator for CSV files")
encoding: Optional[str] = Form(None, description="Document encoding")
html_fields: Optional[str] = Form("", description="List of fields for JSON documents to be parsed as HTML documents")
handle_invisible_table: Optional[str] = Form("false", enum=["true", "false"], description="Handle tables without visible borders as tables in HTML")
html_fields: str = Form("", description="List of fields for JSON documents to be parsed as HTML documents")
handle_invisible_table: str = Form("false", enum=["true", "false"], description="Handle tables without visible borders as tables in HTML")

def to_dict(self) -> dict:
parameters = {}
Expand Down
30 changes: 0 additions & 30 deletions dedoc/api/train_dataset/api_args.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import dataclasses
import logging
import os
import shutil
from dataclasses import dataclass
from typing import Optional

import uvicorn
from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
from fastapi import Depends, FastAPI, File, Form, Request, Response, UploadFile
from fastapi.staticfiles import StaticFiles
from starlette.responses import FileResponse, HTMLResponse
from starlette.templating import Jinja2Templates

from dedoc.api.api_args import QueryParameters
from dedoc.api.dedoc_api import _get_static_file_path
from dedoc.api.train_dataset.api_args import TrainDatasetParameters
from dedoc.api.train_dataset.async_archive_handler import AsyncHandler
from dedoc.config import get_config
from dedoc.dedoc_manager import DedocManager
Expand All @@ -21,6 +24,18 @@
from dedoc.train_dataset.train_dataset_utils import get_path_original_documents
from dedoc.utils.utils import calculate_file_hash


@dataclass
class TrainDatasetParameters(QueryParameters):
    """
    Form parameters of the train dataset API.

    Extends QueryParameters with two extra form fields that describe the labeling tasks to create.
    Both fields always have a string default, so they are annotated as plain ``str``
    (``Optional`` is kept only for fields whose default is ``None``).
    """
    # kind of labeling task to create; the accepted values are listed in `enum`
    type_of_task: str = Form("law_classifier",
                             enum=[
                                 "law_classifier", "tz_classifier", "diploma_classifier", "header_classifier", "paragraph_classifier",
                                 "tables_classifier"
                             ],
                             description="Type of the task to create")
    # maximum number of images in one task; received as a string form value ("250" by default)
    task_size: str = Form("250", description="Maximum number of images in one task")


config = get_config()
PORT = config["api_port"]

Expand Down Expand Up @@ -161,7 +176,7 @@ def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParam
Run the whole pipeline of task making.
"""
clear()
parameters = query_params.dict(by_alias=True)
parameters = dataclasses.asdict(query_params)
uid = handler.handle(file=file, parameters=parameters)
return HTMLResponse(f'Successfully handle file. UID=<p><a href="/get_result_archive/?uid={uid}">get_result_archive/?uid={uid}</a></p>', status_code=201)

Expand All @@ -186,11 +201,6 @@ def get_result_archive(request: Request, uid: str) -> Response:
return HTMLResponse(response, status_code=202)


@app.get("/info_classifiers")
def get_classifiers_info() -> Response:
    """
    Return the static page "train_dataset/refit_classifier.html" located under `static_path` as a file response.
    """
    page_path = os.path.join(static_path, "train_dataset/refit_classifier.html")
    return FileResponse(page_path)


@app.get("/static_file")
def get_static_file(request: Request) -> Response:
path = _get_static_file_path(request)
Expand Down
32 changes: 9 additions & 23 deletions dedoc/api/web/train_dataset/form_input_archive.html
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,25 @@ <h2>Распознавание структуры документа</h2>
<select name="pdf_with_text_layer">
<option value="true">true</option>
<option value="false">false</option>
<option value="auto" selected>auto</option>
<option value="auto">auto</option>
<option value="auto_tabby" selected>auto_tabby</option>
<option value="tabby">tabby</option>
</select> pdf_with_text_layer
</label>
</p>

<p>
<label id="language-2">
<select name="language">
<option value="rus" selected>rus</option>
<option value="rus">rus</option>
<option value="eng">eng</option>
<option value="rus+eng">rus+eng</option>
<option value="rus+eng" selected>rus+eng</option>
</select> language
</label>
</p>

<p>
<label>
<input type="checkbox" name="need_header_footer_analysis" value=True>
need_header_footer_analysis
</label>
<label><input name="need_header_footer_analysis" type="checkbox" value="true"> need_header_footer_analysis</label>
</p>

<p>
Expand All @@ -55,7 +54,8 @@ <h2>Распознавание структуры документа</h2>
<p>
<label>
<select name="document_type">
<option value="law" selected>law</option>
<option value="" selected>other</option>
<option value="law">law</option>
<option value="tz">tz</option>
<option value="diploma">diploma</option>
</select> document_type
Expand All @@ -77,7 +77,7 @@ <h2>Распознавание структуры документа</h2>

<div class="row">
<div class="col-md-4">
<input type=file name=file class="btn btn-default" data-buttonText="Выберите файл" id="select-file-box" onchange="UpdateVisibility()">
<input type=file name=file class="btn btn-default" data-buttonText="Выберите файл">
</div>

<div class="col-md-2">
Expand All @@ -90,19 +90,5 @@ <h2>Распознавание структуры документа</h2>
</div>
</div>
</div>

<script>
// Page elements read or updated by UpdateVisibility().
let htmlFieldsBox = document.getElementById("html-fields");
let attachmentsBox = document.getElementById("with-attachments-box");
let fileBox = document.getElementById("select-file-box");

// The html-fields element is hidden until UpdateVisibility() decides otherwise.
htmlFieldsBox.style.display = "none";

/**
 * Show the html-fields element only when the with-attachments box is checked
 * and the selected file name ends with ".json"; hide it in every other case.
 */
function UpdateVisibility() {
    const attachmentsEnabled = attachmentsBox.checked;
    const jsonSelected = fileBox.value.endsWith(".json");

    if (jsonSelected && attachmentsEnabled) {
        htmlFieldsBox.style.display = "block";
    } else {
        htmlFieldsBox.style.display = "none";
    }
}

</script>
</body>
</html>
15 changes: 4 additions & 11 deletions dedoc/api/web/train_dataset/info_labeling_mode.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
<h1> Процесс создания датасетов и обучения классификаторов </h1>

<h3> Шаг 1 - Формирование заданий для системы разметки </h3>
<ul>
<ol>
<li>
Запустите dedoc в режиме разметки путем включения строки <i>labeling_mode=True</i> в ваш конфиг файл <i>config.py</i>.
</li>
Expand All @@ -22,23 +22,16 @@ <h3> Шаг 1 - Формирование заданий для системы р
<li>
Для формирования заданий для внешней системы разметки идем <a href="/handle_archive" target="_blank" rel="noopener">сюда</a> и загружаем подготовленный архив с необходимыми параметрами.
</li>
</ul>
</ol>

<h3> Шаг 2 - Разметка данных </h3>
<p>
Разметка подготовленных данных осуществляется с помощью <a href=https://github.com/dronperminov/ImageClassifier>внешней системы разметки</a>
</p>

<h3> Шаг 3 - Обучение классификаторов дедка </h3>
<h3> Шаг 3 - Удаление данных для разметки </h3>
<p>
Информация по обучению классификаторов строк и классификатора ориентации изображений
находится <a href="/info_classifiers" target="_blank" rel="noopener">тут</a>.
</p>

<h3> Шаг 4 - Удаление данных для разметки </h3>
<p>
<a href="/clear" target="_blank" rel="noopener">Тут</a> можно удалить промежуточные данные,
используемые при создании заданий на разметку.
<a href="/clear" target="_blank" rel="noopener">Тут</a> можно удалить промежуточные данные, используемые при создании заданий на разметку.
</p>
</div>
</body>
Expand Down
66 changes: 0 additions & 66 deletions dedoc/api/web/train_dataset/refit_classifier.html

This file was deleted.

1 change: 1 addition & 0 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
continue

parameters_copy = copy.deepcopy(parameters)
parameters_copy["is_attached"] = True
parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments)

try:
Expand Down
Loading

0 comments on commit f020935

Please sign in to comment.