Refactor train dataset api

ispras · Oct 4, 2023 · f020935 · f020935
1 parent 8a99846
commit f020935
Show file tree

Hide file tree

Showing 9 changed files with 63 additions and 171 deletions.
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -7,44 +7,43 @@
 @dataclass
 class QueryParameters:
     # type of document structure parsing
-    document_type: Optional[str] = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
-    structure_type: Optional[str] = Form("tree", enum=["linear", "tree"], description="Output structure type")
+    document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
+    structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
     return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
                               description="Response representation, most types (except json) are used for debug purposes only")
 
     # attachments handling
-    with_attachments: Optional[str] = Form("false", enum=["true", "false"], description="Enable attached files extraction")
-    need_content_analysis: Optional[str] = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
-    recursion_deep_attachments: Optional[str] = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
-    return_base64: Optional[str] = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
+    with_attachments: str = Form("false", enum=["true", "false"], description="Enable attached files extraction")
+    need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
+    recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
+    return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
     attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")
 
     # tables handling
-    need_pdf_table_analysis: Optional[str] = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
-    table_type: Optional[str] = Form("", description="Pipeline mode for table recognition")
-    orient_analysis_cells: Optional[str] = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
-    orient_cell_angle: Optional[str] = Form("90", enum=["90", "270"],
-                                            description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
+    need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
+    table_type: str = Form("", description="Pipeline mode for table recognition")
+    orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
+    orient_cell_angle: str = Form("90", enum=["90", "270"],
+                                  description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
 
     # pdf handling
-    pdf_with_text_layer: Optional[str] = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
-                                              description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
-    language: Optional[str] = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
-    pages: Optional[str] = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
-    is_one_column_document: Optional[str] = Form("auto", enum=["auto", "true", "false"],
-                                                 description='One or multiple column document, "auto" - predict number of page columns automatically')
-    document_orientation: Optional[str] = Form("auto", enum=["auto", "no_change"],
-                                               description='Orientation of the document pages, "auto" - predict orientation (0, 90, 180, 270 degrees), '
-                                                           '"no_change" - set vertical orientation of the document without using an orientation classifier')
-    need_header_footer_analysis: Optional[str] = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
-    need_binarization: Optional[str] = Form("false", enum=["true", "false"],
-                                            description="Binarize document pages (for images or PDF without a textual layer)")
+    pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
+                                    description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
+    language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
+    pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
+    is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
+                                       description='One or multiple column document, "auto" - predict number of page columns automatically')
+    document_orientation: str = Form("auto", enum=["auto", "no_change"],
+                                     description='Orientation of the document pages, "auto" - predict orientation (0, 90, 180, 270 degrees), '
+                                                 '"no_change" - set vertical orientation of the document without using an orientation classifier')
+    need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
+    need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")
 
     # other formats handling
     delimiter: Optional[str] = Form(None, description="Column separator for CSV files")
     encoding: Optional[str] = Form(None, description="Document encoding")
-    html_fields: Optional[str] = Form("", description="List of fields for JSON documents to be parsed as HTML documents")
-    handle_invisible_table: Optional[str] = Form("false", enum=["true", "false"], description="Handle tables without visible borders as tables in HTML")
+    html_fields: str = Form("", description="List of fields for JSON documents to be parsed as HTML documents")
+    handle_invisible_table: str = Form("false", enum=["true", "false"], description="Handle tables without visible borders as tables in HTML")
 
     def to_dict(self) -> dict:
         parameters = {}

diff --git a/dedoc/api/train_dataset/api_args.py b/dedoc/api/train_dataset/api_args.py
diff --git a/dedoc/api/train_dataset/api_collect_train_dataset.py b/dedoc/api/train_dataset/train_dataset_api.py
@@ -1,15 +1,18 @@
+import dataclasses
 import logging
 import os
 import shutil
+from dataclasses import dataclass
+from typing import Optional
 
 import uvicorn
-from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
+from fastapi import Depends, FastAPI, File, Form, Request, Response, UploadFile
 from fastapi.staticfiles import StaticFiles
 from starlette.responses import FileResponse, HTMLResponse
 from starlette.templating import Jinja2Templates
 
+from dedoc.api.api_args import QueryParameters
 from dedoc.api.dedoc_api import _get_static_file_path
-from dedoc.api.train_dataset.api_args import TrainDatasetParameters
 from dedoc.api.train_dataset.async_archive_handler import AsyncHandler
 from dedoc.config import get_config
 from dedoc.dedoc_manager import DedocManager
@@ -21,6 +24,18 @@
 from dedoc.train_dataset.train_dataset_utils import get_path_original_documents
 from dedoc.utils.utils import calculate_file_hash
 
+
+@dataclass
+class TrainDatasetParameters(QueryParameters):
+    type_of_task: Optional[str] = Form("law_classifier",
+                                       enum=[
+                                           "law_classifier", "tz_classifier", "diploma_classifier", "header_classifier", "paragraph_classifier",
+                                           "tables_classifier"
+                                       ],
+                                       description="Type of the task to create")
+    task_size: Optional[str] = Form("250", description="Maximum number of images in one task")
+
+
 config = get_config()
 PORT = config["api_port"]
 
@@ -161,7 +176,7 @@ def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParam
     Run the whole pipeline of task making.
     """
     clear()
-    parameters = query_params.dict(by_alias=True)
+    parameters = dataclasses.asdict(query_params)
     uid = handler.handle(file=file, parameters=parameters)
     return HTMLResponse(f'Successfully handle file. UID=<p><a href="/get_result_archive/?uid={uid}">get_result_archive/?uid={uid}</a></p>', status_code=201)
 
@@ -186,11 +201,6 @@ def get_result_archive(request: Request, uid: str) -> Response:
         return HTMLResponse(response, status_code=202)
 
 
-@app.get("/info_classifiers")
-def get_classifiers_info() -> Response:
-    return FileResponse(os.path.join(static_path, "train_dataset/refit_classifier.html"))
-
-
 @app.get("/static_file")
 def get_static_file(request: Request) -> Response:
     path = _get_static_file_path(request)

diff --git a/dedoc/api/web/train_dataset/form_input_archive.html b/dedoc/api/web/train_dataset/form_input_archive.html
@@ -24,26 +24,25 @@ <h2>Распознавание структуры документа</h2>
                             <select name="pdf_with_text_layer">
                                 <option value="true">true</option>
                                 <option value="false">false</option>
-                                <option value="auto" selected>auto</option>
+                                <option value="auto">auto</option>
+                                <option value="auto_tabby" selected>auto_tabby</option>
+                                <option value="tabby">tabby</option>
                             </select> pdf_with_text_layer
                         </label>
                     </p>
 
                     <p>
                         <label id="language-2">
                             <select name="language">
-                                <option value="rus" selected>rus</option>
+                                <option value="rus">rus</option>
                                 <option value="eng">eng</option>
-                                <option value="rus+eng">rus+eng</option>
+                                <option value="rus+eng" selected>rus+eng</option>
                             </select> language
                         </label>
                     </p>
 
                     <p>
-                        <label>
-                            <input type="checkbox" name="need_header_footer_analysis" value=True>
-                            need_header_footer_analysis
-                        </label>
+                        <label><input name="need_header_footer_analysis" type="checkbox" value="true"> need_header_footer_analysis</label>
                     </p>
 
                     <p>
@@ -55,7 +54,8 @@ <h2>Распознавание структуры документа</h2>
                     <p>
                         <label>
                             <select name="document_type">
-                                <option value="law" selected>law</option>
+                                <option value="" selected>other</option>
+                                <option value="law">law</option>
                                 <option value="tz">tz</option>
                                 <option value="diploma">diploma</option>
                             </select> document_type
@@ -77,7 +77,7 @@ <h2>Распознавание структуры документа</h2>
 
                     <div class="row">
                         <div class="col-md-4">
-                            <input type=file name=file class="btn btn-default" data-buttonText="Выберите файл" id="select-file-box" onchange="UpdateVisibility()">
+                            <input type=file name=file class="btn btn-default" data-buttonText="Выберите файл">
                         </div>
 
                         <div class="col-md-2">
@@ -90,19 +90,5 @@ <h2>Распознавание структуры документа</h2>
             </div>
         </div>
     </div>
-
-    <script>
-        let htmlFieldsBox = document.getElementById("html-fields")
-        let attachmentsBox = document.getElementById("with-attachments-box")
-        let fileBox = document.getElementById("select-file-box")
-        htmlFieldsBox.style.display = "none"
-
-        function UpdateVisibility() {
-            let withAttachments = attachmentsBox.checked
-            let isJson = fileBox.value.endsWith(".json")
-            htmlFieldsBox.style.display = isJson && withAttachments ? "block" : "none"
-        }
-
-    </script>
 </body>
 </html>
diff --git a/dedoc/api/web/train_dataset/info_labeling_mode.html b/dedoc/api/web/train_dataset/info_labeling_mode.html
@@ -12,7 +12,7 @@
         <h1> Процесс создания датасетов и обучения классификаторов </h1>
 
             <h3> Шаг 1 - Формирование заданий для системы разметки </h3>
-                <ul>
+                <ol>
                     <li>
                         Запустите dedoc в режиме разметки путем включения строки <i>labeling_mode=True</i> в ваш конфиг файл <i>config.py</i>.
                     </li>
@@ -22,23 +22,16 @@ <h3> Шаг 1 - Формирование заданий для системы р
                     <li>
                         Для формирования заданий для внешней системы разметки идем <a href="/handle_archive" target="_blank" rel="noopener">сюда</a> и загружаем подготовленный архив с необходимыми параметрами.
                     </li>
-                </ul>
+                </ol>
 
             <h3> Шаг 2 - Разметка данных </h3>
                 <p>
                     Разметка подготовленных данных осуществляется с помощью <a href=https://github.com/dronperminov/ImageClassifier>внешней системы разметки</a>
                 </p>
 
-            <h3> Шаг 3 - Обучение классификаторов дедка </h3>
+            <h3> Шаг 3 - Удаление данных для разметки </h3>
                 <p>
-                    Информация по обучению классификаторов строк и классификатора ориентации изображений
-                    находится <a href="/info_classifiers" target="_blank" rel="noopener">тут</a>.
-                </p>
-
-            <h3> Шаг 4 - Удаление данных для разметки </h3>
-                <p>
-                    <a href="/clear" target="_blank" rel="noopener">Тут</a> можно удалить промежуточные данные,
-                    используемые при создании заданий на разметку.
+                    <a href="/clear" target="_blank" rel="noopener">Тут</a> можно удалить промежуточные данные, используемые при создании заданий на разметку.
                 </p>
     </div>
 </body>

diff --git a/dedoc/api/web/train_dataset/refit_classifier.html b/dedoc/api/web/train_dataset/refit_classifier.html
diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py
@@ -61,6 +61,7 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
                 continue
 
             parameters_copy = copy.deepcopy(parameters)
+            parameters_copy["is_attached"] = True
             parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments)
 
             try: