From 4921d67747e6939c4aac715d0c850175e2e44957 Mon Sep 17 00:00:00 2001 From: Nikita Shevtsov <61932814+Travvy88@users.noreply.github.com> Date: Fri, 9 Aug 2024 11:44:08 +0300 Subject: [PATCH] TLDR-749 fast auto textual layer detection (#481) Co-authored-by: Nikita Shevtsov Co-authored-by: Nasty --- dedoc/api/api_args.py | 2 ++ dedoc/api/web/index.html | 4 ++++ .../pdf_auto_reader/pdf_auto_reader.py | 1 + .../pdf_auto_reader/txtlayer_detector.py | 9 +++++++-- docs/source/dedoc_api_usage/api.rst | 9 +++++++++ docs/source/parameters/pdf_handling.rst | 12 +++++++++++- .../test_api_format_pdf_auto_text_layer.py | 19 +++++++++++++++++++ 7 files changed, 53 insertions(+), 3 deletions(-) diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index c77ec19a..8f3e1415 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -28,6 +28,8 @@ class QueryParameters: # pdf handling pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], description="Extract text from a text layer of PDF or using OCR methods for image-like documents") + fast_textual_layer_detection: str = Form("false", enum=["true", "false"], + description="Use non-ML solution to detect textual layer. Much faster but less accurate.") language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')") pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right') is_one_column_document: str = Form("auto", enum=["auto", "true", "false"], diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index d98a9161..423dbcfe 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -128,6 +128,10 @@

PDF handling

+

+ +

+