From 4921d67747e6939c4aac715d0c850175e2e44957 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov <61932814+Travvy88@users.noreply.github.com>
Date: Fri, 9 Aug 2024 11:44:08 +0300
Subject: [PATCH] TLDR-749 fast auto textual layer detection (#481)
Co-authored-by: Nikita Shevtsov
Co-authored-by: Nasty
---
dedoc/api/api_args.py | 2 ++
dedoc/api/web/index.html | 4 ++++
.../pdf_auto_reader/pdf_auto_reader.py | 1 +
.../pdf_auto_reader/txtlayer_detector.py | 9 +++++++--
docs/source/dedoc_api_usage/api.rst | 9 +++++++++
docs/source/parameters/pdf_handling.rst | 12 +++++++++++-
.../test_api_format_pdf_auto_text_layer.py | 19 +++++++++++++++++++
7 files changed, 53 insertions(+), 3 deletions(-)
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
index c77ec19a..8f3e1415 100644
--- a/dedoc/api/api_args.py
+++ b/dedoc/api/api_args.py
@@ -28,6 +28,8 @@ class QueryParameters:
# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
+ fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
+ description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
index d98a9161..423dbcfe 100644
--- a/dedoc/api/web/index.html
+++ b/dedoc/api/web/index.html
@@ -128,6 +128,10 @@ PDF handling
+
+
+
+