diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
index c77ec19a..8f3e1415 100644
--- a/dedoc/api/api_args.py
+++ b/dedoc/api/api_args.py
@@ -28,6 +28,8 @@ class QueryParameters:
# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
+ fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
+ description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
index d98a9161..423dbcfe 100644
--- a/dedoc/api/web/index.html
+++ b/dedoc/api/web/index.html
@@ -128,6 +128,10 @@
PDF handling
+
+ fast_textual_layer_detection
+
+
language
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
index 523c96a1..eb7c933d 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -52,6 +52,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
"""
+ parameters = {} if parameters is None else parameters
warnings = []
txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters)
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
index 0500698f..1ad11b40 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
@@ -29,8 +29,13 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
"""
try:
lines = self.__get_lines_for_predict(path=path, parameters=parameters)
- is_correct = self.txtlayer_classifier.predict(lines)
- first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
+ if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true":
+ is_correct = any(line.line.strip() for line in lines)
+ first_page_lines = [line for line in lines if line.metadata.page_id == 0]
+ first_page_correct = bool(first_page_lines) and any(line.line.strip() for line in first_page_lines)
+ else:
+ is_correct = self.txtlayer_classifier.predict(lines)
+ first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
return PdfTxtlayerParameters(is_correct_text_layer=is_correct, is_first_page_correct=first_page_correct)
except Exception as e:
diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
index df6b4963..13cd0eaf 100644
--- a/docs/source/dedoc_api_usage/api.rst
+++ b/docs/source/dedoc_api_usage/api.rst
@@ -210,6 +210,15 @@ Api parameters description
If the document doesn't have a textual layer (it is an image, scanned document), PDF document parsing works like with ``need_pdf_table_analysis=false``.
It is highly recommended to use this option value for any PDF document parsing.
+ * - fast_textual_layer_detection
+ - true, false
+ - false
+ - Enable fast textual layer detection. Takes effect only when **pdf_with_text_layer** is set to **auto** or **auto_tabby**.
+
+ * **true** -- if any text is found in the PDF file, Dedoc assumes that the document has a textual layer and that this layer is correct. Much faster but less accurate.
+ * **false** -- use the textual layer classifier to detect the textual layer and verify its correctness.
+
+
* - language
- rus, eng, rus+eng, fra, spa
- rus+eng
diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
index 60ad0096..3323c2de 100644
--- a/docs/source/parameters/pdf_handling.rst
+++ b/docs/source/parameters/pdf_handling.rst
@@ -41,13 +41,23 @@ PDF and images handling
If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTxtlayerReader` will be used for parsing.
If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used.
-
* **auto_tabby** -- automatic detection of textual layer presence in the PDF document.
This option is used to choose :class:`dedoc.readers.PdfAutoReader` for parsing.
If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTabbyReader` will be used for parsing.
If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used.
It is highly recommended to use this option value for any PDF document parsing.
+ * - fast_textual_layer_detection
+ - true, false
+ - false
+ - * :meth:`dedoc.readers.PdfAutoReader.read`
+ * :meth:`dedoc.DedocManager.parse`
+ * :meth:`dedoc.readers.ReaderComposition.read`
+ - Enable fast textual layer detection. Takes effect only when **pdf_with_text_layer** is set to **auto** or **auto_tabby**.
+
+ * **true** -- if any text is found in the PDF file, Dedoc assumes that the document has a textual layer and that this layer is correct. Much faster but less accurate.
+ * **false** -- use the textual layer classifier to detect the textual layer and verify its correctness.
+
* - language
- rus, eng, rus+eng, fra, spa
- rus+eng
diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
index b232798f..5b021d0b 100644
--- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py
+++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
@@ -77,3 +77,22 @@ def test_auto_partially_read(self) -> None:
self.assertEqual("4) Список идёт своим чередом\n", list_items[1]["text"])
self.assertEqual("5) заканчиваем список\n", list_items[2]["text"])
self.assertEqual("6) последний элемент списка.\n", list_items[3]["text"])
+
+ def test_fast_textual_layer_detection(self) -> None:
+ file_name = "0004057v1.pdf"
+ parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True)
+ result = self._send_request(file_name, parameters)
+ self.assertIn("Assume document has a correct textual layer", result["warnings"])
+ self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper")
+
+ file_name = "tz_scan_1page.pdf"
+ parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True)
+ result = self._send_request(file_name, parameters)
+ self.assertIn("Assume document has incorrect textual layer", result["warnings"])
+
+ file_name = "mixed_pdf.pdf"
+ parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True)
+ result = self._send_request(file_name, parameters)
+ warnings = result["warnings"]
+ self.assertIn("Assume document has a correct textual layer", warnings)
+ self.assertIn("Assume the first page hasn't a textual layer", warnings)