ispras · NastyBoget · Aug 9, 2024 · Aug 6, 2024 · Aug 7, 2024 · Aug 8, 2024
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -26,7 +26,7 @@ class QueryParameters:
                                   description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
 
     # pdf handling
-    pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
+    pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "fast_auto", "auto_tabby", "tabby"],
                                     description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
     language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
     pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')

diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
@@ -122,12 +122,13 @@ <h4>PDF handling</h4>
                             <option value="true">true</option>
                             <option value="false">false</option>
                             <option value="auto">auto</option>
+                            <option value="fast_auto">fast_auto</option>
                             <option value="auto_tabby" selected>auto_tabby</option>
                             <option value="tabby">tabby</option>
                         </select> pdf_with_text_layer
                     </label>
                 </p>
-
+                
                 <p>
                     <label> language
                         <input name="language" list="language" size="8" placeholder="rus+eng">

diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -43,7 +43,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
         You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
         """
         from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
-        return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) in ("auto", "auto_tabby")
+        return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) in ("auto", "fast_auto", "auto_tabby")
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """

diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
@@ -29,8 +29,12 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
         """
         try:
             lines = self.__get_lines_for_predict(path=path, parameters=parameters)
-            is_correct = self.txtlayer_classifier.predict(lines)
-            first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
+            if parameters["pdf_with_text_layer"] == "fast_auto":
+                is_correct = any(line._line.strip() for line in lines)
+                first_page_correct = True
+            else:
+                is_correct = self.txtlayer_classifier.predict(lines)
+                first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
             return PdfTxtlayerParameters(is_correct_text_layer=is_correct, is_first_page_correct=first_page_correct)
 
         except Exception as e:

diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
@@ -41,6 +41,8 @@ PDF and images handling
               If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTxtlayerReader` will be used for parsing.
               If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used.
 
+            * **fast_auto** -- the pipeline is the same as **auto** except for the detection of the textual layer. It is much faster but less accurate
+              because it uses a non-ML solution.
 
             * **auto_tabby** -- automatic detection of textual layer presence in the PDF document.
               This option is used to choose :class:`dedoc.readers.PdfAutoReader` for parsing.