Skip to content

Commit

Permalink
TLDR-749 fast auto textual layer detection (#481)
Browse files Browse the repository at this point in the history
Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru>
Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
  • Loading branch information
3 people authored Aug 9, 2024
1 parent ba9b3b4 commit 4921d67
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 3 deletions.
2 changes: 2 additions & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class QueryParameters:
# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
Expand Down
4 changes: 4 additions & 0 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ <h4>PDF handling</h4>
</label>
</p>

<p>
<label><input name="fast_textual_layer_detection" type="checkbox" value="true"> fast_textual_layer_detection</label>
</p>

<p>
<label> language
<input name="language" list="language" size="8" placeholder="rus+eng">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
"""
parameters = {} if parameters is None else parameters
warnings = []
txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters)

Expand Down
9 changes: 7 additions & 2 deletions dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,13 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
"""
try:
lines = self.__get_lines_for_predict(path=path, parameters=parameters)
is_correct = self.txtlayer_classifier.predict(lines)
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true":
is_correct = any(line.line.strip() for line in lines)
first_page_lines = [line for line in lines if line.metadata.page_id == 0]
first_page_correct = bool(first_page_lines) and any(line.line.strip() for line in first_page_lines)
else:
is_correct = self.txtlayer_classifier.predict(lines)
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
return PdfTxtlayerParameters(is_correct_text_layer=is_correct, is_first_page_correct=first_page_correct)

except Exception as e:
Expand Down
9 changes: 9 additions & 0 deletions docs/source/dedoc_api_usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,15 @@ Api parameters description
If the document doesn't have a textual layer (it is an image, scanned document), PDF document parsing works like with ``need_pdf_table_analysis=false``.
It is highly recommended to use this option value for any PDF document parsing.

* - fast_textual_layer_detection
- true, false
- false
- Enable fast textual layer detection. Works only when **pdf_with_text_layer** is set to **auto** or **auto_tabby**.

* **true** -- if any text is detected in the PDF file, Dedoc assumes that a textual layer is present and correct. This is much faster but less accurate.
* **false** -- use the textual layer classifier to detect the textual layer and verify its correctness.


* - language
- rus, eng, rus+eng, fra, spa
- rus+eng
Expand Down
12 changes: 11 additions & 1 deletion docs/source/parameters/pdf_handling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,23 @@ PDF and images handling
If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTxtlayerReader` will be used for parsing.
If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used.


* **auto_tabby** -- automatic detection of textual layer presence in the PDF document.
This option is used to choose :class:`dedoc.readers.PdfAutoReader` for parsing.
If the document has a textual layer (is copyable), :class:`dedoc.readers.PdfTabbyReader` will be used for parsing.
If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used.
It is highly recommended to use this option value for any PDF document parsing.

* - fast_textual_layer_detection
- true, false
- false
- * :meth:`dedoc.readers.PdfAutoReader.read`
* :meth:`dedoc.DedocManager.parse`
* :meth:`dedoc.readers.ReaderComposition.read`
- Enable fast textual layer detection. Works only when **pdf_with_text_layer** is set to **auto** or **auto_tabby**.

* **true** -- if any text is detected in the PDF file, Dedoc assumes that a textual layer is present and correct. This is much faster but less accurate.
* **false** -- use the textual layer classifier to detect the textual layer and verify its correctness.

* - language
- rus, eng, rus+eng, fra, spa
- rus+eng
Expand Down
19 changes: 19 additions & 0 deletions tests/api_tests/test_api_format_pdf_auto_text_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,22 @@ def test_auto_partially_read(self) -> None:
self.assertEqual("4) Список идёт своим чередом\n", list_items[1]["text"])
self.assertEqual("5) заканчиваем список\n", list_items[2]["text"])
self.assertEqual("6) последний элемент списка.\n", list_items[3]["text"])

def test_fast_textual_layer_detection(self) -> None:
    """
    Send three PDF files to the API with ``fast_textual_layer_detection`` enabled
    and check the textual layer warnings returned for each of them.
    """
    # First document: the "correct textual layer" warning is expected,
    # and the sixth subparagraph must start with "This paper".
    response = self._send_request("0004057v1.pdf", {"pdf_with_text_layer": "auto", "fast_textual_layer_detection": True})
    self.assertIn("Assume document has a correct textual layer", response["warnings"])
    sixth_paragraph = response["content"]["structure"]["subparagraphs"][5]
    self.assertEqual(sixth_paragraph["text"][:10], "This paper")

    # Second document (requested with auto_tabby): the "incorrect textual layer" warning is expected.
    response = self._send_request("tz_scan_1page.pdf", {"pdf_with_text_layer": "auto_tabby", "fast_textual_layer_detection": True})
    self.assertIn("Assume document has incorrect textual layer", response["warnings"])

    # Third document: both the "correct textual layer" warning and
    # the warning about the first page are expected.
    response = self._send_request("mixed_pdf.pdf", {"pdf_with_text_layer": "auto", "fast_textual_layer_detection": True})
    reported_warnings = response["warnings"]
    for expected_warning in ("Assume document has a correct textual layer", "Assume the first page hasn't a textual layer"):
        self.assertIn(expected_warning, reported_warnings)

0 comments on commit 4921d67

Please sign in to comment.