From 8530d89d58bfe1327a053706dc24ba171c0824f3 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov
Date: Tue, 6 Aug 2024 17:11:35 +0300
Subject: [PATCH 1/9] tldr 749
---
dedoc/api/api_args.py | 2 +-
dedoc/api/web/index.html | 3 ++-
.../readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py | 2 +-
.../pdf_reader/pdf_auto_reader/txtlayer_detector.py | 8 ++++++--
docs/source/parameters/pdf_handling.rst | 2 ++
5 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
index c77ec19a..67dcd18c 100644
--- a/dedoc/api/api_args.py
+++ b/dedoc/api/api_args.py
@@ -26,7 +26,7 @@ class QueryParameters:
description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
# pdf handling
- pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
+ pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "fast_auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
index d98a9161..ac86aa7b 100644
--- a/dedoc/api/web/index.html
+++ b/dedoc/api/web/index.html
@@ -122,12 +122,13 @@
PDF handling
+
pdf_with_text_layer
-
+
+
+
+
+
-
+
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
index 523c96a1..eb7c933d 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -52,6 +52,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
"""
+ parameters = {} if parameters is None else parameters
warnings = []
txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters)
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
index 08f62e69..b312f80a 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
@@ -29,9 +29,9 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
"""
try:
lines = self.__get_lines_for_predict(path=path, parameters=parameters)
- if parameters["fast_auto"] == "true":
- is_correct = any(line._line.strip() for line in lines)
- first_page_correct = True
+ if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true":
+ is_correct = any(line.line.strip() for line in lines)
+ first_page_correct = True if len([line for line in lines if line.metadata.page_id == 0]) > 0 else False
else:
is_correct = self.txtlayer_classifier.predict(lines)
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
index b521ff5c..3d0ed76d 100644
--- a/docs/source/dedoc_api_usage/api.rst
+++ b/docs/source/dedoc_api_usage/api.rst
@@ -215,8 +215,8 @@ Api parameters description
- false
- Enable fast textual layer detection. Works only when **auto** or **auto_tabby** is selected at **pdf_with_text_layer**.
- * **true** -- if any text is detected in a PDF file, Dedoc assumpts that textual layer is detected and it is correct. Much faster but less accurate.
- * **false** -- use :class:`dedoc.readers.TxtlayerClassifier` to detect textual layer and prove its correctness.
+ * **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate.
+ * **false** -- use the textual layer classifier to detect textual layer and prove its correctness.
* - language
diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
index 059db019..7a050d0d 100644
--- a/docs/source/parameters/pdf_handling.rst
+++ b/docs/source/parameters/pdf_handling.rst
@@ -51,11 +51,12 @@ PDF and images handling
- true, false
- false
- * :meth:`dedoc.readers.PdfAutoReader.read`
- * :meth:`dedoc.readers.PdfAutoReader.can_read`
+ * :meth:`dedoc.DedocManager.parse`
+ * :meth:`dedoc.readers.ReaderComposition.read`
- Enable fast textual layer detection. Works only when **auto** or **auto_tabby** is selected at **pdf_with_text_layer**.
- * **true** -- if any text is detected in a PDF file, Dedoc assumpts that textual layer is detected and it is correct. Much faster but less accurate.
- * **false** -- use :class:`dedoc.readers.TxtlayerClassifier` to detect textual layer and prove its correctness.
+ * **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate.
+ * **false** -- use the textual layer classifier to detect textual layer and prove its correctness.
* - language
- rus, eng, rus+eng, fra, spa
From 7e60446575ebac2661f4f468691efd0fb7b0f158 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov
Date: Thu, 8 Aug 2024 14:47:59 +0300
Subject: [PATCH 4/9] add test_fast_textual_layer_detection
---
.../test_api_format_pdf_auto_text_layer.py | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
index b232798f..5307f6b1 100644
--- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py
+++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
@@ -77,3 +77,17 @@ def test_auto_partially_read(self) -> None:
self.assertEqual("4) Список идёт своим чередом\n", list_items[1]["text"])
self.assertEqual("5) заканчиваем список\n", list_items[2]["text"])
self.assertEqual("6) последний элемент списка.\n", list_items[3]["text"])
+
+ def test_fast_textual_layer_detection(self) -> None:
+ file_name = "0004057v1.pdf"
+ parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True)
+ result = self._send_request(file_name, parameters)
+ warnings = result["warnings"]
+ self.assertIn("Assume document has a correct textual layer", warnings)
+ self.assertEqual(result['content']['structure']['subparagraphs'][5]["text"][:10], "This paper")
+
+ parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True)
+ result = self._send_request(file_name, parameters)
+ warnings = result["warnings"]
+ self.assertIn("Assume document has a correct textual layer", warnings)
+ self.assertEqual(result['content']['structure']['subparagraphs'][5]["text"][:10], "This paper")
\ No newline at end of file
From ba3f4857e6e538bf5b8b2bc229a7f2812ee9f55b Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov
Date: Thu, 8 Aug 2024 14:53:37 +0300
Subject: [PATCH 5/9] add test_fast_textual_layer_detection
---
tests/api_tests/test_api_format_pdf_auto_text_layer.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
index 5307f6b1..57c4ea09 100644
--- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py
+++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
@@ -57,7 +57,7 @@ def test_auto_document_mixed(self) -> None:
for pdf_with_text_layer in "auto", "auto_tabby":
result = self._send_request(file_name, dict(pdf_with_text_layer=pdf_with_text_layer))
self.assertIn("Assume document has a correct textual layer", result["warnings"])
- self.assertIn("Assume the first page hasn't a textual layer", result["warnings"])
+ self.assertIn("Assume the first page hasn"t a textual layer", result["warnings"])
self._check_english_doc(result)
structure = result["content"]["structure"]
list_items = structure["subparagraphs"][1]["subparagraphs"]
@@ -84,10 +84,10 @@ def test_fast_textual_layer_detection(self) -> None:
result = self._send_request(file_name, parameters)
warnings = result["warnings"]
self.assertIn("Assume document has a correct textual layer", warnings)
- self.assertEqual(result['content']['structure']['subparagraphs'][5]["text"][:10], "This paper")
+ self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper")
parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True)
result = self._send_request(file_name, parameters)
warnings = result["warnings"]
self.assertIn("Assume document has a correct textual layer", warnings)
- self.assertEqual(result['content']['structure']['subparagraphs'][5]["text"][:10], "This paper")
\ No newline at end of file
+ self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper")
From 2bc3c6ff36103b3a4d6dab556c02b87b07b16198 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov
Date: Thu, 8 Aug 2024 15:02:15 +0300
Subject: [PATCH 6/9] add test_fast_textual_layer_detection
---
dedoc/api/api_args.py | 4 ++--
tests/api_tests/test_api_format_pdf_auto_text_layer.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
index 599d6f67..8f3e1415 100644
--- a/dedoc/api/api_args.py
+++ b/dedoc/api/api_args.py
@@ -28,8 +28,8 @@ class QueryParameters:
# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
- fast_textual_layer_detection: str = Form("false", enum=["true", "false"], description="Use non-ML solution to detect textual layer if selected auto or"
- " auto_tabby in pdf_with_text_layer option. Much faster but less accurate.")
+ fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
+ description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
index 57c4ea09..47904ec2 100644
--- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py
+++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
@@ -57,7 +57,7 @@ def test_auto_document_mixed(self) -> None:
for pdf_with_text_layer in "auto", "auto_tabby":
result = self._send_request(file_name, dict(pdf_with_text_layer=pdf_with_text_layer))
self.assertIn("Assume document has a correct textual layer", result["warnings"])
- self.assertIn("Assume the first page hasn"t a textual layer", result["warnings"])
+ self.assertIn("Assume the first page hasn't a textual layer", result["warnings"])
self._check_english_doc(result)
structure = result["content"]["structure"]
list_items = structure["subparagraphs"][1]["subparagraphs"]
From 3334124cd6b7502c56880c8ec2f302d14991af34 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov
Date: Thu, 8 Aug 2024 15:06:00 +0300
Subject: [PATCH 7/9] add test_fast_textual_layer_detection
---
tests/api_tests/test_api_format_pdf_auto_text_layer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
index 47904ec2..6754fd82 100644
--- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py
+++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
@@ -77,7 +77,7 @@ def test_auto_partially_read(self) -> None:
self.assertEqual("4) Список идёт своим чередом\n", list_items[1]["text"])
self.assertEqual("5) заканчиваем список\n", list_items[2]["text"])
self.assertEqual("6) последний элемент списка.\n", list_items[3]["text"])
-
+
def test_fast_textual_layer_detection(self) -> None:
file_name = "0004057v1.pdf"
parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True)
From 70c78fde8eb7da066c9bc492fb5c6c9c0b242034 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov
Date: Thu, 8 Aug 2024 19:03:28 +0300
Subject: [PATCH 8/9] fix MRR
---
.../pdf_reader/pdf_auto_reader/txtlayer_detector.py | 3 ++-
docs/source/dedoc_api_usage/api.rst | 2 +-
docs/source/parameters/pdf_handling.rst | 2 +-
tests/api_tests/test_api_format_pdf_auto_text_layer.py | 9 ++++++++-
4 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
index b312f80a..c071a893 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
@@ -31,7 +31,8 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
lines = self.__get_lines_for_predict(path=path, parameters=parameters)
if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true":
is_correct = any(line.line.strip() for line in lines)
- first_page_correct = True if len([line for line in lines if line.metadata.page_id == 0]) > 0 else False
+ first_page_lines = [line for line in lines if line.metadata.page_id == 0]
+ first_page_correct = first_page_lines and any(line.line.strip() for line in first_page_lines)
else:
is_correct = self.txtlayer_classifier.predict(lines)
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
index 3d0ed76d..13cd0eaf 100644
--- a/docs/source/dedoc_api_usage/api.rst
+++ b/docs/source/dedoc_api_usage/api.rst
@@ -210,7 +210,7 @@ Api parameters description
If the document doesn't have a textual layer (it is an image, scanned document), PDF document parsing works like with ``need_pdf_table_analysis=false``.
It is highly recommended to use this option value for any PDF document parsing.
- * - fast_auto
+ * - fast_textual_layer_detection
- true, false
- false
- Enable fast textual layer detection. Works only when **auto** or **auto_tabby** is selected at **pdf_with_text_layer**.
diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
index 7a050d0d..3323c2de 100644
--- a/docs/source/parameters/pdf_handling.rst
+++ b/docs/source/parameters/pdf_handling.rst
@@ -47,7 +47,7 @@ PDF and images handling
If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used.
It is highly recommended to use this option value for any PDF document parsing.
- * - fast_auto
+ * - fast_textual_layer_detection
- true, false
- false
- * :meth:`dedoc.readers.PdfAutoReader.read`
diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
index 6754fd82..7d4ffd10 100644
--- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py
+++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
@@ -86,8 +86,15 @@ def test_fast_textual_layer_detection(self) -> None:
self.assertIn("Assume document has a correct textual layer", warnings)
self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper")
+ file_name = "tz_scan_1page.pdf"
parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True)
result = self._send_request(file_name, parameters)
warnings = result["warnings"]
+ self.assertIn("Assume document has incorrect textual layer", result["warnings"])
+
+ file_name = "mixed_pdf.pdf"
+ parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True)
+ result = self._send_request(file_name, parameters)
+ warnings = result["warnings"]
self.assertIn("Assume document has a correct textual layer", warnings)
- self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper")
+ self.assertIn("Assume the first page hasn't a textual layer", result["warnings"])
From 81d66df8c439c62afccc88fc6bf2252b2f2c7a58 Mon Sep 17 00:00:00 2001
From: Nasty
Date: Fri, 9 Aug 2024 11:11:24 +0300
Subject: [PATCH 9/9] small fixes
---
.../readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py | 2 +-
tests/api_tests/test_api_format_pdf_auto_text_layer.py | 6 ++----
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
index c071a893..1ad11b40 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
@@ -32,7 +32,7 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true":
is_correct = any(line.line.strip() for line in lines)
first_page_lines = [line for line in lines if line.metadata.page_id == 0]
- first_page_correct = first_page_lines and any(line.line.strip() for line in first_page_lines)
+ first_page_correct = bool(first_page_lines) and any(line.line.strip() for line in first_page_lines)
else:
is_correct = self.txtlayer_classifier.predict(lines)
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
index 7d4ffd10..5b021d0b 100644
--- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py
+++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py
@@ -82,14 +82,12 @@ def test_fast_textual_layer_detection(self) -> None:
file_name = "0004057v1.pdf"
parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True)
result = self._send_request(file_name, parameters)
- warnings = result["warnings"]
- self.assertIn("Assume document has a correct textual layer", warnings)
+ self.assertIn("Assume document has a correct textual layer", result["warnings"])
self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper")
file_name = "tz_scan_1page.pdf"
parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True)
result = self._send_request(file_name, parameters)
- warnings = result["warnings"]
self.assertIn("Assume document has incorrect textual layer", result["warnings"])
file_name = "mixed_pdf.pdf"
@@ -97,4 +95,4 @@ def test_fast_textual_layer_detection(self) -> None:
result = self._send_request(file_name, parameters)
warnings = result["warnings"]
self.assertIn("Assume document has a correct textual layer", warnings)
- self.assertIn("Assume the first page hasn't a textual layer", result["warnings"])
+ self.assertIn("Assume the first page hasn't a textual layer", warnings)