From 8530d89d58bfe1327a053706dc24ba171c0824f3 Mon Sep 17 00:00:00 2001 From: Nikita Shevtsov Date: Tue, 6 Aug 2024 17:11:35 +0300 Subject: [PATCH 1/9] tldr 749 --- dedoc/api/api_args.py | 2 +- dedoc/api/web/index.html | 3 ++- .../readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py | 2 +- .../pdf_reader/pdf_auto_reader/txtlayer_detector.py | 8 ++++++-- docs/source/parameters/pdf_handling.rst | 2 ++ 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index c77ec19a..67dcd18c 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -26,7 +26,7 @@ class QueryParameters: description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation') # pdf handling - pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], + pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "fast_auto", "auto_tabby", "tabby"], description="Extract text from a text layer of PDF or using OCR methods for image-like documents") language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')") pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right') diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index d98a9161..ac86aa7b 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -122,12 +122,13 @@

PDF handling

+ pdf_with_text_layer

- +

PDF handling

- pdf_with_text_layer

+ +

+ +

PDF handling

- +

diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index 523c96a1..eb7c933d 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -52,6 +52,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. """ + parameters = {} if parameters is None else parameters warnings = [] txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters) diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py index 08f62e69..b312f80a 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py @@ -29,9 +29,9 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters: """ try: lines = self.__get_lines_for_predict(path=path, parameters=parameters) - if parameters["fast_auto"] == "true": - is_correct = any(line._line.strip() for line in lines) - first_page_correct = True + if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true": + is_correct = any(line.line.strip() for line in lines) + first_page_correct = True if len([line for line in lines if line.metadata.page_id == 0]) > 0 else False else: is_correct = self.txtlayer_classifier.predict(lines) first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct) diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index b521ff5c..3d0ed76d 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -215,8 +215,8 @@ Api parameters description - false - Enable fast textual layer detection. Works only when **auto** or **auto_tabby** is selected at **pdf_with_text_layer**. - * **true** -- if any text is detected in a PDF file, Dedoc assumpts that textual layer is detected and it is correct. Much faster but less accurate. - * **false** -- use :class:`dedoc.readers.TxtlayerClassifier` to detect textual layer and prove its correctness. + * **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate. + * **false** -- use the textual layer classifier to detect textual layer and prove its correctness. * - language diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 059db019..7a050d0d 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -51,11 +51,12 @@ PDF and images handling - true, false - false - * :meth:`dedoc.readers.PdfAutoReader.read` - * :meth:`dedoc.readers.PdfAutoReader.can_read` + * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.ReaderComposition.read` - Enable fast textual layer detection. Works only when **auto** or **auto_tabby** is selected at **pdf_with_text_layer**. - * **true** -- if any text is detected in a PDF file, Dedoc assumpts that textual layer is detected and it is correct. Much faster but less accurate. - * **false** -- use :class:`dedoc.readers.TxtlayerClassifier` to detect textual layer and prove its correctness. + * **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate. + * **false** -- use the textual layer classifier to detect textual layer and prove its correctness. * - language - rus, eng, rus+eng, fra, spa From 7e60446575ebac2661f4f468691efd0fb7b0f158 Mon Sep 17 00:00:00 2001 From: Nikita Shevtsov Date: Thu, 8 Aug 2024 14:47:59 +0300 Subject: [PATCH 4/9] add test_fast_textual_layer_detection --- .../test_api_format_pdf_auto_text_layer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py index b232798f..5307f6b1 100644 --- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py +++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py @@ -77,3 +77,17 @@ def test_auto_partially_read(self) -> None: self.assertEqual("4) Список идёт своим чередом\n", list_items[1]["text"]) self.assertEqual("5) заканчиваем список\n", list_items[2]["text"]) self.assertEqual("6) последний элемент списка.\n", list_items[3]["text"]) + + def test_fast_textual_layer_detection(self) -> None: + file_name = "0004057v1.pdf" + parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True) + result = self._send_request(file_name, parameters) + warnings = result["warnings"] + self.assertIn("Assume document has a correct textual layer", warnings) + self.assertEqual(result['content']['structure']['subparagraphs'][5]["text"][:10], "This paper") + + parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True) + result = self._send_request(file_name, parameters) + warnings = result["warnings"] + self.assertIn("Assume document has a correct textual layer", warnings) + self.assertEqual(result['content']['structure']['subparagraphs'][5]["text"][:10], "This paper") \ No newline at end of file From ba3f4857e6e538bf5b8b2bc229a7f2812ee9f55b Mon Sep 17 00:00:00 2001 From: Nikita Shevtsov Date: Thu, 8 Aug 2024 14:53:37 +0300 Subject: [PATCH 5/9] add test_fast_textual_layer_detection --- tests/api_tests/test_api_format_pdf_auto_text_layer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py index 5307f6b1..57c4ea09 100644 --- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py +++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py @@ -57,7 +57,7 @@ def test_auto_document_mixed(self) -> None: for pdf_with_text_layer in "auto", "auto_tabby": result = self._send_request(file_name, dict(pdf_with_text_layer=pdf_with_text_layer)) self.assertIn("Assume document has a correct textual layer", result["warnings"]) - self.assertIn("Assume the first page hasn't a textual layer", result["warnings"]) + self.assertIn("Assume the first page hasn"t a textual layer", result["warnings"]) self._check_english_doc(result) structure = result["content"]["structure"] list_items = structure["subparagraphs"][1]["subparagraphs"] @@ -84,10 +84,10 @@ def test_fast_textual_layer_detection(self) -> None: result = self._send_request(file_name, parameters) warnings = result["warnings"] self.assertIn("Assume document has a correct textual layer", warnings) - self.assertEqual(result['content']['structure']['subparagraphs'][5]["text"][:10], "This paper") + self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper") parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True) result = self._send_request(file_name, parameters) warnings = result["warnings"] self.assertIn("Assume document has a correct textual layer", warnings) - self.assertEqual(result['content']['structure']['subparagraphs'][5]["text"][:10], "This paper") \ No newline at end of file + self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper") From 2bc3c6ff36103b3a4d6dab556c02b87b07b16198 Mon Sep 17 00:00:00 2001 From: Nikita Shevtsov Date: Thu, 8 Aug 2024 15:02:15 +0300 Subject: [PATCH 6/9] add test_fast_textual_layer_detection --- dedoc/api/api_args.py | 4 ++-- tests/api_tests/test_api_format_pdf_auto_text_layer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 599d6f67..8f3e1415 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -28,8 +28,8 @@ class QueryParameters: # pdf handling pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], description="Extract text from a text layer of PDF or using OCR methods for image-like documents") - fast_textual_layer_detection: str = Form("false", enum=["true", "false"], description="Use non-ML solution to detect textual layer if selected auto or" - " auto_tabby in pdf_with_text_layer option. Much faster but less accurate.") + fast_textual_layer_detection: str = Form("false", enum=["true", "false"], + description="Use non-ML solution to detect textual layer. Much faster but less accurate.") language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')") pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right') is_one_column_document: str = Form("auto", enum=["auto", "true", "false"], diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py index 57c4ea09..47904ec2 100644 --- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py +++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py @@ -57,7 +57,7 @@ def test_auto_document_mixed(self) -> None: for pdf_with_text_layer in "auto", "auto_tabby": result = self._send_request(file_name, dict(pdf_with_text_layer=pdf_with_text_layer)) self.assertIn("Assume document has a correct textual layer", result["warnings"]) - self.assertIn("Assume the first page hasn"t a textual layer", result["warnings"]) + self.assertIn("Assume the first page hasn't a textual layer", result["warnings"]) self._check_english_doc(result) structure = result["content"]["structure"] list_items = structure["subparagraphs"][1]["subparagraphs"] From 3334124cd6b7502c56880c8ec2f302d14991af34 Mon Sep 17 00:00:00 2001 From: Nikita Shevtsov Date: Thu, 8 Aug 2024 15:06:00 +0300 Subject: [PATCH 7/9] add test_fast_textual_layer_detection --- tests/api_tests/test_api_format_pdf_auto_text_layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py index 47904ec2..6754fd82 100644 --- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py +++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py @@ -77,7 +77,7 @@ def test_auto_partially_read(self) -> None: self.assertEqual("4) Список идёт своим чередом\n", list_items[1]["text"]) self.assertEqual("5) заканчиваем список\n", list_items[2]["text"]) self.assertEqual("6) последний элемент списка.\n", list_items[3]["text"]) - + def test_fast_textual_layer_detection(self) -> None: file_name = "0004057v1.pdf" parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True) From 70c78fde8eb7da066c9bc492fb5c6c9c0b242034 Mon Sep 17 00:00:00 2001 From: Nikita Shevtsov Date: Thu, 8 Aug 2024 19:03:28 +0300 Subject: [PATCH 8/9] fix MRR --- .../pdf_reader/pdf_auto_reader/txtlayer_detector.py | 3 ++- docs/source/dedoc_api_usage/api.rst | 2 +- docs/source/parameters/pdf_handling.rst | 2 +- tests/api_tests/test_api_format_pdf_auto_text_layer.py | 9 ++++++++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py index b312f80a..c071a893 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py @@ -31,7 +31,8 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters: lines = self.__get_lines_for_predict(path=path, parameters=parameters) if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true": is_correct = any(line.line.strip() for line in lines) - first_page_correct = True if len([line for line in lines if line.metadata.page_id == 0]) > 0 else False + first_page_lines = [line for line in lines if line.metadata.page_id == 0] + first_page_correct = first_page_lines and any(line.line.strip() for line in first_page_lines) else: is_correct = self.txtlayer_classifier.predict(lines) first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct) diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 3d0ed76d..13cd0eaf 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -210,7 +210,7 @@ Api parameters description If the document doesn't have a textual layer (it is an image, scanned document), PDF document parsing works like with ``need_pdf_table_analysis=false``. It is highly recommended to use this option value for any PDF document parsing. - * - fast_auto + * - fast_textual_layer_detection - true, false - false - Enable fast textual layer detection. Works only when **auto** or **auto_tabby** is selected at **pdf_with_text_layer**. diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 7a050d0d..3323c2de 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -47,7 +47,7 @@ PDF and images handling If the document doesn't have a textual layer (it is an image, scanned document), :class:`dedoc.readers.PdfImageReader` will be used. It is highly recommended to use this option value for any PDF document parsing. - * - fast_auto + * - fast_textual_layer_detection - true, false - false - * :meth:`dedoc.readers.PdfAutoReader.read` diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py index 6754fd82..7d4ffd10 100644 --- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py +++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py @@ -86,8 +86,15 @@ def test_fast_textual_layer_detection(self) -> None: self.assertIn("Assume document has a correct textual layer", warnings) self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper") + file_name = "tz_scan_1page.pdf" parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True) result = self._send_request(file_name, parameters) warnings = result["warnings"] + self.assertIn("Assume document has incorrect textual layer", result["warnings"]) + + file_name = "mixed_pdf.pdf" + parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True) + result = self._send_request(file_name, parameters) + warnings = result["warnings"] self.assertIn("Assume document has a correct textual layer", warnings) - self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper") + self.assertIn("Assume the first page hasn't a textual layer", result["warnings"]) From 81d66df8c439c62afccc88fc6bf2252b2f2c7a58 Mon Sep 17 00:00:00 2001 From: Nasty Date: Fri, 9 Aug 2024 11:11:24 +0300 Subject: [PATCH 9/9] small fixes --- .../readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py | 2 +- tests/api_tests/test_api_format_pdf_auto_text_layer.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py index c071a893..1ad11b40 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py @@ -32,7 +32,7 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters: if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true": is_correct = any(line.line.strip() for line in lines) first_page_lines = [line for line in lines if line.metadata.page_id == 0] - first_page_correct = first_page_lines and any(line.line.strip() for line in first_page_lines) + first_page_correct = bool(first_page_lines) and any(line.line.strip() for line in first_page_lines) else: is_correct = self.txtlayer_classifier.predict(lines) first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct) diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py index 7d4ffd10..5b021d0b 100644 --- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py +++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py @@ -82,14 +82,12 @@ def test_fast_textual_layer_detection(self) -> None: file_name = "0004057v1.pdf" parameters = dict(pdf_with_text_layer="auto", fast_textual_layer_detection=True) result = self._send_request(file_name, parameters) - warnings = result["warnings"] - self.assertIn("Assume document has a correct textual layer", warnings) + self.assertIn("Assume document has a correct textual layer", result["warnings"]) self.assertEqual(result["content"]["structure"]["subparagraphs"][5]["text"][:10], "This paper") file_name = "tz_scan_1page.pdf" parameters = dict(pdf_with_text_layer="auto_tabby", fast_textual_layer_detection=True) result = self._send_request(file_name, parameters) - warnings = result["warnings"] self.assertIn("Assume document has incorrect textual layer", result["warnings"]) file_name = "mixed_pdf.pdf" @@ -97,4 +95,4 @@ def test_fast_textual_layer_detection(self) -> None: result = self._send_request(file_name, parameters) warnings = result["warnings"] self.assertIn("Assume document has a correct textual layer", warnings) - self.assertIn("Assume the first page hasn't a textual layer", result["warnings"]) + self.assertIn("Assume the first page hasn't a textual layer", warnings)