From 7dd3e6e11f557078c658c03f5ae22504ed63b7cd Mon Sep 17 00:00:00 2001 From: Alexander Golodkov Date: Wed, 11 Sep 2024 14:17:01 +0300 Subject: [PATCH] test fix and pdfbasereader fix --- dedoc/readers/pdf_reader/pdf_base_reader.py | 5 +++-- tests/unit_tests/test_module_gost_frame_recognizer.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index c9e85d65..ed50dc6f 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -107,13 +107,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis from dedoc.utils.pdf_utils import get_pdf_page_count + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader from dedoc.utils.utils import flatten first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page last_page = math.inf if parameters.last_page is None else parameters.last_page images = self._get_images(path, first_page, last_page) - if parameters.need_gost_frame_analysis and type(self).__name__ == "PdfImageReader": + if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader): gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images) result = Parallel(n_jobs=self.config["n_jobs"])( delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in @@ -152,7 +153,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links) if page_angles: metadata["rotated_page_angles"] = page_angles - if parameters.need_gost_frame_analysis and type(self).__name__ == "PdfImageReader": + if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader): self._shift_all_contents(lines=all_lines_with_paragraphs, mp_tables=mp_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images) return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata diff --git a/tests/unit_tests/test_module_gost_frame_recognizer.py b/tests/unit_tests/test_module_gost_frame_recognizer.py index c5aa26b4..a1e65222 100644 --- a/tests/unit_tests/test_module_gost_frame_recognizer.py +++ b/tests/unit_tests/test_module_gost_frame_recognizer.py @@ -85,4 +85,4 @@ def test_pdf_auto_reader(self) -> None: self.assertEqual(result.tables[0].cells[0][1].get_text(), "Колонка 2") self.assertEqual(result.tables[0].cells[0][2].get_text(), "Колонка 3") self.assertEqual(len(result.tables[0].cells), 22) - self.assertTrue("Названне таблицы (продолженне)" in result.lines[0].line) + self.assertTrue("Название таблицы (продолжение)" in result.lines[0].line)