Skip to content

Commit

Permalink
test fix and pdfbasereader fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
Alexander Golodkov committed Sep 11, 2024
1 parent 03dbf9e commit 7dd3e6e
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 3 deletions.
5 changes: 3 additions & 2 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
from dedoc.utils.pdf_utils import get_pdf_page_count
+from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
from dedoc.utils.utils import flatten

first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page
last_page = math.inf if parameters.last_page is None else parameters.last_page
images = self._get_images(path, first_page, last_page)

-if parameters.need_gost_frame_analysis and type(self).__name__ == "PdfImageReader":
+if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
Expand Down Expand Up @@ -152,7 +153,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
metadata["rotated_page_angles"] = page_angles
-if parameters.need_gost_frame_analysis and type(self).__name__ == "PdfImageReader":
+if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
self._shift_all_contents(lines=all_lines_with_paragraphs, mp_tables=mp_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_module_gost_frame_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,4 @@ def test_pdf_auto_reader(self) -> None:
self.assertEqual(result.tables[0].cells[0][1].get_text(), "Колонка 2")
self.assertEqual(result.tables[0].cells[0][2].get_text(), "Колонка 3")
self.assertEqual(len(result.tables[0].cells), 22)
-self.assertTrue("Названне таблицы (продолженне)" in result.lines[0].line)
+self.assertTrue("Название таблицы (продолжение)" in result.lines[0].line)

0 comments on commit 7dd3e6e

Please sign in to comment.