diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index 51bf339a..5ccfe117 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -7,10 +7,11 @@ from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.serializable import Serializable from dedoc.utils.annotation_merger import AnnotationMerger -class LineWithMeta(Sized): +class LineWithMeta(Sized, Serializable): """ Structural unit of document - line (or paragraph) of text and its metadata. One LineWithMeta should not contain text from different logical parts of the document diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py index 4cf80fc7..8167feb5 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py @@ -54,8 +54,8 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"], word.bbox.y_top_left -= chunk_boxes[chunk_index].y_top_left word.bbox.x_top_left -= chunk_boxes[chunk_index].x_top_left # do absolute coordinate on src_image (inside src_image) - word.bbox.y_top_left += nodes_batch[chunk_index].cell_box.y_top_left - word.bbox.x_top_left += nodes_batch[chunk_index].cell_box.x_top_left + word.bbox.y_top_left += nodes_batch[chunk_index].crop_text_box.y_top_left + word.bbox.x_top_left += nodes_batch[chunk_index].crop_text_box.x_top_left originalbox_to_fastocrbox[nodes_batch[chunk_index].cell_box].append(line.words) diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py index 6515e6fe..70c249e7 100644 --- a/dedoc/scripts/test_words_bbox_extraction.py +++ b/dedoc/scripts/test_words_bbox_extraction.py @@ -159,7 +159,7 @@ def test_table_word_extraction(self): os.makedirs(output_path, exist_ok=True) file_names = ["tables/example_with_table5.png", "tables/example_with_table3.png", "tables/example_with_table4.jpg", "tables/example_with_table6.png", "tables/example_with_table_horizontal_union.jpg", - "scanned/orient_1.png"] + "scanned/orient_1.png", "tables/rotated_table.png"] for file_name in file_names: result = self._send_request(file_name, data=dict()) @@ -173,4 +173,3 @@ def test_table_word_extraction(self): image = self.draw_word_annotations(image, word_annotations, angle=table_angle) cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image) - diff --git a/tests/data/tables/rotated_table.png b/tests/data/tables/rotated_table.png new file mode 100644 index 00000000..d0f790f3 Binary files /dev/null and b/tests/data/tables/rotated_table.png differ