Skip to content

Commit

Permalink
ESL-137 fixed unit-tests
Browse files: browse the repository at this point in the history
  • Loading branch information
oksidgy committed Sep 22, 2023
1 parent 459c065 commit fc71c10
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,9 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra
# add confidence value
text_line += OCRCellExtractor.get_line_with_meta(text=word.text, bbox=word.bbox, image=page_image,
confidences=[
ConfidenceAnnotation(start=0, end=len(word.text), value=word.confidence / 100.)])
ConfidenceAnnotation(start=0,
end=len(word.text),
value=0. if word.confidence < 0 else word.confidence / 100.)])
if len(text_line) > 0: # add new line
cell_lines.append(text_line)

Expand Down
124 changes: 62 additions & 62 deletions tests/unit_tests/test_format_docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,76 +147,76 @@ def test_table_parsing_correctness(self) -> None:
path = self._get_path("merged_cells.docx")
result = docx_reader.read(path)

self.assertEqual("Merged sells", result.tables[0].cells[0][0])
self.assertEqual("Merged sells", result.tables[0].cells[0][1])
self.assertEqual("Some text", result.tables[0].cells[0][2])
self.assertEqual("Some text", result.tables[0].cells[0][3])
self.assertEqual("Cell 1", result.tables[0].cells[1][0])
self.assertEqual("Cell 2", result.tables[0].cells[1][1])
self.assertEqual("Vertically split cells 1", result.tables[0].cells[1][2])
self.assertEqual("Vertically split cells 2", result.tables[0].cells[1][3])
self.assertEqual("Cell 3", result.tables[0].cells[2][0])
self.assertEqual("Cell 4", result.tables[0].cells[2][1])
self.assertEqual("Horizontally split cells 1", result.tables[0].cells[2][2])
self.assertEqual("Horizontally split cells 1", result.tables[0].cells[2][3])
self.assertEqual("Cell 3", result.tables[0].cells[3][0])
self.assertEqual("Cell 4", result.tables[0].cells[3][1])
self.assertEqual("Horizontally split cells 2", result.tables[0].cells[3][2])
self.assertEqual("Horizontally split cells 2", result.tables[0].cells[3][3])

self.assertEqual("cell1", result.tables[1].cells[0][0])
self.assertEqual("cell2", result.tables[1].cells[0][1])
self.assertEqual("Horizontally merged", result.tables[1].cells[0][2])
self.assertEqual("Horizontally merged", result.tables[1].cells[0][3])
self.assertEqual("Horizontally merged", result.tables[1].cells[0][4])
self.assertEqual("Horizontally merged", result.tables[1].cells[0][5])
self.assertEqual("Vertically and horizontally merged cells", result.tables[1].cells[1][0])
self.assertEqual("Vertically and horizontally merged cells", result.tables[1].cells[1][1])
self.assertEqual("cell3", result.tables[1].cells[1][2])
self.assertEqual("Vertically merged", result.tables[1].cells[1][3])
self.assertEqual("cell4", result.tables[1].cells[1][4])
self.assertEqual("cell4", result.tables[1].cells[1][5])
self.assertEqual("Vertically and horizontally merged cells", result.tables[1].cells[2][0])
self.assertEqual("Vertically and horizontally merged cells", result.tables[1].cells[2][1])
self.assertEqual("cell5", result.tables[1].cells[2][2])
self.assertEqual("Vertically merged", result.tables[1].cells[2][3])
self.assertEqual("v1", result.tables[1].cells[2][4])
self.assertEqual("v2", result.tables[1].cells[2][5])
self.assertEqual("Merged sells", result.tables[0].cells[0][0].get_text())
self.assertEqual("Merged sells", result.tables[0].cells[0][1].get_text())
self.assertEqual("Some text", result.tables[0].cells[0][2].get_text())
self.assertEqual("Some text", result.tables[0].cells[0][3].get_text())
self.assertEqual("Cell 1", result.tables[0].cells[1][0].get_text())
self.assertEqual("Cell 2", result.tables[0].cells[1][1].get_text())
self.assertEqual("Vertically split cells 1", result.tables[0].cells[1][2].get_text())
self.assertEqual("Vertically split cells 2", result.tables[0].cells[1][3].get_text())
self.assertEqual("Cell 3", result.tables[0].cells[2][0].get_text())
self.assertEqual("Cell 4", result.tables[0].cells[2][1].get_text())
self.assertEqual("Horizontally split cells 1", result.tables[0].cells[2][2].get_text())
self.assertEqual("Horizontally split cells 1", result.tables[0].cells[2][3].get_text())
self.assertEqual("Cell 3", result.tables[0].cells[3][0].get_text())
self.assertEqual("Cell 4", result.tables[0].cells[3][1].get_text())
self.assertEqual("Horizontally split cells 2", result.tables[0].cells[3][2].get_text())
self.assertEqual("Horizontally split cells 2", result.tables[0].cells[3][3].get_text())

self.assertEqual("cell1", result.tables[1].cells[0][0].get_text())
self.assertEqual("cell2", result.tables[1].cells[0][1].get_text())
self.assertEqual("Horizontally merged", result.tables[1].cells[0][2].get_text())
self.assertEqual("Horizontally merged", result.tables[1].cells[0][3].get_text())
self.assertEqual("Horizontally merged", result.tables[1].cells[0][4].get_text())
self.assertEqual("Horizontally merged", result.tables[1].cells[0][5].get_text())
self.assertEqual("Vertically and horizontally merged cells", result.tables[1].cells[1][0].get_text())
self.assertEqual("Vertically and horizontally merged cells", result.tables[1].cells[1][1].get_text())
self.assertEqual("cell3", result.tables[1].cells[1][2].get_text())
self.assertEqual("Vertically merged", result.tables[1].cells[1][3].get_text())
self.assertEqual("cell4", result.tables[1].cells[1][4].get_text())
self.assertEqual("cell4", result.tables[1].cells[1][5].get_text())
self.assertEqual("Vertically and horizontally merged cells", result.tables[1].cells[2][0].get_text())
self.assertEqual("Vertically and horizontally merged cells", result.tables[1].cells[2][1].get_text())
self.assertEqual("cell5", result.tables[1].cells[2][2].get_text())
self.assertEqual("Vertically merged", result.tables[1].cells[2][3].get_text())
self.assertEqual("v1", result.tables[1].cells[2][4].get_text())
self.assertEqual("v2", result.tables[1].cells[2][5].get_text())

hidden_cells_table_1 = [(0, 1), (0, 3), (2, 3), (3, 0), (3, 1), (3, 3)]
for i, j in hidden_cells_table_1:
self.assertTrue(result.tables[0].metadata.cell_properties[i][j].invisible)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].rowspan, 1)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].colspan, 1)
self.assertTrue(result.tables[0].cells[i][j].invisible)
self.assertEqual(result.tables[0].cells[i][j].rowspan, 1)
self.assertEqual(result.tables[0].cells[i][j].colspan, 1)

hidden_cells_table_1_with_colspan = [(0, 0), (0, 2), (2, 2), (3, 2)]
for i, j in hidden_cells_table_1_with_colspan:
self.assertFalse(result.tables[0].metadata.cell_properties[i][j].invisible)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].rowspan, 1)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].colspan, 2)
self.assertFalse(result.tables[0].cells[i][j].invisible)
self.assertEqual(result.tables[0].cells[i][j].rowspan, 1)
self.assertEqual(result.tables[0].cells[i][j].colspan, 2)

hidden_cells_table_1_with_rowspan = [(2, 0), (2, 1)]
for i, j in hidden_cells_table_1_with_rowspan:
self.assertFalse(result.tables[0].metadata.cell_properties[i][j].invisible)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].rowspan, 2)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].colspan, 1)
self.assertFalse(result.tables[0].cells[i][j].invisible)
self.assertEqual(result.tables[0].cells[i][j].rowspan, 2)
self.assertEqual(result.tables[0].cells[i][j].colspan, 1)

hidden_cells_table_2 = [(0, 3), (0, 4), (0, 5), (1, 1), (1, 5), (2, 0), (2, 1), (2, 3)]
for i, j in hidden_cells_table_2:
self.assertTrue(result.tables[1].metadata.cell_properties[i][j].invisible)
self.assertEqual(result.tables[1].metadata.cell_properties[i][j].rowspan, 1)
self.assertEqual(result.tables[1].metadata.cell_properties[i][j].colspan, 1)
self.assertTrue(result.tables[1].cells[i][j].invisible)
self.assertEqual(result.tables[1].cells[i][j].rowspan, 1)
self.assertEqual(result.tables[1].cells[i][j].colspan, 1)

hidden_cells_table_2_with_colspan = [[(0, 2), 4], [(1, 4), 2]]
for (i, j), k in hidden_cells_table_2_with_colspan:
self.assertFalse(result.tables[1].metadata.cell_properties[i][j].invisible)
self.assertEqual(result.tables[1].metadata.cell_properties[i][j].rowspan, 1)
self.assertEqual(result.tables[1].metadata.cell_properties[i][j].colspan, k)
self.assertFalse(result.tables[1].cells[i][j].invisible)
self.assertEqual(result.tables[1].cells[i][j].rowspan, 1)
self.assertEqual(result.tables[1].cells[i][j].colspan, k)

# both colspan and rowspan check
self.assertFalse(result.tables[1].metadata.cell_properties[1][0].invisible)
self.assertEqual(result.tables[1].metadata.cell_properties[1][0].rowspan, 2)
self.assertEqual(result.tables[1].metadata.cell_properties[1][0].colspan, 2)
self.assertFalse(result.tables[1].cells[1][0].invisible)
self.assertEqual(result.tables[1].cells[1][0].rowspan, 2)
self.assertEqual(result.tables[1].cells[1][0].colspan, 2)

def test_tables_with_merged_cells(self) -> None:
docx_reader = DocxReader(config=get_config())
Expand All @@ -225,20 +225,20 @@ def test_tables_with_merged_cells(self) -> None:
hidden_cells_big_table = [(0, 1), (0, 2), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (3, 1), (3, 2), (3, 3),
(4, 0), (4, 1), (4, 2), (4, 3), (5, 0), (5, 1), (5, 2), (5, 3), (5, 6), (5, 7), (5, 8), (5, 9)]
for i, j in hidden_cells_big_table:
self.assertTrue(result.tables[0].metadata.cell_properties[i][j].invisible)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].rowspan, 1)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].colspan, 1)
self.assertTrue(result.tables[0].cells[i][j].invisible)
self.assertEqual(result.tables[0].cells[i][j].rowspan, 1)
self.assertEqual(result.tables[0].cells[i][j].colspan, 1)

hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 5), 5]]
for (i, j), k in hidden_cells_big_table_with_colspan:
self.assertFalse(result.tables[0].metadata.cell_properties[i][j].invisible)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].rowspan, 1)
self.assertEqual(result.tables[0].metadata.cell_properties[i][j].colspan, k)
self.assertFalse(result.tables[0].cells[i][j].invisible)
self.assertEqual(result.tables[0].cells[i][j].rowspan, 1)
self.assertEqual(result.tables[0].cells[i][j].colspan, k)

# both colspan and rowspan check
self.assertFalse(result.tables[0].metadata.cell_properties[3][0].invisible)
self.assertEqual(result.tables[0].metadata.cell_properties[3][0].rowspan, 3)
self.assertEqual(result.tables[0].metadata.cell_properties[3][0].colspan, 4)
self.assertFalse(result.tables[0].cells[3][0].invisible)
self.assertEqual(result.tables[0].cells[3][0].rowspan, 3)
self.assertEqual(result.tables[0].cells[3][0].colspan, 4)

def test_diagram_annotation(self) -> None:
docx_reader = DocxReader(config=get_config())
Expand Down
7 changes: 4 additions & 3 deletions tests/unit_tests/test_misc_dedoc_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@ class TestDedocManager(TestCase):
def test_parse_file(self) -> None:
filename = "csv_tab.tsv"
result = self.dedoc_manager.parse(os.path.join(self.path, "csv_tab.tsv"))
cells = result.content.tables[0].cells
self.assertEqual(filename, result.metadata.file_name)
self.assertEqual(filename, result.metadata.file_name)
self.assertLessEqual(["1", "2", "3"], result.content.tables[0].cells[0])
self.assertLessEqual(["2", "1", "5"], result.content.tables[0].cells[1])
self.assertLessEqual(["5", "3", "1"], result.content.tables[0].cells[2])
self.assertLessEqual(["1", "2", "3"], [cell.get_text() for cell in cells[0]])
self.assertLessEqual(["2", "1", "5"], [cell.get_text() for cell in cells[1]])
self.assertLessEqual(["5", "3", "1"], [cell.get_text() for cell in cells[2]])

def test_file_not_exists(self) -> None:
with self.assertRaises(FileNotFoundError):
Expand Down
14 changes: 7 additions & 7 deletions tests/unit_tests/test_module_table_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ def test_table_split_right_column(self) -> None:
image = cv2.imread(path_image, 0)

tables = self.get_table(image, "rus+eng", table_type="split_last_column+wo_external_bounds")
self.assertTrue(tables[0].matrix_cells[4][-1], "40703978900000345077")
self.assertTrue(tables[0].matrix_cells[5][-1], "049401814")
self.assertTrue(tables[0].matrix_cells[6][-1], "30101810200000000814")
self.assertTrue(tables[0].matrix_cells[7][-1], "049401814")
self.assertTrue(tables[0].matrix_cells[8][-1], "30101810200000000814")
self.assertTrue(tables[0].matrix_cells[9][-1], "30110978700000070815")
self.assertTrue(tables[0].matrix_cells[10][-1], "30110978700000070815")
self.assertTrue(tables[0].matrix_cells[4][-1].get_text(), "40703978900000345077")
self.assertTrue(tables[0].matrix_cells[5][-1].get_text(), "049401814")
self.assertTrue(tables[0].matrix_cells[6][-1].get_text(), "30101810200000000814")
self.assertTrue(tables[0].matrix_cells[7][-1].get_text(), "049401814")
self.assertTrue(tables[0].matrix_cells[8][-1].get_text(), "30101810200000000814")
self.assertTrue(tables[0].matrix_cells[9][-1].get_text(), "30110978700000070815")
self.assertTrue(tables[0].matrix_cells[10][-1].get_text(), "30110978700000070815")

def test_table_extract_one_cell_and_one_cell_tables(self) -> None:
path_image = get_full_path("data/lising/platezhka.jpg")
Expand Down

0 comments on commit fc71c10

Please sign in to comment.