TLDR-872 rewrite benchmark correctness #510

Merged · 3 commits · Dec 3, 2024

Changes from all commits
21 changes: 0 additions & 21 deletions resources/benchmarks/benchmarks_tl_correctness.json

This file was deleted.

27 changes: 27 additions & 0 deletions resources/benchmarks/benchmarks_tl_correctness.txt
@@ -0,0 +1,27 @@
Version =

--- Balanced Accuracy --- = 0.843482905982906
--- Accuracy --- = 0.9534883720930233
--- Weighted --- Precision = 0.9519564983695847, Recall=0.9534883720930233, F1=0.9525762106576597
--- Class corrected --- : Precision = 0.9703389830508474, Recall=0.9786324786324786, F1=0.9744680851063829
--- Class incorrected --- : Precision = 0.7727272727272727, Recall=0.7083333333333334, F1=0.7391304347826088
--- AVG Time corrected pdfs --- = 3.2058254999992175
--- AVG Time incorrected pdfs --- = 4.9308231472969055
--- AVG Time all pdfs --- = 3.3662903974222584


--- Failed corrected pdfs --- :
hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf
demystifying-nge-rock-ridge_1643518222_537.pdf
b96a__usmc-combat-camera-directory.pdf
afcea-spy.pdf
access-the-vision-for-2013.pdf

--- Failed incorrected pdfs --- :
Gromov_Dubova_-_Primenenie_metodov_TFKP_k_vychisleniyu_opredelennykh_integralov.pdf
PE157_1616278053_181.pdf
ЧММФ_Абакумов_учебник.pdf
EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf
slides.pdf
PE20_1616439522_1.pdf
Catalog-2020_dealers mail (1).pdf
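
The balanced accuracy reported above is, by definition, the unweighted mean of the two per-class recalls, which the figures in this report confirm. A quick check (not part of the benchmark script):

```python
# Balanced accuracy = mean of per-class recalls (classes "corrected" and "incorrected").
recall_correct = 0.9786324786324786
recall_incorrect = 0.7083333333333334
print((recall_correct + recall_incorrect) / 2)  # 0.843482905982906, matching the report
```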
123 changes: 81 additions & 42 deletions scripts/benchmark_tl_correctness.py
@@ -1,18 +1,19 @@
import json
import os
import zipfile
from collections import OrderedDict, namedtuple
from time import time

import numpy as np
import requests
import wget
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

from dedoc.config import get_config
from dedoc.utils.utils import send_file

path_result = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks"))
os.makedirs(path_result, exist_ok=True)
path_result = os.path.join(path_result, "benchmarks_tl_correctness.json")
path_result = os.path.join(path_result, "benchmarks_tl_correctness.txt")

"""
Experiments are available -> https://github.com/alexander1999-hub/txt_layer_correctness/tree/main :
@@ -24,37 +25,10 @@
"""

host = "http://localhost:1231"
param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))


def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:
    failed = []
    total_incorrect_files = 0
    directory = os.path.join(path_base, tl_path)
    files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")]
    total_file_size = len(files_list)
    print(f"Files: {files_list}\nFiles number: {total_file_size}")
    for file in tqdm(files_list):
        file_path = os.path.join(directory, file)
        r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)

        found = False
        for warning in r["warnings"]:
            if warning.find(tl_type) != -1:
                found = True
                break

        if found:
            total_incorrect_files += 1
            failed.append(file)
    return param_dist_errors(total_file_size, total_incorrect_files, failed)


if __name__ == "__main__":
    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
    os.makedirs(data_dir, exist_ok=True)
def download_dataset(data_dir: str) -> str:
    benchmark_data_dir = os.path.join(data_dir, "data_with_text_layer")

    if not os.path.isdir(benchmark_data_dir):
        path_out = os.path.join(data_dir, "data_with_text_layer.zip")
        wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out)
@@ -67,20 +41,85 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:

    assert os.path.isdir(benchmark_data_dir)

    result = OrderedDict()
    result["version"] = requests.get(f"{host}/version").text
    return benchmark_data_dir


def get_metrics(max_eval_pdf: int = 10000) -> None:
    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
    os.makedirs(data_dir, exist_ok=True)

    data_dir = download_dataset(data_dir)

    folder = os.path.join(data_dir, "data_correct_text_layer")
    correct_files = np.array([os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith(".pdf")])
    folder = os.path.join(data_dir, "data_incorrect_text_layer")
    incorrect_files = np.array([os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith(".pdf")])

    files = np.append(correct_files, incorrect_files)

    labels = np.empty(files.size)
    labels[:correct_files.size] = 0  # "correct"
    labels[correct_files.size:] = 1  # "incorrect"

    failed_corrected_pdfs = []
    failed_incorrected_pdfs = []

    # run pipeline for prediction
    predicts = np.empty(files.size)
    parameters = dict(pdf_with_text_layer="auto", pages="1:1")
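    # pdf_with_text_layer="auto" asks dedoc to decide per file whether the existing text
    # layer is usable; pages="1:1" parses only the first page to keep the benchmark fast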
    result_item = OrderedDict()
    times_correct, times_incorrect = [], []

    count = min(max_eval_pdf, len(files))

    for i, file_path in enumerate(tqdm(files[:count])):
        file_name = file_path.split("/")[-1]

        time_b = time()
        r = send_file(host=host, file_name=file_name, file_path=file_path, parameters=parameters)
        time_eval = time() - time_b

        if labels[i] == 0:
            times_correct.append(time_eval)
        else:
            times_incorrect.append(time_eval)

        predicts[i] = 3  # "failed" not handling
        for warning in r["warnings"]:
            if "has incorrect textual layer" in warning:
                predicts[i] = 1  # "incorrect"
            if "has a correct textual layer" in warning:
                predicts[i] = 0  # "correct"
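        # if neither warning is present, predicts[i] keeps the sentinel value 3, so it
        # differs from both true labels (0 and 1) and the file is counted as failed below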

    incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters)
    result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size
    result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed
        if predicts[i] != labels[i]:
            failed_corrected_pdfs.append(file_name) if labels[i] == 0 else failed_incorrected_pdfs.append(file_name)

    correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters)
    result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size
    result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
    result["guessing_the_correctness_of_the_text"] = result_item
    labels, predicts = labels[:count], predicts[:count]

    b_accuracy = balanced_accuracy_score(labels, predicts)
    accuracy = accuracy_score(labels, predicts)
    w_avg = precision_recall_fscore_support(labels, predicts, average="weighted")
    avg = precision_recall_fscore_support(labels, predicts, average=None, labels=[0, 1])
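    # with average=None, precision_recall_fscore_support returns per-class arrays
    # (precision, recall, F1, support), indexed in the order given by labels=[0, 1]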

output = f"Version = {requests.get(host + '/version').text}\n\n"

output += f"--- Balanced Accuracy --- = {b_accuracy}\n"
output += f"--- Accuracy --- = {accuracy}\n"
output += f"--- Weighted --- Precision = {w_avg[0]}, Recall={w_avg[1]}, F1={w_avg[2]}\n"
output += f"--- Class corrected --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n"
output += f"--- Class incorrected --- : Precision = {avg[0][1]}, Recall={avg[1][1]}, F1={avg[2][1]}\n"

output += f"--- AVG Time corrected pdfs --- = {np.mean(times_correct)}\n"
output += f"--- AVG Time incorrected pdfs --- = {np.mean(times_incorrect)}\n"
output += f"--- AVG Time all pdfs --- = {np.mean(times_correct + times_incorrect)}\n"

output += "\n\n--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs) # noqa
output += "\n\n--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs) # noqa

print(output)
with open(path_result, "w") as file_out:
json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
file_out.write(output)
print(f"Save result in {path_result}")


if __name__ == "__main__":
get_metrics()
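
For reference, a minimal way to reproduce this report, assuming a dedoc server is already listening on http://localhost:1231 (the host hard-coded above); the import path below is an assumption about the checkout layout, not something the PR defines:

```python
# Hypothetical smoke run: evaluate only the first 100 PDFs for a fast, rough estimate.
# Assumes a dedoc API at http://localhost:1231 and "scripts" importable as a package.
from scripts.benchmark_tl_correctness import get_metrics

get_metrics(max_eval_pdf=100)
```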