From 338508ee59f914682e910dce7bb564906bce12fb Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Thu, 28 Nov 2024 13:20:43 +0300 Subject: [PATCH 1/3] TLDR-872 rewrite correctness script --- scripts/benchmark_tl_correctness.py | 131 ++++++++++++++++++++++++++-- 1 file changed, 123 insertions(+), 8 deletions(-) diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index 2538cdef..0e7080c2 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -2,9 +2,13 @@ import os import zipfile from collections import OrderedDict, namedtuple +from time import time +import numpy as np import requests import wget +from Cryptodome.Random.random import shuffle +from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support from tqdm import tqdm from dedoc.config import get_config @@ -12,7 +16,7 @@ path_result = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks")) os.makedirs(path_result, exist_ok=True) -path_result = os.path.join(path_result, "benchmarks_tl_correctness.json") +path_result = os.path.join(path_result, "benchmarks_tl_correctness.txt") """ Experiments are available -> https://github.com/alexander1999-hub/txt_layer_correctness/tree/main : @@ -27,6 +31,29 @@ param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed")) +def send_request_mineru(file_path: str, url: str) -> dict: + """ + send file `file_name` in post request with `data` as parameters. Expects that response return code + `expected_code` + + :param file_name: name of file (should lie src/tests/data folder + :param data: parameter dictionary (here you can put language for example) + :param expected_code: expected http response code. 200 for normal request + :return: result from json + """ + data = {"parse_method": "auto", "is_json_md_dump": True} + file_name = file_path.split("/")[-1] + + with open(file_path, "rb") as file: + files = {"pdf_file": (file_name, file)} + r = requests.post(url, files=files, data=data) + + if r.status_code != 200: + return r.content.decode() + else: + return json.loads(r.content.decode()) + + def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple: failed = [] total_incorrect_files = 0 @@ -38,23 +65,20 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para file_path = os.path.join(directory, file) r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) - found = False + found = False # found error of classifier for warning in r["warnings"]: if warning.find(tl_type) != -1: found = True break if found: - total_incorrect_files += 1 - failed.append(file) + total_incorrect_files += 1 # count, where label != predict + failed.append(file) # file, where classifier failed return param_dist_errors(total_file_size, total_incorrect_files, failed) -if __name__ == "__main__": - data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data") - os.makedirs(data_dir, exist_ok=True) +def download_dataset(data_dir: str) -> str: benchmark_data_dir = os.path.join(data_dir, "data_with_text_layer") - if not os.path.isdir(benchmark_data_dir): path_out = os.path.join(data_dir, "data_with_text_layer.zip") wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out) @@ -67,6 +91,15 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para assert os.path.isdir(benchmark_data_dir) + return benchmark_data_dir + + +def evaluation_dedoc() -> None: + data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data") + os.makedirs(data_dir, exist_ok=True) + + benchmark_data_dir = download_dataset(data_dir) + result = OrderedDict() result["version"] = requests.get(f"{host}/version").text parameters = dict(pdf_with_text_layer="auto", pages="1:1") @@ -84,3 +117,85 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para with open(path_result, "w") as file_out: json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) print(f"Save result in {path_result}") + + +def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None: + data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data") + os.makedirs(data_dir, exist_ok=True) + + data_dir = download_dataset(data_dir) + + folder = os.path.join(data_dir, "data_correct_text_layer") + correct_files = np.array([os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith(".pdf")]) + folder = os.path.join(data_dir, "data_incorrect_text_layer") + incorrect_files = np.array([os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith(".pdf")]) + + files = np.append(correct_files, incorrect_files) + + labels = np.empty(files.size) + labels[:correct_files.size] = 0 # "correct" + labels[correct_files.size:] = 1 # "incorrect" + + failed_corrected_pdfs = [] + failed_incorrected_pdfs = [] + + # run pipeline for prediction + predicts = np.empty(files.size) + parameters = dict(pdf_with_text_layer="auto", pages="1:1") + times_correct, times_incorrect = [], [] + + if with_shuffle: + shuffle(files) + count = min(max_eval_pdf, len(files)) + + for i, file_path in enumerate(tqdm(files[:count])): + file_name = file_path.split("/")[-1] + + time_b = time() + r = send_file(host=host, file_name=file_name, file_path=file_path, parameters=parameters) + time_eval = time() - time_b + + if labels[i] == 0: + times_correct.append(time_eval) + else: + times_incorrect.append(time_eval) + + predicts[i] = 3 # "failed" not handling + for warning in r["warnings"]: + if "has incorrect textual layer" in warning: + predicts[i] = 1 # "incorrect" + if "has a correct textual layer" in warning: + predicts[i] = 0 # "correct" + + if predicts[i] != labels[i]: + failed_corrected_pdfs.append(file_name) if labels[i] == 0 else failed_incorrected_pdfs.append(file_name) + + labels, predicts = labels[:count], predicts[:count] + + b_accuracy = balanced_accuracy_score(labels, predicts) + accuracy = accuracy_score(labels, predicts) + w_avg = precision_recall_fscore_support(labels, predicts, average="weighted") + avg = precision_recall_fscore_support(labels, predicts, average=None, labels=[0, 1]) + + output = f"--- Balanced Accuracy --- = {b_accuracy}\n" + output += f"--- Accuracy --- = {accuracy}\n" + output += f"--- Weighted --- Precision = {w_avg[0]}, Recall={w_avg[1]}, F1={w_avg[2]}\n" + output += f"--- Class corrected --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n" + output += f"--- Class incorrected --- : Precision = {avg[0][1]}, Recall={avg[1][1]}, F1={avg[2][1]}\n" + + output += f"--- AVG Time corrected pdfs --- = {np.array(times_correct).mean()}\n" + output += f"--- AVG Time incorrected pdfs --- = {np.array(times_incorrect).mean()}\n" + output += f"--- AVG Time all pdfs --- = {np.array(times_correct + times_incorrect).mean()}\n" + + output += "--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs) # noqa + output += "--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs) # noqa + + print(output) + with open(path_result, "w") as file_out: + file_out.write(output) + print(f"Save result in {path_result}") + + +if __name__ == "__main__": + # evaluation_dedoc() + get_metrics(max_eval_pdf=50, with_shuffle=True) From a113c1947f6b90b9129fbd5946419b7d98cfa692 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Thu, 28 Nov 2024 18:33:54 +0300 Subject: [PATCH 2/3] TLDR-872 change correctness benchmark --- .../benchmarks/benchmarks_tl_correctness.txt | 27 ++++++ scripts/benchmark_tl_correctness.py | 85 ++----------------- 2 files changed, 33 insertions(+), 79 deletions(-) create mode 100644 resources/benchmarks/benchmarks_tl_correctness.txt diff --git a/resources/benchmarks/benchmarks_tl_correctness.txt b/resources/benchmarks/benchmarks_tl_correctness.txt new file mode 100644 index 00000000..351e9fd8 --- /dev/null +++ b/resources/benchmarks/benchmarks_tl_correctness.txt @@ -0,0 +1,27 @@ +Version = + +--- Balanced Accuracy --- = 0.843482905982906 +--- Accuracy --- = 0.9534883720930233 +--- Weighted --- Precision = 0.9519564983695847, Recall=0.9534883720930233, F1=0.9525762106576597 +--- Class corrected --- : Precision = 0.9703389830508474, Recall=0.9786324786324786, F1=0.9744680851063829 +--- Class incorrected --- : Precision = 0.7727272727272727, Recall=0.7083333333333334, F1=0.7391304347826088 +--- AVG Time corrected pdfs --- = 3.2058254999992175 +--- AVG Time incorrected pdfs --- = 4.9308231472969055 +--- AVG Time all pdfs --- = 3.3662903974222584 + + +--- Failed corrected pdfs --- : +hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf +demystifying-nge-rock-ridge_1643518222_537.pdf +b96a__usmc-combat-camera-directory.pdf +afcea-spy.pdf +access-the-vision-for-2013.pdf + +--- Failed incorrected pdfs --- : +Gromov_Dubova_-_Primenenie_metodov_TFKP_k_vychisleniyu_opredelennykh_integralov.pdf +PE157_1616278053_181.pdf +ЧММФ_Абакумов_учебник.pdf +EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf +slides.pdf +PE20_1616439522_1.pdf +Catalog-2020_dealers mail (1).pdf \ No newline at end of file diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index 0e7080c2..8379fd7b 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -1,7 +1,5 @@ -import json import os import zipfile -from collections import OrderedDict, namedtuple from time import time import numpy as np @@ -28,53 +26,6 @@ """ host = "http://localhost:1231" -param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed")) - - -def send_request_mineru(file_path: str, url: str) -> dict: - """ - send file `file_name` in post request with `data` as parameters. Expects that response return code - `expected_code` - - :param file_name: name of file (should lie src/tests/data folder - :param data: parameter dictionary (here you can put language for example) - :param expected_code: expected http response code. 200 for normal request - :return: result from json - """ - data = {"parse_method": "auto", "is_json_md_dump": True} - file_name = file_path.split("/")[-1] - - with open(file_path, "rb") as file: - files = {"pdf_file": (file_name, file)} - r = requests.post(url, files=files, data=data) - - if r.status_code != 200: - return r.content.decode() - else: - return json.loads(r.content.decode()) - - -def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple: - failed = [] - total_incorrect_files = 0 - directory = os.path.join(path_base, tl_path) - files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")] - total_file_size = len(files_list) - print(f"Files: {files_list}\nFiles number: {total_file_size}") - for file in tqdm(files_list): - file_path = os.path.join(directory, file) - r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) - - found = False # found error of classifier - for warning in r["warnings"]: - if warning.find(tl_type) != -1: - found = True - break - - if found: - total_incorrect_files += 1 # count, where label != predict - failed.append(file) # file, where classifier failed - return param_dist_errors(total_file_size, total_incorrect_files, failed) def download_dataset(data_dir: str) -> str: @@ -94,31 +45,6 @@ def download_dataset(data_dir: str) -> str: return benchmark_data_dir -def evaluation_dedoc() -> None: - data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data") - os.makedirs(data_dir, exist_ok=True) - - benchmark_data_dir = download_dataset(data_dir) - - result = OrderedDict() - result["version"] = requests.get(f"{host}/version").text - parameters = dict(pdf_with_text_layer="auto", pages="1:1") - result_item = OrderedDict() - - incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters) - result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size - result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed - - correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters) - result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size - result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed - result["guessing_the_correctness_of_the_text"] = result_item - - with open(path_result, "w") as file_out: - json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False) - print(f"Save result in {path_result}") - - def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None: data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data") os.makedirs(data_dir, exist_ok=True) @@ -177,7 +103,9 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None: w_avg = precision_recall_fscore_support(labels, predicts, average="weighted") avg = precision_recall_fscore_support(labels, predicts, average=None, labels=[0, 1]) - output = f"--- Balanced Accuracy --- = {b_accuracy}\n" + output = f"Version = {requests.get(host + '/version').text}\n\n" + + output += f"--- Balanced Accuracy --- = {b_accuracy}\n" output += f"--- Accuracy --- = {accuracy}\n" output += f"--- Weighted --- Precision = {w_avg[0]}, Recall={w_avg[1]}, F1={w_avg[2]}\n" output += f"--- Class corrected --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n" @@ -187,8 +115,8 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None: output += f"--- AVG Time incorrected pdfs --- = {np.array(times_incorrect).mean()}\n" output += f"--- AVG Time all pdfs --- = {np.array(times_correct + times_incorrect).mean()}\n" - output += "--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs) # noqa - output += "--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs) # noqa + output += "\n\n--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs) # noqa + output += "\n\n--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs) # noqa print(output) with open(path_result, "w") as file_out: @@ -197,5 +125,4 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None: if __name__ == "__main__": - # evaluation_dedoc() - get_metrics(max_eval_pdf=50, with_shuffle=True) + get_metrics() From e269a7d01b57d1b95cc43e8b17cd23e21336b9ff Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Tue, 3 Dec 2024 14:47:36 +0300 Subject: [PATCH 3/3] TLDR-872 after review --- .../benchmarks/benchmarks_tl_correctness.json | 21 ------------------- scripts/benchmark_tl_correctness.py | 11 ++++------ 2 files changed, 4 insertions(+), 28 deletions(-) delete mode 100644 resources/benchmarks/benchmarks_tl_correctness.json diff --git a/resources/benchmarks/benchmarks_tl_correctness.json b/resources/benchmarks/benchmarks_tl_correctness.json deleted file mode 100644 index f3fee769..00000000 --- a/resources/benchmarks/benchmarks_tl_correctness.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "version": "0.11.2", - "guessing_the_correctness_of_the_text": { - "percentage_of_guessed_correct_tl": 0.9785407725321889, - "list_of_file_with_incorrect_tl": [ - "hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf", - "afcea-spy.pdf", - "b96a__usmc-combat-camera-directory.pdf", - "access-the-vision-for-2013.pdf", - "demystifying-nge-rock-ridge_1643518222_537.pdf" - ], - "percentage_of_guessed_incorrect_tl": 0.7916666666666666, - "list_of_file_with_correct_tl": [ - "PE20_1616439522_1.pdf", - "slides.pdf", - "PE157_1616278053_181.pdf", - "EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf", - "ЧММФ_Абакумов_учебник.pdf" - ] - } -} \ No newline at end of file diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index 8379fd7b..5469f309 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -5,7 +5,6 @@ import numpy as np import requests import wget -from Cryptodome.Random.random import shuffle from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support from tqdm import tqdm @@ -45,7 +44,7 @@ def download_dataset(data_dir: str) -> str: return benchmark_data_dir -def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None: +def get_metrics(max_eval_pdf: int = 10000) -> None: data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data") os.makedirs(data_dir, exist_ok=True) @@ -70,8 +69,6 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None: parameters = dict(pdf_with_text_layer="auto", pages="1:1") times_correct, times_incorrect = [], [] - if with_shuffle: - shuffle(files) count = min(max_eval_pdf, len(files)) for i, file_path in enumerate(tqdm(files[:count])): @@ -111,9 +108,9 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None: output += f"--- Class corrected --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n" output += f"--- Class incorrected --- : Precision = {avg[0][1]}, Recall={avg[1][1]}, F1={avg[2][1]}\n" - output += f"--- AVG Time corrected pdfs --- = {np.array(times_correct).mean()}\n" - output += f"--- AVG Time incorrected pdfs --- = {np.array(times_incorrect).mean()}\n" - output += f"--- AVG Time all pdfs --- = {np.array(times_correct + times_incorrect).mean()}\n" + output += f"--- AVG Time corrected pdfs --- = {np.mean(times_correct)}\n" + output += f"--- AVG Time incorrected pdfs --- = {np.mean(times_incorrect)}\n" + output += f"--- AVG Time all pdfs --- = {np.mean(times_correct + times_incorrect)}\n" output += "\n\n--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs) # noqa output += "\n\n--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs) # noqa