From 338508ee59f914682e910dce7bb564906bce12fb Mon Sep 17 00:00:00 2001
From: Belyaeva Oksana <belyaeva@ispras.ru>
Date: Thu, 28 Nov 2024 13:20:43 +0300
Subject: [PATCH 1/3] TLDR-872 rewrite correctness script

---
 scripts/benchmark_tl_correctness.py | 131 ++++++++++++++++++++++++++--
 1 file changed, 123 insertions(+), 8 deletions(-)

diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py
index 2538cdef..0e7080c2 100644
--- a/scripts/benchmark_tl_correctness.py
+++ b/scripts/benchmark_tl_correctness.py
@@ -2,9 +2,13 @@
 import os
 import zipfile
 from collections import OrderedDict, namedtuple
+from time import time
 
+import numpy as np
 import requests
 import wget
+from Cryptodome.Random.random import shuffle
+from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
 from tqdm import tqdm
 
 from dedoc.config import get_config
@@ -12,7 +16,7 @@
 
 path_result = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks"))
 os.makedirs(path_result, exist_ok=True)
-path_result = os.path.join(path_result, "benchmarks_tl_correctness.json")
+path_result = os.path.join(path_result, "benchmarks_tl_correctness.txt")
 
 """
 Experiments are available -> https://github.com/alexander1999-hub/txt_layer_correctness/tree/main :
@@ -27,6 +31,29 @@
 param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))
 
 
+def send_request_mineru(file_path: str, url: str) -> dict:
+    """
+    send file `file_name` in post request with `data` as parameters. Expects that response return code
+    `expected_code`
+
+    :param file_name: name of file (should lie  src/tests/data folder
+    :param data: parameter dictionary (here you can put language for example)
+    :param expected_code: expected http response code. 200 for normal request
+    :return: result from json
+    """
+    data = {"parse_method": "auto", "is_json_md_dump": True}
+    file_name = file_path.split("/")[-1]
+
+    with open(file_path, "rb") as file:
+        files = {"pdf_file": (file_name, file)}
+        r = requests.post(url, files=files, data=data)
+
+        if r.status_code != 200:
+            return r.content.decode()
+        else:
+            return json.loads(r.content.decode())
+
+
 def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:
     failed = []
     total_incorrect_files = 0
@@ -38,23 +65,20 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
         file_path = os.path.join(directory, file)
         r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)
 
-        found = False
+        found = False  # found error of classifier
         for warning in r["warnings"]:
             if warning.find(tl_type) != -1:
                 found = True
                 break
 
         if found:
-            total_incorrect_files += 1
-            failed.append(file)
+            total_incorrect_files += 1  # count, where label != predict
+            failed.append(file)         # file, where classifier failed
     return param_dist_errors(total_file_size, total_incorrect_files, failed)
 
 
-if __name__ == "__main__":
-    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
-    os.makedirs(data_dir, exist_ok=True)
+def download_dataset(data_dir: str) -> str:
     benchmark_data_dir = os.path.join(data_dir, "data_with_text_layer")
-
     if not os.path.isdir(benchmark_data_dir):
         path_out = os.path.join(data_dir, "data_with_text_layer.zip")
         wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out)
@@ -67,6 +91,15 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
 
     assert os.path.isdir(benchmark_data_dir)
 
+    return benchmark_data_dir
+
+
+def evaluation_dedoc() -> None:
+    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
+    os.makedirs(data_dir, exist_ok=True)
+
+    benchmark_data_dir = download_dataset(data_dir)
+
     result = OrderedDict()
     result["version"] = requests.get(f"{host}/version").text
     parameters = dict(pdf_with_text_layer="auto", pages="1:1")
@@ -84,3 +117,85 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para
     with open(path_result, "w") as file_out:
         json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
     print(f"Save result in {path_result}")
+
+
+def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None:
+    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_dir = download_dataset(data_dir)
+    # Measure how well the service at `host` classifies pdf text layers and save the metrics to `path_result`.
+    correct_dir = os.path.join(data_dir, "data_correct_text_layer")
+    correct_files = np.array([os.path.join(correct_dir, file_name) for file_name in os.listdir(correct_dir) if file_name.endswith(".pdf")])
+    incorrect_dir = os.path.join(data_dir, "data_incorrect_text_layer")
+    incorrect_files = np.array([os.path.join(incorrect_dir, file_name) for file_name in os.listdir(incorrect_dir) if file_name.endswith(".pdf")])
+
+    files = np.append(correct_files, incorrect_files)
+    # Ground truth: 0 for pdfs from data_correct_text_layer, 1 for pdfs from data_incorrect_text_layer.
+    labels = np.empty(files.size)
+    labels[:correct_files.size] = 0  # "correct"
+    labels[correct_files.size:] = 1  # "incorrect"
+
+    failed_corrected_pdfs = []
+    failed_incorrected_pdfs = []
+
+    # run pipeline for prediction on at most max_eval_pdf files
+    predicts = np.empty(files.size)
+    parameters = dict(pdf_with_text_layer="auto", pages="1:1")
+    times_correct, times_incorrect = [], []
+
+    if with_shuffle:
+        shuffle(files)
+    count = min(max_eval_pdf, len(files))
+
+    for i, file_path in enumerate(tqdm(files[:count])):
+        file_name = os.path.basename(file_path)
+        labels[i] = 0 if os.path.dirname(file_path) == correct_dir else 1  # taken from the file's folder: right for any order of files
+        time_b = time()
+        r = send_file(host=host, file_name=file_name, file_path=file_path, parameters=parameters)
+        time_eval = time() - time_b
+
+        if labels[i] == 0:
+            times_correct.append(time_eval)
+        else:
+            times_incorrect.append(time_eval)
+
+        predicts[i] = 3  # no textual-layer verdict among the warnings; never equals a label, so it counts as an error
+        for warning in r["warnings"]:
+            if "has incorrect textual layer" in warning:
+                predicts[i] = 1  # "incorrect"
+            if "has a correct textual layer" in warning:
+                predicts[i] = 0  # "correct"
+
+        if predicts[i] != labels[i]:
+            (failed_corrected_pdfs if labels[i] == 0 else failed_incorrected_pdfs).append(file_name)
+
+    labels, predicts = labels[:count], predicts[:count]
+
+    b_accuracy = balanced_accuracy_score(labels, predicts)
+    accuracy = accuracy_score(labels, predicts)
+    w_avg = precision_recall_fscore_support(labels, predicts, average="weighted")
+    avg = precision_recall_fscore_support(labels, predicts, average=None, labels=[0, 1])
+
+    output = f"--- Balanced Accuracy --- = {b_accuracy}\n"
+    output += f"--- Accuracy --- = {accuracy}\n"
+    output += f"--- Weighted --- Precision = {w_avg[0]}, Recall={w_avg[1]}, F1={w_avg[2]}\n"
+    output += f"--- Class corrected --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n"
+    output += f"--- Class incorrected --- : Precision = {avg[0][1]}, Recall={avg[1][1]}, F1={avg[2][1]}\n"
+
+    output += f"--- AVG Time corrected pdfs --- = {np.array(times_correct).mean()}\n"
+    output += f"--- AVG Time incorrected pdfs --- = {np.array(times_incorrect).mean()}\n"
+    output += f"--- AVG Time all pdfs --- = {np.array(times_correct + times_incorrect).mean()}\n"
+
+    output += "--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs)  # noqa
+    output += "--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs)  # noqa
+
+    print(output)
+    with open(path_result, "w") as file_out:
+        file_out.write(output)
+    print(f"Save result in {path_result}")
+
+
+if __name__ == "__main__":
+    # evaluation_dedoc()
+    get_metrics(max_eval_pdf=50, with_shuffle=True)

From a113c1947f6b90b9129fbd5946419b7d98cfa692 Mon Sep 17 00:00:00 2001
From: Belyaeva Oksana <belyaeva@ispras.ru>
Date: Thu, 28 Nov 2024 18:33:54 +0300
Subject: [PATCH 2/3] TLDR-872 change correctness benchmark

---
 .../benchmarks/benchmarks_tl_correctness.txt  | 27 ++++++
 scripts/benchmark_tl_correctness.py           | 85 ++-----------------
 2 files changed, 33 insertions(+), 79 deletions(-)
 create mode 100644 resources/benchmarks/benchmarks_tl_correctness.txt

diff --git a/resources/benchmarks/benchmarks_tl_correctness.txt b/resources/benchmarks/benchmarks_tl_correctness.txt
new file mode 100644
index 00000000..351e9fd8
--- /dev/null
+++ b/resources/benchmarks/benchmarks_tl_correctness.txt
@@ -0,0 +1,27 @@
+Version = 
+
+--- Balanced Accuracy --- = 0.843482905982906
+--- Accuracy --- = 0.9534883720930233
+--- Weighted --- Precision = 0.9519564983695847, Recall=0.9534883720930233, F1=0.9525762106576597
+--- Class corrected --- : Precision = 0.9703389830508474, Recall=0.9786324786324786, F1=0.9744680851063829
+--- Class incorrected --- : Precision = 0.7727272727272727, Recall=0.7083333333333334, F1=0.7391304347826088
+--- AVG Time corrected pdfs --- = 3.2058254999992175
+--- AVG Time incorrected pdfs --- = 4.9308231472969055
+--- AVG Time all pdfs --- = 3.3662903974222584
+
+
+--- Failed corrected pdfs --- : 
+hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf
+demystifying-nge-rock-ridge_1643518222_537.pdf
+b96a__usmc-combat-camera-directory.pdf
+afcea-spy.pdf
+access-the-vision-for-2013.pdf
+
+--- Failed incorrected pdfs --- : 
+Gromov_Dubova_-_Primenenie_metodov_TFKP_k_vychisleniyu_opredelennykh_integralov.pdf
+PE157_1616278053_181.pdf
+╨º╨£╨£╨ñ_╨É╨▒╨░╨║╤â╨╝╨╛╨▓_╤â╤ç╨╡╨▒╨╜╨╕╨║.pdf
+EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf
+slides.pdf
+PE20_1616439522_1.pdf
+Catalog-2020_dealers mail (1).pdf
\ No newline at end of file
diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py
index 0e7080c2..8379fd7b 100644
--- a/scripts/benchmark_tl_correctness.py
+++ b/scripts/benchmark_tl_correctness.py
@@ -1,7 +1,5 @@
-import json
 import os
 import zipfile
-from collections import OrderedDict, namedtuple
 from time import time
 
 import numpy as np
@@ -28,53 +26,6 @@
 """
 
 host = "http://localhost:1231"
-param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))
-
-
-def send_request_mineru(file_path: str, url: str) -> dict:
-    """
-    send file `file_name` in post request with `data` as parameters. Expects that response return code
-    `expected_code`
-
-    :param file_name: name of file (should lie  src/tests/data folder
-    :param data: parameter dictionary (here you can put language for example)
-    :param expected_code: expected http response code. 200 for normal request
-    :return: result from json
-    """
-    data = {"parse_method": "auto", "is_json_md_dump": True}
-    file_name = file_path.split("/")[-1]
-
-    with open(file_path, "rb") as file:
-        files = {"pdf_file": (file_name, file)}
-        r = requests.post(url, files=files, data=data)
-
-        if r.status_code != 200:
-            return r.content.decode()
-        else:
-            return json.loads(r.content.decode())
-
-
-def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, parameters: dict) -> namedtuple:
-    failed = []
-    total_incorrect_files = 0
-    directory = os.path.join(path_base, tl_path)
-    files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")]
-    total_file_size = len(files_list)
-    print(f"Files: {files_list}\nFiles number: {total_file_size}")
-    for file in tqdm(files_list):
-        file_path = os.path.join(directory, file)
-        r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)
-
-        found = False  # found error of classifier
-        for warning in r["warnings"]:
-            if warning.find(tl_type) != -1:
-                found = True
-                break
-
-        if found:
-            total_incorrect_files += 1  # count, where label != predict
-            failed.append(file)         # file, where classifier failed
-    return param_dist_errors(total_file_size, total_incorrect_files, failed)
 
 
 def download_dataset(data_dir: str) -> str:
@@ -94,31 +45,6 @@ def download_dataset(data_dir: str) -> str:
     return benchmark_data_dir
 
 
-def evaluation_dedoc() -> None:
-    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
-    os.makedirs(data_dir, exist_ok=True)
-
-    benchmark_data_dir = download_dataset(data_dir)
-
-    result = OrderedDict()
-    result["version"] = requests.get(f"{host}/version").text
-    parameters = dict(pdf_with_text_layer="auto", pages="1:1")
-    result_item = OrderedDict()
-
-    incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, " incorrect ", "data_correct_text_layer", parameters)
-    result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size
-    result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed
-
-    correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, " correct ", "data_incorrect_text_layer", parameters)
-    result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size
-    result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
-    result["guessing_the_correctness_of_the_text"] = result_item
-
-    with open(path_result, "w") as file_out:
-        json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
-    print(f"Save result in {path_result}")
-
-
 def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None:
     data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
     os.makedirs(data_dir, exist_ok=True)
@@ -177,7 +103,9 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None:
     w_avg = precision_recall_fscore_support(labels, predicts, average="weighted")
     avg = precision_recall_fscore_support(labels, predicts, average=None, labels=[0, 1])
 
-    output = f"--- Balanced Accuracy --- = {b_accuracy}\n"
+    output = f"Version = {requests.get(host + '/version').text}\n\n"
+
+    output += f"--- Balanced Accuracy --- = {b_accuracy}\n"
     output += f"--- Accuracy --- = {accuracy}\n"
     output += f"--- Weighted --- Precision = {w_avg[0]}, Recall={w_avg[1]}, F1={w_avg[2]}\n"
     output += f"--- Class corrected --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n"
@@ -187,8 +115,8 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None:
     output += f"--- AVG Time incorrected pdfs --- = {np.array(times_incorrect).mean()}\n"
     output += f"--- AVG Time all pdfs --- = {np.array(times_correct + times_incorrect).mean()}\n"
 
-    output += "--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs)  # noqa
-    output += "--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs)  # noqa
+    output += "\n\n--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs)  # noqa
+    output += "\n\n--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs)  # noqa
 
     print(output)
     with open(path_result, "w") as file_out:
@@ -197,5 +125,4 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None:
 
 
 if __name__ == "__main__":
-    # evaluation_dedoc()
-    get_metrics(max_eval_pdf=50, with_shuffle=True)
+    get_metrics()

From e269a7d01b57d1b95cc43e8b17cd23e21336b9ff Mon Sep 17 00:00:00 2001
From: Belyaeva Oksana <belyaeva@ispras.ru>
Date: Tue, 3 Dec 2024 14:47:36 +0300
Subject: [PATCH 3/3] TLDR-872 after review

---
 .../benchmarks/benchmarks_tl_correctness.json | 21 -------------------
 scripts/benchmark_tl_correctness.py           | 11 ++++------
 2 files changed, 4 insertions(+), 28 deletions(-)
 delete mode 100644 resources/benchmarks/benchmarks_tl_correctness.json

diff --git a/resources/benchmarks/benchmarks_tl_correctness.json b/resources/benchmarks/benchmarks_tl_correctness.json
deleted file mode 100644
index f3fee769..00000000
--- a/resources/benchmarks/benchmarks_tl_correctness.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-    "version": "0.11.2",
-    "guessing_the_correctness_of_the_text": {
-        "percentage_of_guessed_correct_tl": 0.9785407725321889,
-        "list_of_file_with_incorrect_tl": [
-            "hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf",
-            "afcea-spy.pdf",
-            "b96a__usmc-combat-camera-directory.pdf",
-            "access-the-vision-for-2013.pdf",
-            "demystifying-nge-rock-ridge_1643518222_537.pdf"
-        ],
-        "percentage_of_guessed_incorrect_tl": 0.7916666666666666,
-        "list_of_file_with_correct_tl": [
-            "PE20_1616439522_1.pdf",
-            "slides.pdf",
-            "PE157_1616278053_181.pdf",
-            "EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf",
-            "╨º╨£╨£╨ñ_╨É╨▒╨░╨║╤â╨╝╨╛╨▓_╤â╤ç╨╡╨▒╨╜╨╕╨║.pdf"
-        ]
-    }
-}
\ No newline at end of file
diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py
index 8379fd7b..5469f309 100644
--- a/scripts/benchmark_tl_correctness.py
+++ b/scripts/benchmark_tl_correctness.py
@@ -5,7 +5,6 @@
 import numpy as np
 import requests
 import wget
-from Cryptodome.Random.random import shuffle
 from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
 from tqdm import tqdm
 
@@ -45,7 +44,7 @@ def download_dataset(data_dir: str) -> str:
     return benchmark_data_dir
 
 
-def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None:
+def get_metrics(max_eval_pdf: int = 10000) -> None:
     data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
     os.makedirs(data_dir, exist_ok=True)
 
@@ -70,8 +69,6 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None:
     parameters = dict(pdf_with_text_layer="auto", pages="1:1")
     times_correct, times_incorrect = [], []
 
-    if with_shuffle:
-        shuffle(files)
     count = min(max_eval_pdf, len(files))
 
     for i, file_path in enumerate(tqdm(files[:count])):
@@ -111,9 +108,9 @@ def get_metrics(max_eval_pdf: int = 10000, with_shuffle: bool = False) -> None:
     output += f"--- Class corrected --- : Precision = {avg[0][0]}, Recall={avg[1][0]}, F1={avg[2][0]}\n"
     output += f"--- Class incorrected --- : Precision = {avg[0][1]}, Recall={avg[1][1]}, F1={avg[2][1]}\n"
 
-    output += f"--- AVG Time corrected pdfs --- = {np.array(times_correct).mean()}\n"
-    output += f"--- AVG Time incorrected pdfs --- = {np.array(times_incorrect).mean()}\n"
-    output += f"--- AVG Time all pdfs --- = {np.array(times_correct + times_incorrect).mean()}\n"
+    output += f"--- AVG Time corrected pdfs --- = {np.mean(times_correct)}\n"
+    output += f"--- AVG Time incorrected pdfs --- = {np.mean(times_incorrect)}\n"
+    output += f"--- AVG Time all pdfs --- = {np.mean(times_correct + times_incorrect)}\n"
 
     output += "\n\n--- Failed corrected pdfs --- : \n" + '\n'.join(failed_corrected_pdfs)  # noqa
     output += "\n\n--- Failed incorrected pdfs --- : \n" + '\n'.join(failed_incorrected_pdfs)  # noqa