Skip to content

Commit

Permalink
fix test_pdf_miner script
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikita Shevtsov committed Oct 17, 2023
1 parent 381deaa commit a66d2a9
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 10 deletions.
25 changes: 15 additions & 10 deletions dedoc/scripts/test_pdf_miner.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import json
import os
from pathlib import Path
import re
from pathlib import Path
from tempfile import TemporaryDirectory

import requests

from dedoc.api.api_utils import json2txt
from dedoc.dedoc_manager import DedocManager
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor


import requests

URL = "https://at.ispras.ru/owncloud/index.php/s/uImxYhliBHU8ei7/download"

Expand All @@ -20,22 +20,24 @@
f.write(response.content)
import zipfile

with zipfile.ZipFile((Path(tmpdir) / "pdfs.zip"), 'r') as zip_ref:
with zipfile.ZipFile((Path(tmpdir) / "pdfs.zip"), "r") as zip_ref:
zip_ref.extractall((Path(tmpdir)))
os.remove(Path(tmpdir) / "pdfs.zip")
pdfs_path = Path(tmpdir) / 'PdfMiner params '
pdfs_path = Path(tmpdir) / "PdfMiner params "

manager = DedocManager()
for file in os.listdir(pdfs_path):
result = manager.parse(file_path=str(pdfs_path / file), parameters={"pdf_with_text_layer": "true"})
txt_content = json2txt(paragraph=result.content.structure)
with (Path(tmpdir) / 'ocr.txt').open('w') as f:
with (Path(tmpdir) / "ocr.txt").open("w") as f:
f.write(txt_content)

accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy"))
gt_path = Path("pdf_ground_truths") / (file[:-3] + 'txt')
tmp_ocr_path = Path(tmpdir) / 'ocr.txt'
gt_path = Path("pdf_ground_truths") / (file[:-3] + "txt")
tmp_ocr_path = Path(tmpdir) / "ocr.txt"
accuracy_path = Path(tmpdir) / "accuracy.txt"
if accuracy_path.exists():
accuracy_path.unlink()
command = f"{accuracy_script_path} \"{gt_path}\" {tmp_ocr_path} >> {accuracy_path}"
os.system(command)

Expand All @@ -47,5 +49,8 @@
acc_percent = re.findall(r"\d+\.\d+", matched[0])[0][:-1]
info[str(file)] = acc_percent

print(info)
# Persist the per-file accuracy figures collected in `info` as JSON under
# resources/benchmarks, two directory levels above this script's own directory.
output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
# Create the target directory if it is missing; otherwise opening the file
# for writing below would raise FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)
with (Path(output_dir) / "test_pdf_miner.json").open("w") as f:
    json.dump(info, f)

# The separating space keeps the message readable ("save result in /path", not "save result in/path").
print("save result in " + output_dir)
13 changes: 13 additions & 0 deletions resources/benchmarks/test_pdf_miner.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"\u041e\u0431\u0440\u0430\u0437\u0435\u0446 \u043f\u0440\u0438\u043c\u0435\u0440\u043d\u043e\u0433\u043e \u0437\u0430\u043f\u043e\u043b\u043d\u0435\u043d\u0438\u044f \u0443\u0432\u0435\u0434\u043e\u043c\u043b\u0435\u043d\u0438\u044f \u043e\u0431 \u043e\u0442\u0441\u0443\u0442\u0441\u0442\u0432\u0438\u0438 \u0446\u0438\u0444\u0440\u043e\u0432\u044b\u0445 \u0444\u0438\u043d\u0430\u043d\u0441\u043e\u0432\u044b\u0445 \u0430\u043a\u0442\u0438\u0432\u043e\u0432.pdf": "100.0",
"2023 \u0413\u043e\u043d\u043e\u0447\u043d\u0430\u044f \u0418\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438\u044f CR 2023.pdf": "100.0",
"support_182_poisk-dokumentov.pdf": "100.0",
"6.1 \u041e\u043f\u0438\u0441\u0430\u043d\u0438\u0435 \u043f\u0440\u043e\u0435\u043a\u0442\u0430 Thalamus.pdf": "100.0",
"ECPPM2020_Instructions.pdf": "100.0",
"NOR CHR 2023.pdf": "100.0",
"2-column-state.pdf": "100.0",
"ba-2017.pdf": "100.0",
"\u041c\u0435\u0436\u0434\u0443\u043d\u0430\u0440\u043e\u0434\u043d\u043e\u0435 \u0438 \u043d\u0430\u0446\u0438\u043e\u043d\u0430\u043b\u044c\u043d\u043e\u0435 \u0441\u043f\u043e\u0440\u0442\u0438\u0432\u043d\u043e\u0435 \u043f\u0440\u0430\u0432\u043e \u043f\u043e\u0440\u0442\u0444\u043e\u043b\u0438\u043e_\u0440\u0443\u0441.pdf": "100.0",
"Uvedoml_ESN.pdf": "100.0",
"instruction_gibdd.pdf": "100.0"
}

0 comments on commit a66d2a9

Please sign in to comment.