From 5406b3534503a98d345f9aca4990f8359cd11d93 Mon Sep 17 00:00:00 2001
From: "Jan C. Brammer"
Date: Thu, 31 Oct 2024 13:29:38 +0000
Subject: [PATCH] Don't gitignore PubChem scripts

---
Note (ignored by git-am): the three new-file hunks below were revised after
export, so their blob ids changed and the optional "index" lines were dropped.

 .gitignore                                    |  1 +
 .../test_library/data/pubchem/download.py     | 25 ++++++++++++
 .../tests/test_library/data/pubchem/utils.py  | 35 +++++++++++++++++
 .../test_library/data/pubchem/validate.py     | 38 +++++++++++++++++++
 4 files changed, 99 insertions(+)
 create mode 100644 INCHI-1-TEST/tests/test_library/data/pubchem/download.py
 create mode 100644 INCHI-1-TEST/tests/test_library/data/pubchem/utils.py
 create mode 100644 INCHI-1-TEST/tests/test_library/data/pubchem/validate.py

diff --git a/.gitignore b/.gitignore
index 55e48e4..057dada 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ INCHI-1-TEST/libs
 INCHI-1-TEST/tests/test_library/data/**/*.html
 INCHI-1-TEST/tests/test_library/data/**/*.log
 INCHI-1-TEST/tests/test_library/data/pubchem/**
+!INCHI-1-TEST/tests/test_library/data/pubchem/*.py
 INCHI-1-SRC/INCHI_API/bin*
 INCHI-1-SRC/INCHI_EXE/bin*
 !INCHI-1-SRC/INCHI_EXE/bin/Linux/64bit/.keep
diff --git a/INCHI-1-TEST/tests/test_library/data/pubchem/download.py b/INCHI-1-TEST/tests/test_library/data/pubchem/download.py
new file mode 100644
--- /dev/null
+++ b/INCHI-1-TEST/tests/test_library/data/pubchem/download.py
@@ -0,0 +1,25 @@
+"""Mirror the SDF archives of one PubChem dataset into PUBCHEM_DIR/<dataset>."""
+
+import subprocess
+from .utils import get_dataset_arg, DOWNLOAD_PATHS, PUBCHEM_DIR
+
+
+if __name__ == "__main__":
+    dataset = get_dataset_arg()
+
+    # https://depth-first.com/articles/2010/02/09/big-data-in-chemistry-mirroring-pubchem-the-easy-way-part-2/
+    # Hand wget an argument list instead of splitting a formatted string:
+    # a checkout path containing whitespace then stays a single argument.
+    download_command = [
+        "wget",
+        "--mirror",
+        "--directory-prefix",
+        str(PUBCHEM_DIR.joinpath(dataset)),
+        "--no-directories",
+        "--continue",
+        "--accept",
+        "*.sdf.gz,*.sdf.gz.md5",
+        f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/{DOWNLOAD_PATHS[dataset]}/SDF/",
+    ]
+
+    subprocess.run(download_command, check=True)
diff --git a/INCHI-1-TEST/tests/test_library/data/pubchem/utils.py b/INCHI-1-TEST/tests/test_library/data/pubchem/utils.py
new file mode 100644
--- /dev/null
+++ b/INCHI-1-TEST/tests/test_library/data/pubchem/utils.py
@@ -0,0 +1,35 @@
+"""Constants and command-line helper shared by the PubChem data scripts."""
+
+import argparse
+from pathlib import Path
+
+
+DATASETS = ["compound", "compound3d", "substance"]
+DOWNLOAD_PATHS = {
+    "compound": "Compound/CURRENT-Full",
+    "compound3d": "Compound_3D/01_conf_per_cmpd",
+    "substance": "Substance/CURRENT-Full",
+}
+PUBCHEM_DIR = Path(__file__).parent.absolute()
+
+
+def get_dataset_arg() -> str:
+    """Return the dataset name given as the positional command-line argument.
+
+    argparse restricts the value to DATASETS and exits with a usage message
+    for anything else.
+    """
+    parser = argparse.ArgumentParser(
+        description="Choose a dataset.",
+    )
+    parser.add_argument(
+        "dataset",
+        choices=DATASETS,
+        type=str,
+        # Join the list rather than formatting set(DATASETS): string hashing is
+        # randomised per process, so a set would print in a varying order.
+        help=f"Choose a dataset from {', '.join(DATASETS)}",
+    )
+    args = parser.parse_args()
+
+    return args.dataset
diff --git a/INCHI-1-TEST/tests/test_library/data/pubchem/validate.py b/INCHI-1-TEST/tests/test_library/data/pubchem/validate.py
new file mode 100644
--- /dev/null
+++ b/INCHI-1-TEST/tests/test_library/data/pubchem/validate.py
@@ -0,0 +1,38 @@
+"""Compare downloaded PubChem SDF archives with their server-side MD5 sums."""
+
+import hashlib
+from inchi_tests.utils import get_progress
+from .utils import get_dataset_arg, PUBCHEM_DIR
+
+
+if __name__ == "__main__":
+    dataset = get_dataset_arg()
+    sdf_paths = sorted(PUBCHEM_DIR.joinpath(dataset).glob("*.sdf.gz"))
+    n_sdf = len(sdf_paths)
+
+    for i, sdf_path in enumerate(sdf_paths, start=1):
+        with open(sdf_path, "rb") as sdf_file:
+            print(
+                f"{get_progress(i, n_sdf)}; Validating integrity of {sdf_path.name}."
+            )
+            local_hash = hashlib.file_digest(sdf_file, "md5").hexdigest()
+
+        # "x.sdf.gz" -> "x.sdf.gz.md5": only the final ".gz" suffix is replaced.
+        md5_path = sdf_path.with_suffix(".gz.md5")
+        try:
+            md5_fields = md5_path.read_text().split()
+        except FileNotFoundError as e:
+            print(e)
+            continue
+
+        # An empty checksum file (e.g. from an interrupted download) has no
+        # first field; report it instead of failing with IndexError.
+        if not md5_fields:
+            print(f"{sdf_path.name}: checksum file {md5_path.name} is empty.")
+            continue
+
+        server_hash = md5_fields[0]
+        if local_hash != server_hash:
+            print(
+                f"{sdf_path.name}: local hash {local_hash} does not match server hash {server_hash}."
+            )