Don't gitignore PubChem scripts

IUPAC-InChI · Oct 31, 2024 · 5406b35 · 5406b35
1 parent b2f25cf
commit 5406b35
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@ INCHI-1-TEST/libs
 INCHI-1-TEST/tests/test_library/data/**/*.html
 INCHI-1-TEST/tests/test_library/data/**/*.log
 INCHI-1-TEST/tests/test_library/data/pubchem/**
+!INCHI-1-TEST/tests/test_library/data/pubchem/*.py
 INCHI-1-SRC/INCHI_API/bin*
 INCHI-1-SRC/INCHI_EXE/bin*
 !INCHI-1-SRC/INCHI_EXE/bin/Linux/64bit/.keep

diff --git a/INCHI-1-TEST/tests/test_library/data/pubchem/download.py b/INCHI-1-TEST/tests/test_library/data/pubchem/download.py
@@ -0,0 +1,16 @@
+import subprocess
+import shlex
+from .utils import get_dataset_arg, DOWNLOAD_PATHS, PUBCHEM_DIR
+
+
+if __name__ == "__main__":
+    # Dataset name chosen on the command line; used both as the local
+    # sub-directory name and as the key into DOWNLOAD_PATHS.
+    dataset = get_dataset_arg()
+
+    # Mirror the dataset's gzipped SDF files and their MD5 checksum files
+    # from the PubChem FTP server into PUBCHEM_DIR/<dataset>.
+    # https://depth-first.com/articles/2010/02/09/big-data-in-chemistry-mirroring-pubchem-the-easy-way-part-2/
+    #
+    # The command is assembled as an argument list instead of a single string
+    # that is split afterwards: a download directory containing whitespace,
+    # quotes or backslashes would otherwise be broken into several (wrong)
+    # arguments before reaching wget.
+    download_command = [
+        "wget",
+        "--mirror",
+        "--directory-prefix",
+        str(PUBCHEM_DIR.joinpath(dataset)),
+        "--no-directories",
+        "--continue",
+        "--accept",
+        "*.sdf.gz,*.sdf.gz.md5",
+        f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/{DOWNLOAD_PATHS[dataset]}/SDF/",
+    ]
+
+    # check=True raises CalledProcessError when wget exits with a non-zero status.
+    subprocess.run(download_command, check=True)
diff --git a/INCHI-1-TEST/tests/test_library/data/pubchem/utils.py b/INCHI-1-TEST/tests/test_library/data/pubchem/utils.py
@@ -0,0 +1,26 @@
+import argparse
+from pathlib import Path
+
+
+# Dataset names accepted on the command line (see `choices=DATASETS` below).
+DATASETS = ["compound", "compound3d", "substance"]
+# Maps each dataset name to a server-side path fragment; download.py inserts
+# it between ".../pubchem/" and "/SDF/" when building the FTP URL.
+DOWNLOAD_PATHS = {
+    "compound": "Compound/CURRENT-Full",
+    "compound3d": "Compound_3D/01_conf_per_cmpd",
+    "substance": "Substance/CURRENT-Full",
+}
+# Absolute path of the directory that contains this module.
+PUBCHEM_DIR = Path(__file__).parent.absolute()
+
+
+def get_dataset_arg() -> str:
+    """Parse the command line and return the selected dataset name.
+
+    Expects exactly one positional argument, which must be one of the
+    entries in ``DATASETS``. On a missing or invalid argument, argparse
+    prints a usage message and exits the process.
+
+    Returns:
+        The dataset name given on the command line.
+    """
+    parser = argparse.ArgumentParser(
+        description="Choose a dataset.",
+    )
+    parser.add_argument(
+        "dataset",
+        choices=DATASETS,
+        type=str,
+        # List the choices in declaration order; formatting them through a
+        # set made the order of the help text vary from run to run.
+        help=f"Choose a dataset from {', '.join(DATASETS)}",
+    )
+    args = parser.parse_args()
+
+    return args.dataset
diff --git a/INCHI-1-TEST/tests/test_library/data/pubchem/validate.py b/INCHI-1-TEST/tests/test_library/data/pubchem/validate.py
@@ -0,0 +1,25 @@
+import hashlib
+from inchi_tests.utils import get_progress
+from .utils import get_dataset_arg, PUBCHEM_DIR
+
+
+if __name__ == "__main__":
+    # Compare the MD5 digest of every downloaded "*.sdf.gz" file of the chosen
+    # dataset with the digest stored in the "<file>.md5" file next to it, and
+    # print a message for every file that cannot be verified.
+    dataset = get_dataset_arg()
+    sdf_paths = sorted(PUBCHEM_DIR.joinpath(dataset).glob("*.sdf.gz"))
+    n_sdf = len(sdf_paths)
+
+    for i, sdf_path in enumerate(sdf_paths, start=1):
+        print(f"{get_progress(i, n_sdf)}; Validating integrity of {sdf_path.name}.")
+
+        # "<name>.sdf.gz" -> "<name>.sdf.gz.md5"
+        md5_path = sdf_path.with_suffix(".gz.md5")
+
+        # Read the expected digest first, so that an archive without a usable
+        # checksum file is not hashed for nothing.
+        try:
+            md5_fields = md5_path.read_text().split()
+        except FileNotFoundError as e:
+            print(e)
+            continue
+        if not md5_fields:
+            # An empty checksum file used to abort the whole run with an
+            # IndexError; report it and carry on with the remaining files.
+            print(f"{md5_path.name}: checksum file is empty.")
+            continue
+        # The digest is the first whitespace-separated field of the file.
+        server_hash = md5_fields[0]
+
+        with open(sdf_path, "rb") as sdf_file:
+            local_hash = hashlib.file_digest(sdf_file, "md5").hexdigest()
+
+        if local_hash != server_hash:
+            print(
+                f"{sdf_path.name}: local hash {local_hash} does not match server hash {server_hash}."
+            )