Skip to content

Commit

Permalink
Don't gitignore PubChem scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
JanCBrammer committed Oct 31, 2024
1 parent b2f25cf commit 5406b35
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ INCHI-1-TEST/libs
INCHI-1-TEST/tests/test_library/data/**/*.html
INCHI-1-TEST/tests/test_library/data/**/*.log
INCHI-1-TEST/tests/test_library/data/pubchem/**
!INCHI-1-TEST/tests/test_library/data/pubchem/*.py
INCHI-1-SRC/INCHI_API/bin*
INCHI-1-SRC/INCHI_EXE/bin*
!INCHI-1-SRC/INCHI_EXE/bin/Linux/64bit/.keep
Expand Down
16 changes: 16 additions & 0 deletions INCHI-1-TEST/tests/test_library/data/pubchem/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import subprocess
import shlex
from .utils import get_dataset_arg, DOWNLOAD_PATHS, PUBCHEM_DIR


if __name__ == "__main__":
    # Script entry point: mirror the SDF archives (and their .md5 checksum
    # files) of the dataset chosen on the command line into a sub-directory
    # of PUBCHEM_DIR named after that dataset.
    dataset = get_dataset_arg()

    # https://depth-first.com/articles/2010/02/09/big-data-in-chemistry-mirroring-pubchem-the-easy-way-part-2/
    # The command is passed as an argument list rather than as a string that
    # is tokenised afterwards: the target directory is derived from this
    # file's location, and a checkout path containing whitespace or quote
    # characters would otherwise be split into several wget arguments.
    download_command = [
        "wget",
        "--mirror",
        "--directory-prefix",
        str(PUBCHEM_DIR.joinpath(dataset)),
        "--no-directories",
        "--continue",
        "--accept",
        "*.sdf.gz,*.sdf.gz.md5",
        f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/{DOWNLOAD_PATHS[dataset]}/SDF/",
    ]

    # check=True raises CalledProcessError when wget exits with a non-zero status.
    subprocess.run(download_command, check=True)
26 changes: 26 additions & 0 deletions INCHI-1-TEST/tests/test_library/data/pubchem/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import argparse
from pathlib import Path


# Maps each supported dataset name to the path segment that identifies it
# on the download server.
DOWNLOAD_PATHS = dict(
    compound="Compound/CURRENT-Full",
    compound3d="Compound_3D/01_conf_per_cmpd",
    substance="Substance/CURRENT-Full",
)

# Names of the supported datasets, in the mapping's insertion order.
DATASETS = list(DOWNLOAD_PATHS)

# Absolute path of the directory that contains this module.
PUBCHEM_DIR = Path(__file__).absolute().parent


def get_dataset_arg() -> str:
    """Parse the command line and return the selected dataset name.

    The command line must carry exactly one positional argument, and it has
    to be one of the entries of ``DATASETS``; argparse prints a usage
    message and exits the process for anything else.

    Returns:
        The dataset name given on the command line.
    """
    parser = argparse.ArgumentParser(
        description="Choose a dataset.",
    )
    parser.add_argument(
        "dataset",
        choices=DATASETS,
        type=str,
        # List the names in their declared order. Formatting a set here
        # would make the help text vary between interpreter runs, because
        # the iteration order of a set of strings is not stable.
        help=f"Choose a dataset from {', '.join(DATASETS)}",
    )
    args = parser.parse_args()

    return args.dataset
25 changes: 25 additions & 0 deletions INCHI-1-TEST/tests/test_library/data/pubchem/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import hashlib
from inchi_tests.utils import get_progress
from .utils import get_dataset_arg, PUBCHEM_DIR


if __name__ == "__main__":
    # Script entry point: compare the MD5 digest of every downloaded
    # "*.sdf.gz" archive of the chosen dataset with the digest stored in the
    # "<archive>.md5" file next to it, and print every problem found.
    dataset = get_dataset_arg()
    sdf_paths = sorted(PUBCHEM_DIR.joinpath(dataset).glob("*.sdf.gz"))
    n_sdf = len(sdf_paths)

    for i, sdf_path in enumerate(sdf_paths, start=1):
        with open(sdf_path, "rb") as sdf_file:
            print(
                f"{get_progress(i, n_sdf)}; Validating integrity of {sdf_path.name}."
            )
            local_hash = hashlib.file_digest(sdf_file, "md5").hexdigest()

        # Replacing the final ".gz" suffix yields "<name>.sdf.gz.md5".
        md5_path = sdf_path.with_suffix(".gz.md5")
        try:
            with open(md5_path, "r") as md5_file:
                md5_fields = md5_file.read().split()
        except FileNotFoundError as e:
            # A missing checksum file is reported, then the run continues.
            print(e)
            continue

        if not md5_fields:
            # An empty checksum file is reported like a missing one instead
            # of aborting the whole run with an IndexError.
            print(f"{md5_path.name}: checksum file is empty.")
            continue

        # The digest is the first whitespace-separated field of the file.
        server_hash = md5_fields[0]
        if local_hash != server_hash:
            print(
                f"{sdf_path.name}: local hash {local_hash} does not match server hash {server_hash}."
            )

0 comments on commit 5406b35

Please sign in to comment.