From 66094f924829e259ba650e0d013da8547ee6359a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 20 Dec 2021 12:48:13 -0500 Subject: [PATCH] Add substructure library getter --- README.md | 23 ++++++++++++ src/chembl_downloader/__init__.py | 1 + src/chembl_downloader/api.py | 59 +++++++++++++++++++++++++++++++ src/chembl_downloader/cli.py | 10 +++++- 4 files changed, 92 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b737f10..1555d70 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,29 @@ with chembl_downloader.supplier() as suppl: This example was adapted from Greg Landrum's RDKit blog post on [generalized substructure search](https://greglandrum.github.io/rdkit-blog/tutorial/substructure/2021/08/03/generalized-substructure-search.html). +### Get an RDKit substructure library + +Building on the `supplier()` function, the `get_substructure_library()` function +makes the preparation of a [substructure library](https://www.rdkit.org/docs/cppapi/classRDKit_1_1SubstructLibrary.html) +automated and reproducible. Additionally, it caches the result so that the build, +which takes on the order of tens of minutes, only has to be done once; future +loading from the pickle file takes on the order of seconds. + +The implementation was inspired by Greg Landrum's RDKit blog post, +[Some new features in the SubstructLibrary](https://greglandrum.github.io/rdkit-blog/tutorial/substructure/2021/12/20/substructlibrary-search-order.html). 
+The following example shows how it can be used to accomplish some of the first
+tasks presented in the post:
+
+```python
+from rdkit import Chem
+
+import chembl_downloader
+
+library = chembl_downloader.get_substructure_library()
+query = Chem.MolFromSmarts('[O,N]=C-c:1:c:c:n:c:c:1')
+matches = library.GetMatches(query)
+```
+
 ### Store in a Different Place
 
 If you want to store the data elsewhere using `pystow` (e.g., in [`pyobo`](https://github.com/pyobo/pyobo)
diff --git a/src/chembl_downloader/__init__.py b/src/chembl_downloader/__init__.py
index 4a96086..18d4d49 100644
--- a/src/chembl_downloader/__init__.py
+++ b/src/chembl_downloader/__init__.py
@@ -10,5 +10,6 @@
     download_sqlite,
     latest,
     query,
+    get_substructure_library,
     supplier,
 )
diff --git a/src/chembl_downloader/api.py b/src/chembl_downloader/api.py
index a2da9cd..fe112aa 100644
--- a/src/chembl_downloader/api.py
+++ b/src/chembl_downloader/api.py
@@ -5,6 +5,7 @@
 import gzip
 import logging
 import os
+import pickle
 import sqlite3
 import tarfile
 from contextlib import closing, contextmanager
@@ -12,6 +13,7 @@
 from typing import Optional, Sequence, TYPE_CHECKING, Tuple
 
 import pystow
+from tqdm import tqdm
 
 if TYPE_CHECKING:
     import pandas
@@ -25,6 +27,7 @@
     "cursor",
     "query",
     "supplier",
+    "get_substructure_library",
 ]
 
 logger = logging.getLogger(__name__)
@@ -233,3 +236,82 @@ def supplier(
     _, path = download_sdf(version=version, prefix=prefix)
     with gzip.open(path) as file:
         yield Chem.ForwardSDMolSupplier(file, **kwargs)
+
+
+def get_substructure_library(
+    version: Optional[str] = None,
+    prefix: Optional[Sequence[str]] = None,
+    max_heavy: int = 75,
+    **kwargs,
+):
+    """Get the ChEMBL substructure library.
+
+    The library is built from the ChEMBL SDF file on the first call and pickled into
+    the :mod:`pystow` directory for that version. Later calls with the same settings
+    load the pickle instead of rebuilding. Each combination of ``max_heavy`` and
+    supplier keyword arguments gets its own cache file.
+
+    :param version: The version number of ChEMBL to get. If none specified, uses
+        :func:`bioversions.get_version` to look up the latest.
+    :param prefix: The directory inside :mod:`pystow` to use
+    :param max_heavy: The largest number of heavy atoms that are considered before skipping the molecule.
+    :param kwargs: keyword arguments to pass through to :class:`rdkit.Chem.ForwardSDMolSupplier`, such as
+        ``sanitize`` and ``removeHs`` via :func:`supplier`.
+    :returns: A substructure library object
+    :rtype: rdkit.Chem.rdSubstructLibrary.SubstructLibrary
+
+    .. warning::
+
+        The cache is read with :func:`pickle.load`, which can execute arbitrary code.
+        Only use a ``prefix`` whose directory contents you trust.
+
+    .. seealso::
+
+        https://greglandrum.github.io/rdkit-blog/tutorial/substructure/2021/12/20/substructlibrary-search-order.html
+    """
+    # Requires minimum version of v2021.09
+    from rdkit.Chem.rdSubstructLibrary import (
+        CachedTrustedSmilesMolHolder,
+        TautomerPatternHolder,
+        KeyFromPropHolder,
+        SubstructLibrary,
+    )
+
+    if version is None:
+        version = latest()
+
+    # Everything that changes the library's contents must be part of the cache name,
+    # otherwise a library built with other settings would be returned. The default
+    # settings keep the plain ``ssslib.pkl`` name so existing caches remain valid.
+    if max_heavy == 75 and not kwargs:
+        name = "ssslib.pkl"
+    else:
+        options = "".join(f"_{key}-{value}" for key, value in sorted(kwargs.items()))
+        name = f"ssslib_max_heavy-{max_heavy}{options}.pkl"
+    path = pystow.join(*(prefix or PYSTOW_PARTS), version, name=name)
+    if path.is_file():
+        logger.info("loading substructure library from pickle: %s", path)
+        with path.open("rb") as file:
+            return pickle.load(file)
+
+    molecule_holder = CachedTrustedSmilesMolHolder()
+    tautomer_pattern_holder = TautomerPatternHolder()
+    key_from_prop_holder = KeyFromPropHolder()
+    library = SubstructLibrary(molecule_holder, tautomer_pattern_holder, key_from_prop_holder)
+    with supplier(version=version, prefix=prefix, **kwargs) as suppl:
+        for mol in tqdm(
+            suppl, unit="molecule", unit_scale=True, desc="Building substructure library"
+        ):
+            if mol is None:
+                continue
+            if mol.GetNumHeavyAtoms() > max_heavy:  # skip huge molecules
+                continue
+            library.AddMol(mol)
+
+    # Write to a temporary file and rename it into place so that an interrupted
+    # dump never leaves a truncated pickle behind for the next call to load.
+    tmp_path = path.with_name(f"{path.name}.tmp")
+    with tmp_path.open("wb") as file:
+        pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL)
+    tmp_path.replace(path)
+    return library
diff --git a/src/chembl_downloader/cli.py b/src/chembl_downloader/cli.py
index 58e5b23..1f1ebd4 100644
--- a/src/chembl_downloader/cli.py
+++ b/src/chembl_downloader/cli.py
@@ -7,7 +7,7 @@
 import click
 from more_click import verbose_option
 
-from .api import download_extract_sqlite, query
+from .api import download_extract_sqlite, get_substructure_library, query
 from .queries import ACTIVITIES_QUERY, ID_NAME_QUERY
 
 __all__ = [
@@ -44,5 +44,13 @@ def test(version: Optional[str]):
     click.echo(df.to_markdown(index=False))
 
 
+@main.command()
+@version_option
+@verbose_option
+def substructure(version: Optional[str]):
+    """Build a substructure library."""
+    get_substructure_library(version=version)  # result is discarded; the call is made to build and cache the library
+
+
 if __name__ == "__main__":
     main()