Skip to content

Commit

Permalink
Add substructure library getter
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Dec 20, 2021
1 parent a2c405d commit 66094f9
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 1 deletion.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,29 @@ with chembl_downloader.supplier() as suppl:
This example was adapted from Greg Landrum's RDKit blog post
on [generalized substructure search](https://greglandrum.github.io/rdkit-blog/tutorial/substructure/2021/08/03/generalized-substructure-search.html).

### Get an RDKit substructure library

Building on the `supplier()` function, the `get_substructure_library()` function
makes the preparation of a [substructure library](https://www.rdkit.org/docs/cppapi/classRDKit_1_1SubstructLibrary.html)
automated and reproducible. Additionally, it caches the results of the build, so
the build (which takes on the order of tens of minutes) only has to be done once,
and future loading from a pickle object takes on the order of seconds.

The implementation was inspired by Greg Landrum's RDKit blog post,
[Some new features in the SubstructLibrary](https://greglandrum.github.io/rdkit-blog/tutorial/substructure/2021/12/20/substructlibrary-search-order.html).
The following example shows how it can be used to accomplish some of the first
tasks presented in the post:

```python
from rdkit import Chem

import chembl_downloader

library = chembl_downloader.get_substructure_library()
query = Chem.MolFromSmarts('[O,N]=C-c:1:c:c:n:c:c:1')
matches = library.GetMatches(query)
```

### Store in a Different Place

If you want to store the data elsewhere using `pystow` (e.g., in [`pyobo`](https://github.com/pyobo/pyobo)
Expand Down
1 change: 1 addition & 0 deletions src/chembl_downloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@
download_sqlite,
latest,
query,
get_substructure_library,
supplier,
)
59 changes: 59 additions & 0 deletions src/chembl_downloader/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
import gzip
import logging
import os
import pickle
import sqlite3
import tarfile
from contextlib import closing, contextmanager
from pathlib import Path
from typing import Optional, Sequence, TYPE_CHECKING, Tuple

import pystow
from tqdm import tqdm

if TYPE_CHECKING:
import pandas
Expand All @@ -25,6 +27,7 @@
"cursor",
"query",
"supplier",
"get_substructure_library",
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -233,3 +236,59 @@ def supplier(
_, path = download_sdf(version=version, prefix=prefix)
with gzip.open(path) as file:
yield Chem.ForwardSDMolSupplier(file, **kwargs)


def get_substructure_library(
    version: Optional[str] = None,
    prefix: Optional[Sequence[str]] = None,
    max_heavy: int = 75,
    **kwargs,
):
    """Get the ChEMBL substructure library.

    The library is built from the molecules yielded by :func:`supplier` and
    cached as a pickle inside the :mod:`pystow` directory for the requested
    version. Later calls with the same settings load the pickle instead of
    rebuilding. The default settings are cached as ``ssslib.pkl``; any other
    combination of ``max_heavy`` and ``kwargs`` is cached under a file name
    derived from those settings, so a library built with one configuration is
    never returned for another.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`bioversions.get_version` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param max_heavy: The largest number of heavy atoms that are considered before skipping the molecule.
    :param kwargs: keyword arguments to pass through to :class:`rdkit.Chem.ForwardSDMolSupplier`, such as
        ``sanitize`` and ``removeHs`` via :func:`supplier`.
    :returns: A substructure library object
    :rtype: rdkit.Chem.rdSubstructLibrary.SubstructLibrary

    .. seealso::

        https://greglandrum.github.io/rdkit-blog/tutorial/substructure/2021/12/20/substructlibrary-search-order.html
    """
    import hashlib

    # Imported inside the function, so importing this module does not itself import RDKit.
    # Requires minimum version of v2021.09
    from rdkit.Chem.rdSubstructLibrary import (
        CachedTrustedSmilesMolHolder,
        KeyFromPropHolder,
        SubstructLibrary,
        TautomerPatternHolder,
    )

    if version is None:
        version = latest()

    # The cache file must reflect every setting that changes the library's
    # content. The default configuration keeps the historical file name so
    # caches written by earlier versions of this function remain usable.
    if max_heavy == 75 and not kwargs:
        name = "ssslib.pkl"
    else:
        # Keyword names are unique, so sorting the items never compares values.
        settings = repr((max_heavy, sorted(kwargs.items())))
        digest = hashlib.sha256(settings.encode("utf-8")).hexdigest()[:12]
        name = f"ssslib-{digest}.pkl"

    path = pystow.join(*(prefix or PYSTOW_PARTS), version, name=name)
    if path.is_file():
        logger.info("loading substructure library from pickle: %s", path)
        # NOTE(security): unpickling can execute arbitrary code. This file is
        # the cache written by this function below; do not point ``prefix`` at
        # a directory that untrusted parties can write to.
        with path.open("rb") as file:
            return pickle.load(file)

    library = SubstructLibrary(
        CachedTrustedSmilesMolHolder(),
        TautomerPatternHolder(),
        KeyFromPropHolder(),
    )
    with supplier(version=version, prefix=prefix, **kwargs) as suppl:
        for mol in tqdm(
            suppl, unit="molecule", unit_scale=True, desc="Building substructure library"
        ):
            if mol is None:  # skip entries the supplier yielded as None
                continue
            if mol.GetNumHeavyAtoms() > max_heavy:  # skip huge molecules
                continue
            library.AddMol(mol)

    # Write to a temporary sibling file and move it into place afterwards, so
    # an interrupted or failed dump never leaves a truncated pickle at ``path``
    # that later calls would try (and fail) to load.
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        with tmp_path.open("wb") as file:
            pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temporary file no longer exists.
        if tmp_path.exists():
            tmp_path.unlink()
    return library
10 changes: 9 additions & 1 deletion src/chembl_downloader/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import click
from more_click import verbose_option

from .api import download_extract_sqlite, query
from .api import download_extract_sqlite, get_substructure_library, query
from .queries import ACTIVITIES_QUERY, ID_NAME_QUERY

__all__ = [
Expand Down Expand Up @@ -44,5 +44,13 @@ def test(version: Optional[str]):
click.echo(df.to_markdown(index=False))


# Register this function as a command on ``main``; the docstring below is kept
# short because it doubles as the command's description.
@main.command()
@version_option
@verbose_option
def substructure(version: Optional[str]) -> None:
    """Build a substructure library."""
    # The returned library is discarded here; the command only triggers the call.
    get_substructure_library(version=version)


# Invoke the CLI entry point when this module is executed directly.
if __name__ == "__main__":
    main()

0 comments on commit 66094f9

Please sign in to comment.