diff --git a/.lockfiles/py310-dev.lock b/.lockfiles/py310-dev.lock index 60546ccb6..1d59d53b5 100644 --- a/.lockfiles/py310-dev.lock +++ b/.lockfiles/py310-dev.lock @@ -103,6 +103,8 @@ cyclonedx-python-lib==7.5.1 # via pip-audit dask==2024.7.1 # via xyzpy +datasketch==1.6.5 + # via scikit-fingerprints debugpy==1.8.2 # via ipykernel decorator==5.1.1 @@ -116,6 +118,8 @@ deprecated==1.2.14 # opentelemetry-api # opentelemetry-exporter-otlp-proto-grpc # opentelemetry-exporter-otlp-proto-http +descriptastorus==2.6.1 + # via scikit-fingerprints distlib==0.3.8 # via virtualenv docstring-parser-fork==0.0.9 @@ -126,6 +130,8 @@ docutils==0.21.2 # pybtex-docutils # sphinx # sphinxcontrib-bibtex +e3fp==1.2.5 + # via scikit-fingerprints et-xmlfile==1.1.0 # via openpyxl exceptiongroup==1.2.2 @@ -143,6 +149,7 @@ fastjsonschema==2.20.0 filelock==3.15.4 # via # cachecontrol + # huggingface-hub # torch # tox # triton @@ -160,6 +167,7 @@ fqdn==1.5.1 fsspec==2024.6.1 # via # dask + # huggingface-hub # torch funcy==1.17 # via @@ -167,7 +175,7 @@ funcy==1.17 # funcy-stubs funcy-stubs==0.1.1 # via baybe (pyproject.toml) -furo==2024.7.18 +furo==2024.8.6 # via baybe (pyproject.toml) future==1.0.0 # via autograd @@ -199,6 +207,8 @@ httpcore==1.0.5 # via httpx httpx==0.27.0 # via jupyterlab +huggingface-hub==0.25.1 + # via scikit-fingerprints humanfriendly==10.0 # via coloredlogs hypothesis==6.108.4 @@ -228,7 +238,6 @@ ipykernel==6.29.5 # jupyter # jupyter-console # jupyterlab - # qtconsole ipython==8.26.0 # via # ipykernel @@ -256,6 +265,7 @@ jinja2==3.1.4 joblib==1.4.2 # via # baybe (pyproject.toml) + # scikit-fingerprints # scikit-learn # xyzpy json5==0.9.25 @@ -270,7 +280,7 @@ jsonschema==4.23.0 # nbformat jsonschema-specifications==2023.12.1 # via jsonschema -jupyter==1.0.0 +jupyter==1.1.1 # via baybe (pyproject.toml) jupyter-client==8.6.2 # via @@ -278,7 +288,6 @@ jupyter-client==8.6.2 # jupyter-console # jupyter-server # nbclient - # qtconsole jupyter-console==6.6.3 # via jupyter jupyter-core==5.7.2 @@ -291,7 +300,6 @@ jupyter-core==5.7.2 # nbclient # nbconvert # nbformat - # qtconsole jupyter-events==0.10.0 # via jupyter-server jupyter-lsp==2.2.5 @@ -306,7 +314,9 @@ jupyter-server==2.14.2 jupyter-server-terminals==0.5.3 # via jupyter-server jupyterlab==4.2.4 - # via notebook + # via + # jupyter + # notebook jupyterlab-pygments==0.3.0 # via nbconvert jupyterlab-server==2.27.3 @@ -315,7 +325,7 @@ jupyterlab-server==2.27.3 # notebook jupyterlab-widgets==3.0.11 # via ipywidgets -jupytext==1.16.3 +jupytext==1.16.4 # via baybe (pyproject.toml) kiwisolver==1.4.5 # via matplotlib @@ -329,6 +339,8 @@ linear-operator==0.5.2 # via # botorch # gpytorch +llvmlite==0.43.0 + # via numba locket==1.0.0 # via partd markdown-it-py==3.0.0 @@ -363,8 +375,10 @@ mistune==3.0.2 # via nbconvert mkl==2021.4.0 ; platform_system == 'Windows' # via torch +mmh3==5.0.1 + # via e3fp mordredcommunity==2.0.6 - # via baybe (pyproject.toml) + # via scikit-fingerprints mpmath==1.3.0 # via # botorch @@ -378,7 +392,7 @@ mypy==1.11.0 # via baybe (pyproject.toml) mypy-extensions==1.0.0 # via mypy -myst-parser==3.0.1 +myst-parser==4.0.0 # via baybe (pyproject.toml) nbclient==0.10.0 # via nbconvert @@ -408,6 +422,8 @@ notebook-shim==0.2.4 # via # jupyterlab # notebook +numba==0.60.0 + # via scikit-fingerprints numpy==1.26.4 # via # baybe (pyproject.toml) @@ -415,12 +431,16 @@ numpy==1.26.4 # autograd # botorch # contourpy + # datasketch + # descriptastorus + # e3fp # formulaic # h5py # lifelines # matplotlib # mordredcommunity # ngboost + # numba # onnx # onnxconverter-common # onnxruntime @@ -431,6 +451,7 @@ numpy==1.26.4 # pydeck # pyro-ppl # rdkit + # scikit-fingerprints # scikit-learn # scikit-learn-extra # scipy @@ -526,6 +547,7 @@ packaging==24.1 # altair # dask # h5netcdf + # huggingface-hub # ipykernel # jupyter-server # jupyterlab @@ -541,8 +563,6 @@ packaging==24.1 # plotly # pyproject-api # pytest - # qtconsole - # qtpy # setuptools-scm # sphinx # streamlit @@ -556,10 +576,14 @@ pandas==2.2.2 # formulaic # hypothesis # lifelines + # pandas-flavor + # scikit-fingerprints # seaborn # streamlit # xarray # xyzpy +pandas-flavor==0.6.0 + # via descriptastorus pandas-stubs==2.2.2.240603 # via # baybe (pyproject.toml) @@ -651,7 +675,6 @@ pygments==2.18.0 # ipython # jupyter-console # nbconvert - # qtconsole # rich # sphinx pyparsing==3.1.2 @@ -694,6 +717,7 @@ pywinpty==2.0.13 ; os_name == 'nt' pyyaml==6.0.1 # via # dask + # huggingface-hub # jupyter-events # jupytext # myst-parser @@ -705,15 +729,11 @@ pyzmq==26.0.3 # jupyter-client # jupyter-console # jupyter-server - # qtconsole -qtconsole==5.5.2 - # via jupyter -qtpy==2.4.1 - # via qtconsole rdkit==2024.3.3 # via - # baybe (pyproject.toml) + # descriptastorus # mordredcommunity + # scikit-fingerprints referencing==0.35.1 # via # jsonschema @@ -722,6 +742,7 @@ referencing==0.35.1 requests==2.32.3 # via # cachecontrol + # huggingface-hub # jupyterlab-server # opentelemetry-exporter-otlp-proto-http # pip-audit @@ -745,11 +766,14 @@ rpds-py==0.19.0 # referencing ruff==0.5.2 # via baybe (pyproject.toml) +scikit-fingerprints==1.9.0 + # via baybe (pyproject.toml) scikit-learn==1.5.1 # via # baybe (pyproject.toml) # gpytorch # ngboost + # scikit-fingerprints # scikit-learn-extra # skl2onnx scikit-learn-extra==0.3.0 @@ -759,13 +783,19 @@ scipy==1.14.0 # baybe (pyproject.toml) # autograd-gamma # botorch + # datasketch + # descriptastorus + # e3fp # formulaic # gpytorch # lifelines # linear-operator # ngboost + # scikit-fingerprints # scikit-learn # scikit-learn-extra +sdaxen-python-utilities==0.1.5 + # via e3fp seaborn==0.13.2 # via baybe (pyproject.toml) send2trash==1.8.3 @@ -787,6 +817,8 @@ six==1.16.0 # rfc3339-validator skl2onnx==1.17.0 # via baybe (pyproject.toml) +smart-open==7.0.5 + # via e3fp smmap==5.0.1 # via gitdb sniffio==1.3.1 @@ -801,7 +833,7 @@ sortedcontainers==2.4.0 # hypothesis soupsieve==2.5 # via beautifulsoup4 -sphinx==7.4.7 +sphinx==8.1.3 # via # baybe (pyproject.toml) # furo @@ -810,7 +842,7 @@ sphinx==7.4.7 # sphinx-basic-ng # sphinx-copybutton # sphinxcontrib-bibtex -sphinx-autodoc-typehints==2.2.3 +sphinx-autodoc-typehints==2.5.0 # via baybe (pyproject.toml) sphinx-basic-ng==1.0.0b2 # via furo @@ -898,8 +930,10 @@ tox-uv==1.9.1 # via baybe (pyproject.toml) tqdm==4.66.4 # via + # huggingface-hub # ngboost # pyro-ppl + # scikit-fingerprints # xyzpy traitlets==5.14.3 # via @@ -917,7 +951,6 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat - # qtconsole triton==2.3.1 ; python_full_version < '3.12' and platform_machine == 'x86_64' and platform_system == 'Linux' # via torch typeguard==2.13.3 @@ -939,6 +972,7 @@ typing-extensions==4.12.2 # cattrs # formulaic # funcy-stubs + # huggingface-hub # ipython # mypy # opentelemetry-sdk @@ -977,8 +1011,11 @@ wrapt==1.16.0 # via # deprecated # formulaic + # smart-open xarray==2024.6.0 - # via xyzpy + # via + # pandas-flavor + # xyzpy xyzpy==1.2.1 # via baybe (pyproject.toml) zipp==3.19.2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 31137a7bc..5770b467e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Example for a traditional mixture ### Changed +- `SubstanceParameter` encodings are now computed exclusively with the + `scikit-fingerprints` package, granting access to all fingerprints available therein - Example for slot-based mixtures has been revised and grouped together with the new traditional mixture example +- Memory caching is now non-verbose ### Deprecations - Passing a dataframe via the `data` argument to `Objective.transform` is no longer @@ -21,6 +24,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `get_transform_parameters` has been replaced with `get_transform_objects` - Passing a dataframe via the `data` argument to `Target.transform` is no longer possible. The data must now be passed as a series as first positional argument. +- `SubstanceEncoding` value `MORGAN_FP`. As a replacement, `ECFP` with 1024 bits and + radius of 4 can be used. +- `SubstanceEncoding` value `RDKIT`. As a replacement, `RDKIT2DDESCRIPTORS` can be used. ## [0.11.3] - 2024-11-06 ### Fixed diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 05e9796ea..6f67b86aa 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -25,4 +25,6 @@ - Di Jin (Merck Life Science KGaA, Darmstadt, Germany):\ Cardinality constraints - Julian Streibel (Merck Life Science KGaA, Darmstadt, Germany):\ - Bernoulli multi-armed bandit and Thompson sampling \ No newline at end of file + Bernoulli multi-armed bandit and Thompson sampling +- Karin Hrovatin (Merck KGaA, Darmstadt, Germany):\ + `scikit-fingerprints` support diff --git a/README.md b/README.md index 035f69b11..0f2529c7d 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ parameters = [ "Solvent C": "O", "Solvent D": "CS(=O)C", }, - encoding="MORDRED", # chemical encoding via mordred package + encoding="MORDRED", # chemical encoding via scikit-fingerprints ), ] ``` diff --git a/baybe/_optional/chem.py b/baybe/_optional/chem.py index 1d9d661a6..7cd222392 100644 --- a/baybe/_optional/chem.py +++ b/baybe/_optional/chem.py @@ -3,9 +3,11 @@ from baybe.exceptions import OptionalImportError try: - from mordred import Calculator, descriptors - from rdkit import Chem, RDLogger - from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect + from rdkit import Chem + from skfp import fingerprints + from skfp.bases import BaseFingerprintTransformer + from skfp.preprocessing import ConformerGenerator, MolFromSmilesTransformer + except ModuleNotFoundError as ex: raise OptionalImportError( "Chemistry functionality is unavailable because the necessary optional " @@ -15,9 +17,9 @@ ) from ex __all__ = [ - "descriptors", - "Calculator", "Chem", - "GetMorganFingerprintAsBitVect", - "RDLogger", + "fingerprints", + "BaseFingerprintTransformer", + "ConformerGenerator", + "MolFromSmilesTransformer", ] diff --git a/baybe/_optional/info.py b/baybe/_optional/info.py index e725b4799..b35c53dab 100644 --- a/baybe/_optional/info.py +++ b/baybe/_optional/info.py @@ -25,13 +25,12 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404 # Individual packages with exclude_sys_path(os.getcwd()): FLAKE8_INSTALLED = find_spec("flake8") is not None - MORDRED_INSTALLED = find_spec("mordred") is not None ONNX_INSTALLED = find_spec("onnxruntime") is not None POLARS_INSTALLED = find_spec("polars") is not None PRE_COMMIT_INSTALLED = find_spec("pre_commit") is not None PYDOCLINT_INSTALLED = find_spec("pydoclint") is not None - RDKIT_INSTALLED = find_spec("rdkit") is not None RUFF_INSTALLED = find_spec("ruff") is not None + SKFP_INSTALLED = find_spec("skfp") is not None # scikit-fingerprints STREAMLIT_INSTALLED = find_spec("streamlit") is not None XYZPY_INSTALLED = find_spec("xyzpy") is not None @@ -43,8 +42,8 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404 # directly depend on the flag – we thus simply set it to `True`. TYPOS_INSTALLED = True -# Package combinations -CHEM_INSTALLED = MORDRED_INSTALLED and RDKIT_INSTALLED +# Information on whether all required packages for certain functionality are available +CHEM_INSTALLED = SKFP_INSTALLED LINT_INSTALLED = all( ( FLAKE8_INSTALLED, diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 7985a928a..79a4e8df6 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -17,21 +17,126 @@ class CategoricalEncoding(ParameterEncoding): """Integer encoding.""" +class CustomEncoding(ParameterEncoding): + """Available encodings for custom parameters.""" + + CUSTOM = "CUSTOM" + """User-defined encoding.""" + + class SubstanceEncoding(ParameterEncoding): - """Available encodings for substance parameters.""" + """Available encodings for substance parameters from `scikit-fingerprints`_ package. + + .. _scikit-fingerprints: https://scikit-fingerprints.github.io/scikit-fingerprints/ + """ + + ATOMPAIR = "ATOMPAIR" + """:class:`skfp.fingerprints.AtomPairFingerprint`""" + + AUTOCORR = "AUTOCORR" + """:class:`skfp.fingerprints.AutocorrFingerprint`""" + + AVALON = "AVALON" + """:class:`skfp.fingerprints.AvalonFingerprint`""" + + E3FP = "E3FP" + """:class:`skfp.fingerprints.E3FPFingerprint`""" + + ECFP = "ECFP" + """:class:`skfp.fingerprints.ECFPFingerprint`""" + + ELECTROSHAPE = "ELECTROSHAPE" + """:class:`skfp.fingerprints.ElectroShapeFingerprint`""" + + MORGAN_FP = "MORGAN_FP" + """ + Deprecated! Uses :class:`skfp.fingerprints.ECFPFingerprint` with ``fp_size=1024`` + and ``radius=4``. + """ + + ERG = "ERG" + """:class:`skfp.fingerprints.ERGFingerprint`""" + + ESTATE = "ESTATE" + """:class:`skfp.fingerprints.EStateFingerprint`""" + + FUNCTIONALGROUPS = "FUNCTIONALGROUPS" + """:class:`skfp.fingerprints.FunctionalGroupsFingerprint`""" + + GETAWAY = "GETAWAY" + """:class:`skfp.fingerprints.GETAWAYFingerprint`""" + + GHOSECRIPPEN = "GHOSECRIPPEN" + """:class:`skfp.fingerprints.GhoseCrippenFingerprint`""" + + KLEKOTAROTH = "KLEKOTAROTH" + """:class:`skfp.fingerprints.KlekotaRothFingerprint`""" + + LAGGNER = "LAGGNER" + """:class:`skfp.fingerprints.LaggnerFingerprint`""" + + LAYERED = "LAYERED" + """:class:`skfp.fingerprints.LayeredFingerprint`""" + + LINGO = "LINGO" + """:class:`skfp.fingerprints.LingoFingerprint`""" + + MACCS = "MACCS" + """:class:`skfp.fingerprints.MACCSFingerprint`""" + + MAP = "MAP" + """:class:`skfp.fingerprints.MAPFingerprint`""" + + MHFP = "MHFP" + """:class:`skfp.fingerprints.MHFPFingerprint`""" + + MORSE = "MORSE" + """:class:`skfp.fingerprints.MORSEFingerprint`""" + + MQNS = "MQNS" + """:class:`skfp.fingerprints.MQNsFingerprint`""" MORDRED = "MORDRED" - """Encoding based on Mordred chemical descriptors.""" + """:class:`skfp.fingerprints.MordredFingerprint`""" + + PATTERN = "PATTERN" + """:class:`skfp.fingerprints.PatternFingerprint`""" + + PHARMACOPHORE = "PHARMACOPHORE" + """:class:`skfp.fingerprints.PharmacophoreFingerprint`""" + + PHYSIOCHEMICALPROPERTIES = "PHYSIOCHEMICALPROPERTIES" + """:class:`skfp.fingerprints.PhysiochemicalPropertiesFingerprint`""" + + PUBCHEM = "PUBCHEM" + """:class:`skfp.fingerprints.PubChemFingerprint`""" + + RDF = "RDF" + """:class:`skfp.fingerprints.RDFFingerprint`""" RDKIT = "RDKIT" - """Encoding based on RDKit chemical descriptors.""" + """Deprecated! Uses :class:`skfp.fingerprints.RDKit2DDescriptors`.""" - MORGAN_FP = "MORGAN_FP" - """Encoding based on Morgan molecule fingerprints.""" + RDKITFINGERPRINT = "RDKITFINGERPRINT" + """:class:`skfp.fingerprints.RDKitFingerprint`""" + RDKIT2DDESCRIPTORS = "RDKIT2DDESCRIPTORS" + """:class:`skfp.fingerprints.RDKit2DDescriptorsFingerprint`""" -class CustomEncoding(ParameterEncoding): - """Available encodings for custom parameters.""" + SECFP = "SECFP" + """:class:`skfp.fingerprints.SECFPFingerprint`""" - CUSTOM = "CUSTOM" - """User-defined encoding.""" + TOPOLOGICALTORSION = "TOPOLOGICALTORSION" + """:class:`skfp.fingerprints.TopologicalTorsionFingerprint`""" + + USR = "USR" + """:class:`skfp.fingerprints.USRFingerprint`""" + + USRCAT = "USRCAT" + """:class:`skfp.fingerprints.USRCATFingerprint`""" + + VSA = "VSA" + """:class:`skfp.fingerprints.VSAFingerprint`""" + + WHIM = "WHIM" + """:class:`skfp.fingerprints.WHIMFingerprint`""" diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py index 55819775b..1a8539070 100644 --- a/baybe/parameters/substance.py +++ b/baybe/parameters/substance.py @@ -20,7 +20,6 @@ except NameError: from exceptiongroup import ExceptionGroup - Smiles = str """Type alias for SMILES strings.""" @@ -65,6 +64,14 @@ class SubstanceParameter(DiscreteParameter): ) # See base class. + kwargs_fingerprint: dict[str, Any] = field( + factory=dict, validator=instance_of(dict) + ) + """Keyword arguments passed to fingerprint generator.""" + + kwargs_conformer: dict[str, Any] = field(factory=dict, validator=instance_of(dict)) + """Keyword arguments passed to conformer generator.""" + @data.validator def _validate_substance_data( # noqa: DOC101, DOC103 self, _: Any, data: dict[str, Smiles] @@ -118,28 +125,21 @@ def comp_df(self) -> pd.DataFrame: from baybe.utils import chemistry vals = list(self.data.values()) - pref = self.name + "_" + pref = self.name # Get the raw descriptors - if self.encoding is SubstanceEncoding.MORDRED: - comp_df = chemistry.smiles_to_mordred_features(vals, prefix=pref) - elif self.encoding is SubstanceEncoding.RDKIT: - comp_df = chemistry.smiles_to_rdkit_features(vals, prefix=pref) - elif self.encoding is SubstanceEncoding.MORGAN_FP: - comp_df = chemistry.smiles_to_fp_features(vals, prefix=pref) - else: - raise ValueError( - f"Unknown parameter encoding {self.encoding} for parameter {self.name}." - ) + comp_df = chemistry.smiles_to_fingerprint_features( + vals, + encoding=self.encoding, + prefix=pref, + kwargs_conformer=self.kwargs_conformer, + kwargs_fingerprint=self.kwargs_fingerprint, + ) # Drop NaN and constant columns comp_df = comp_df.loc[:, ~comp_df.isna().any(axis=0)] comp_df = df_drop_single_value_columns(comp_df) - # If there are bool columns, convert them to int (possible for Mordred) - bool_cols = comp_df.select_dtypes(bool).columns - comp_df[bool_cols] = comp_df[bool_cols].astype(int) - # Label the rows with the molecule names comp_df.index = pd.Index(self.values) diff --git a/baybe/searchspace/core.py b/baybe/searchspace/core.py index 1711352ab..56bab3243 100644 --- a/baybe/searchspace/core.py +++ b/baybe/searchspace/core.py @@ -16,7 +16,7 @@ validate_constraints, ) from baybe.constraints.base import Constraint -from baybe.parameters import SubstanceEncoding, TaskParameter +from baybe.parameters import TaskParameter from baybe.parameters.base import Parameter from baybe.searchspace.continuous import SubspaceContinuous from baybe.searchspace.discrete import ( @@ -225,20 +225,6 @@ def type(self) -> SearchSpaceType: return SearchSpaceType.HYBRID raise RuntimeError("This line should be impossible to reach.") - @property - def contains_mordred(self) -> bool: - """Indicates if any of the discrete parameters uses ``MORDRED`` encoding.""" - return any( - p.encoding is SubstanceEncoding.MORDRED for p in self.discrete.parameters - ) - - @property - def contains_rdkit(self) -> bool: - """Indicates if any of the discrete parameters uses ``RDKIT`` encoding.""" - return any( - p.encoding is SubstanceEncoding.RDKIT for p in self.discrete.parameters - ) - @property def comp_rep_columns(self) -> tuple[str, ...]: """The columns spanning the computational representation.""" diff --git a/baybe/surrogates/gaussian_process/presets/edbo.py b/baybe/surrogates/gaussian_process/presets/edbo.py index de3863f67..4b1678ab6 100644 --- a/baybe/surrogates/gaussian_process/presets/edbo.py +++ b/baybe/surrogates/gaussian_process/presets/edbo.py @@ -3,6 +3,7 @@ from __future__ import annotations import gc +from collections.abc import Collection from typing import TYPE_CHECKING from attrs import define @@ -11,7 +12,10 @@ from baybe.kernels.basic import MaternKernel from baybe.kernels.composite import ScaleKernel from baybe.parameters import TaskParameter +from baybe.parameters.enum import SubstanceEncoding +from baybe.parameters.substance import SubstanceParameter from baybe.priors.basic import GammaPrior +from baybe.searchspace.discrete import SubspaceDiscrete from baybe.surrogates.gaussian_process.kernel_factory import KernelFactory if TYPE_CHECKING: @@ -21,6 +25,25 @@ from baybe.searchspace.core import SearchSpace +def _contains_encoding( + subspace: SubspaceDiscrete, encodings: Collection[SubstanceEncoding] +) -> bool: + """Tell if any of the substance parameters uses one of the specified encodings.""" + return any( + p.encoding in encodings + for p in subspace.parameters + if isinstance(p, SubstanceParameter) + ) + + +_EDBO_ENCODINGS = ( + SubstanceEncoding.MORDRED, + SubstanceEncoding.RDKIT, + SubstanceEncoding.RDKIT2DDESCRIPTORS, +) +"""Encodings relevant to EDBO logic.""" + + @define class EDBOKernelFactory(KernelFactory): """A factory providing the kernel for Gaussian process surrogates adapted from EDBO. @@ -38,9 +61,9 @@ def __call__( [p for p in searchspace.parameters if isinstance(p, TaskParameter)] ) - mordred = (searchspace.contains_mordred or searchspace.contains_rdkit) and ( - effective_dims >= 50 - ) + switching_condition = _contains_encoding( + searchspace.discrete, _EDBO_ENCODINGS + ) and (effective_dims >= 50) # low D priors if effective_dims < 5: @@ -50,14 +73,14 @@ def __call__( outputscale_initial_value = 8.0 # DFT optimized priors - elif mordred and effective_dims < 100: + elif switching_condition and effective_dims < 100: lengthscale_prior = GammaPrior(2.0, 0.2) lengthscale_initial_value = 5.0 outputscale_prior = GammaPrior(5.0, 0.5) outputscale_initial_value = 8.0 # Mordred optimized priors - elif mordred: + elif switching_condition: lengthscale_prior = GammaPrior(2.0, 0.1) lengthscale_initial_value = 10.0 outputscale_prior = GammaPrior(2.0, 0.1) @@ -97,8 +120,8 @@ def _edbo_noise_factory( [p for p in searchspace.parameters if isinstance(p, TaskParameter)] ) - uses_descriptors = ( - searchspace.contains_mordred or searchspace.contains_rdkit + switching_condition = _contains_encoding( + searchspace.discrete, _EDBO_ENCODINGS ) and (effective_dims >= 50) # low D priors @@ -106,11 +129,11 @@ def _edbo_noise_factory( return (GammaPrior(1.05, 0.5), 0.1) # DFT optimized priors - elif uses_descriptors and effective_dims < 100: + elif switching_condition and effective_dims < 100: return (GammaPrior(1.5, 0.1), 5.0) # Mordred optimized priors - elif uses_descriptors: + elif switching_condition: return (GammaPrior(1.5, 0.1), 5.0) # OHE optimized priors diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 8d55f1358..6e20f0fe2 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -4,6 +4,8 @@ import ssl import tempfile import urllib.request +import warnings +from collections.abc import Sequence from functools import lru_cache from pathlib import Path @@ -12,17 +14,15 @@ from joblib import Memory from baybe._optional.chem import ( - Calculator, + BaseFingerprintTransformer, Chem, - GetMorganFingerprintAsBitVect, - RDLogger, - descriptors, + ConformerGenerator, + MolFromSmilesTransformer, + fingerprints, ) +from baybe.parameters.enum import SubstanceEncoding from baybe.utils.numerical import DTypeFloatNumpy -_mordred_calculator = Calculator(descriptors) - - # Caching _cachedir = os.environ.get( "BAYBE_CACHE_DIR", str(Path(tempfile.gettempdir()) / ".baybe_cache") @@ -33,7 +33,9 @@ def _dummy_wrapper(func): return func -_disk_cache = _dummy_wrapper if _cachedir == "" else Memory(Path(_cachedir)).cache +_disk_cache = ( + _dummy_wrapper if _cachedir == "" else Memory(Path(_cachedir), verbose=0).cache +) def name_to_smiles(name: str) -> str: @@ -72,158 +74,117 @@ def name_to_smiles(name: str) -> str: @lru_cache(maxsize=None) @_disk_cache -def _smiles_to_mordred_features(smiles: str) -> np.ndarray: - """Memory- and disk-cached computation of Mordred descriptors. - - Args: - smiles: SMILES string. - - Returns: - Mordred descriptors for the given smiles string. - """ - try: - return np.asarray( - _mordred_calculator(Chem.MolFromSmiles(smiles)).fill_missing() - ) - except Exception: - return np.full(len(_mordred_calculator.descriptors), np.nan) - - -def smiles_to_mordred_features( - smiles_list: list[str], - prefix: str = "", - dropna: bool = True, -) -> pd.DataFrame: - """Compute Mordred chemical descriptors for a list of SMILES strings. - - Args: - smiles_list: List of SMILES strings. - prefix: Name prefix for each descriptor - (e.g., nBase --> _nBase). - dropna: If ``True``, drops columns that contain NaNs. - - Returns: - Dataframe containing overlapping Mordred descriptors for each SMILES - string. - """ - features = [_smiles_to_mordred_features(smiles) for smiles in smiles_list] - descriptor_names = list(_mordred_calculator.descriptors) - columns = [prefix + "MORDRED_" + str(name) for name in descriptor_names] - dataframe = pd.DataFrame(data=features, columns=columns, dtype=DTypeFloatNumpy) - - if dropna: - dataframe = dataframe.dropna(axis=1) - - return dataframe - - -def smiles_to_molecules(smiles_list: list[str]) -> list[Chem.Mol]: - """Convert a given list of SMILES strings into corresponding Molecule objects. +def _molecule_to_fingerprint_features( + molecule: str | Chem.Mol, + encoder: BaseFingerprintTransformer, +) -> np.ndarray: + """Compute molecular fingerprint for a single molecule. Args: - smiles_list: List of SMILES strings. + molecule: SMILES string or molecule object. + encoder: Instance of the fingerprint class to be used for computation. Returns: - List of corresponding molecules. - - Raises: - ValueError: If the SMILES does not seem to be chemically valid. + Array of fingerprint features. """ - mols = [] - for smiles in smiles_list: - try: - mol = Chem.MolFromSmiles(smiles) - if mol is None: - raise ValueError() - mols.append(mol) - except Exception as ex: - raise ValueError( - f"The SMILES {smiles} does not seem to be chemically valid." - ) from ex - return mols - - -def smiles_to_rdkit_features( - smiles_list: list[str], prefix: str = "", dropna: bool = True -) -> pd.DataFrame: - """Compute RDKit chemical descriptors for a list of SMILES strings. - - Args: - smiles_list: List of SMILES strings. - prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). - dropna: If ``True``, drops columns that contain NaNs. - - Returns: - Dataframe containing overlapping RDKit descriptors for each SMILES string. - """ - mols = smiles_to_molecules(smiles_list) - - res = [] - for mol in mols: - desc = { - prefix + "RDKIT_" + dname: DTypeFloatNumpy(func(mol)) - for dname, func in Chem.Descriptors.descList - } - res.append(desc) + return encoder.transform([molecule]) - df = pd.DataFrame(res) - if dropna: - df = df.dropna(axis=1) - return df - - -def smiles_to_fp_features( - smiles_list: list[str], - prefix: str = "", - dtype: type[int] | type[float] = int, - radius: int = 4, - n_bits: int = 1024, +def smiles_to_fingerprint_features( + smiles: Sequence[str], + encoding: SubstanceEncoding, + prefix: str | None = None, + kwargs_conformer: dict | None = None, + kwargs_fingerprint: dict | None = None, ) -> pd.DataFrame: - """Compute standard Morgan molecule fingerprints for a list of SMILES strings. + """Compute molecular fingerprints for a list of SMILES strings. Args: - smiles_list: List of SMILES strings. + smiles: Sequence of SMILES strings. + encoding: Encoding used to transform SMILES to fingerprints. prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). - dtype: Specifies whether fingerprints will have int or float data type. - radius: Radius for the Morgan fingerprint. - n_bits:Number of bits for the Morgan fingerprint. + kwargs_conformer: kwargs for conformer generator + kwargs_fingerprint: kwargs for fingerprint generator Returns: - Dataframe containing Morgan fingerprints for each SMILES string. + Dataframe containing fingerprints for each SMILES string. """ - mols = smiles_to_molecules(smiles_list) + kwargs_fingerprint = kwargs_fingerprint or {} + kwargs_conformer = kwargs_conformer or {} + + if encoding is SubstanceEncoding.MORGAN_FP: + warnings.warn( + f"Substance encoding '{encoding.name}' is deprecated and will be disabled " + f"in a future version. Use '{SubstanceEncoding.ECFP.name}' " + f"with 'fp_size' 1024 and 'radius' 4 instead.", + DeprecationWarning, + ) + encoding = SubstanceEncoding.ECFP + kwargs_fingerprint.update({"fp_size": 1024, "radius": 4}) + + elif encoding is SubstanceEncoding.RDKIT: + warnings.warn( + f"Substance encoding '{encoding.name}' is deprecated and will be disabled " + f"in a future version. Use '{SubstanceEncoding.RDKIT2DDESCRIPTORS.name}' " + f"instead.", + DeprecationWarning, + ) + encoding = SubstanceEncoding.RDKIT2DDESCRIPTORS - res = [] - for mol in mols: - RDLogger.logger().setLevel(RDLogger.CRITICAL) + fingerprint_cls = get_fingerprint_class(encoding) + fingerprint_encoder = fingerprint_cls(**kwargs_fingerprint) - fingerp = GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits).ToBitString() - fingerp = map(int, fingerp) - fpvec = np.array(list(fingerp)) - res.append( - {prefix + "FP_" + f"{k + 1}": dtype(bit) for k, bit in enumerate(fpvec)} + if fingerprint_encoder.requires_conformers: + mol_list = ConformerGenerator(**kwargs_conformer).transform( + MolFromSmilesTransformer().transform(smiles) ) - - df = pd.DataFrame(res) + else: + mol_list = smiles + + features = np.concatenate( + [ + _molecule_to_fingerprint_features(mol, fingerprint_encoder) + for mol in mol_list + ] + ) + name = f"{encoding.name}_" + prefix = prefix + "_" if prefix else "" + col_names = [ + prefix + name + f.split("fingerprint")[1] + for f in fingerprint_encoder.get_feature_names_out() + ] + df = pd.DataFrame(features, columns=col_names, dtype=DTypeFloatNumpy) return df -def is_valid_smiles(smiles: str) -> bool: - """Test if a SMILES string is valid according to RDKit. +def get_fingerprint_class(encoding: SubstanceEncoding) -> BaseFingerprintTransformer: + """Retrieve the fingerprint class corresponding to a given encoding. Args: - smiles: SMILES string to be tested. + encoding: A substance encoding. + + Raises: + ValueError: If no fingerprint class for the specified encoding is found. Returns: - ``True`` if the provided SMILES is valid, ``False`` else. + The fingerprint class. """ + # Exception case + if encoding is SubstanceEncoding.RDKITFINGERPRINT: + return fingerprints.RDKitFingerprint + try: - mol = Chem.MolFromSmiles(smiles) - return mol is not None - except Exception: - return False + cls_name = next( + name + for name in dir(fingerprints) + if (encoding.name + "Fingerprint").casefold() == name.casefold() + ) + except StopIteration as e: + raise ValueError( + f"No fingerprint class exists for the specified encoding '{encoding.name}'." + ) from e + return getattr(fingerprints, cls_name) def get_canonical_smiles(smiles: str) -> str: diff --git a/docs/conf.py b/docs/conf.py index 79822b084..403322ffe 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -246,6 +246,7 @@ "python": ("https://docs.python.org/3", None), "pandas": ("https://pandas.pydata.org/docs/", None), "polars": ("https://docs.pola.rs/api/python/stable/", None), + "skfp": ("https://scikit-fingerprints.github.io/scikit-fingerprints/", None), "sklearn": ("https://scikit-learn.org/stable/", None), "sklearn_extra": ("https://scikit-learn-extra.readthedocs.io/en/stable", None), "numpy": ("https://numpy.org/doc/stable/", None), diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index 377ac14ca..3074c73a2 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -1,6 +1,15 @@ +[`SearchSpace`]: baybe.searchspace.core.SearchSpace +[`Constraint`]: baybe.constraints.base.Constraint +[`SubstanceParameter`]: baybe.parameters.substance.SubstanceParameter +[`CategoricalParameter`]: baybe.parameters.categorical.CategoricalParameter +[`TaskParameter`]: baybe.parameters.categorical.TaskParameter +[`CustomDiscreteParameter`]: baybe.parameters.custom.CustomDiscreteParameter +[`SubstanceEncoding`]: baybe.parameters.enum.SubstanceEncoding +[scikit-fingerprints]: https://scikit-fingerprints.github.io/scikit-fingerprints/ + # Parameters -Parameters are fundamental for BayBE, as they configure the ``SearchSpace`` and serve +Parameters are fundamental for BayBE, as they configure the [`SearchSpace`] and serve as the direct link to the controllable variables in your experiment. Before starting an iterative campaign, the user is required to specify the exact parameters they can control and want to consider in their optimization. @@ -16,11 +25,11 @@ differently under the hood: Discrete and continuous parameters. ## Continuous Parameters -### ``NumericalContinuousParameter`` +### NumericalContinuousParameter This is currently the only continuous parameter type BayBE supports. It defines possible values from a numerical interval called ``bounds``, and thus has an infinite amount of possibilities. -Unless restrained by `Constraint`s, BayBE will consider any possible parameter value +Unless restrained by [`Constraint`]s, BayBE will consider any possible parameter value that lies within the chosen interval. ```python @@ -47,7 +56,7 @@ number space. For different parameters, different types of encoding make sense. situations are reflected by the different discrete parameter types BayBE offers. ``` -### ``NumericalDiscreteParameter`` +### NumericalDiscreteParameter This is the right type for parameters that have numerical values. We support sets with equidistant values like ``(1, 2, 3, 4, 5)`` but also unevenly spaced sets of numbers like ``(0.2, 1.0, 2.0, 5.0, 10.0, 50.0)``. @@ -66,8 +75,8 @@ NumericalDiscreteParameter( ) ``` -### ``CategoricalParameter`` -A ``CategoricalParameter`` supports sets of strings as labels. +### CategoricalParameter +A [`CategoricalParameter`] supports sets of strings as labels. This is most suitable if the experimental choices cannot easily be translated into a number. Examples for this could be vendors like ``("Vendor A", "Vendor B", "Vendor C")`` or @@ -97,14 +106,14 @@ simply because the number 1 is closer to 2 than to 3. Hence, for an arbitrary set of labels, such an ordering cannot generally be assumed. In the particular case of substances, it not even possible to describe the similarity between labels by ordering along one single dimension. -For this reason, we also provide the ``SubstanceParameter``, which encodes labels +For this reason, we also provide the [`SubstanceParameter`], which encodes labels corresponding to small molecules with chemical descriptors, capturing their similarities much better and without the need for the user to think about ordering and similarity in the first place. -This concept is generalized in the ``CustomDiscreteParameter``, where the user can +This concept is generalized in the [`CustomDiscreteParameter`], where the user can provide their own custom set of descriptors for each label. -### ``SubstanceParameter`` +### SubstanceParameter Instead of ``values``, this parameter accepts ``data`` in form of a dictionary. The items correspond to pairs of labels and [SMILES](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system). SMILES are string-based representations of molecular structures. @@ -127,11 +136,42 @@ SubstanceParameter( ) ``` -The ``encoding`` option defines what kind of descriptors are calculated: -* ``MORDRED``: 2D descriptors from the [Mordred package](https://mordred-descriptor.github.io/documentation/master/). - Since the original package is now unmaintained, baybe requires the community replacement [mordredcommunity](https://github.com/JacksonBurns/mordred-community) -* ``RDKIT``: 2D descriptors from the [RDKit package](https://www.rdkit.org/) -* ``MORGAN_FP``: Morgan fingerprints calculated with RDKit (1024 bits, radius 4) +The ``encoding`` defines what kind of descriptors are calculated using the +[scikit-fingerprints] package. +It can be specified either by passing the corresponding [`SubstanceEncoding`] member +(click to see full list of options) or its string representation, e.g. use +[`SubstanceParameter.MORDRED`](baybe.parameters.enum.SubstanceEncoding.MORDRED) +or its string alias `"MORDRED"` to select the {class}`~skfp.fingerprints.MordredFingerprint`. + +Here are examples of a few popular fingerprints: +* {attr}`~baybe.parameters.enum.SubstanceEncoding.ECFP`: Extended Connectivity FingerPrint, +which is a circular topological fingerprint similar to Morgan fingerprint. +* {attr}`~baybe.parameters.enum.SubstanceEncoding.MORDRED`: Chemical descriptor based fingerprint. +* {attr}`~baybe.parameters.enum.SubstanceEncoding.RDKIT`: The RDKit fingerprint, which is based on hashing of molecular subgraphs. + +You can customize the fingerprint computation by passing arguments of the corresponding +[scikit-fingerprints] class to the `kwargs_fingerprint` argument the [`SubstanceParameter`] constructor. +Similarly, for fingerprints requiring conformers, +the configuration options for conformer computation can be specified via `kwargs_conformer`. + +```python +from baybe.parameters import SubstanceParameter + +SubstanceParameter( + name="Solvent", + data={ + "Water": "O", + "1-Octanol": "CCCCCCCCO", + "Toluene": "CC1=CC=CC=C1", + }, + encoding="ECFP", + kwargs_fingerprint={ + "radius": 4, # Set maximum radius of resulting subgraphs + "fp_size": 1024, # Change the number of computed bits + }, +) + +``` These calculations will typically result in 500 to 1500 numbers per molecule. To avoid detrimental effects on the surrogate model fit, we reduce the number of @@ -142,10 +182,10 @@ This usually reduces the number of descriptors to 10-50, depending on the specif items in ``data``. ```{warning} -The descriptors calculated for a ``SubstanceParameter`` were developed to describe +The descriptors calculated for a [`SubstanceParameter`] were developed to describe small molecules and are not suitable for other substances. If you deal with large molecules like polymers or arbitrary substance mixtures, we recommend to provide your -own descriptors via the ``CustomParameter``. +own descriptors via the [`CustomDiscreteParameter`]. ``` In the following example from an application you can see @@ -169,14 +209,14 @@ The ``SubstanceParameter`` is only available if BayBE was installed with the additional ``chem`` dependency. ``` -### ``CustomDiscreteParameter`` +### CustomDiscreteParameter The ``encoding`` concept introduced above is generalized by the -``CustomParameter``. +[`CustomDiscreteParameter`]. Here, the user is expected to provide their own descriptors for the encoding. Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the -``SubstanceParameter``. +[`SubstanceParameter`]. Still, one could provide experimental measurements or common metrics used to classify polymers: @@ -199,15 +239,15 @@ CustomDiscreteParameter( ) ``` -With the ``CustomParameter``, you can also encode parameter labels that have +With the [`CustomDiscreteParameter`], you can also encode parameter labels that have nothing to do with substances. For example, a parameter corresponding to the choice of a vendor is typically not easily encoded with standard means. In BayBE's framework, you can provide numbers corresponding e.g. to delivery time, reliability or average price of the vendor to encode the labels via the -``CustomParameter``. +[`CustomDiscreteParameter`]. -### ``TaskParameter`` +### TaskParameter Often, several experimental campaigns involve similar or even identical parameters but still have one or more differences. For example, when optimizing reagents in a chemical reaction, the reactants remain @@ -216,7 +256,7 @@ Similarly, in a mixture development for cell culture media, the cell type is fix hence not a parameter. However, once we plan to mix data from several campaigns, both reactants and cell lines can also be considered parameters in that they encode the necessary context. -BayBE is able to process such context information with the `TaskParameter`. +BayBE is able to process such context information with the [`TaskParameter`]. In many cases, this can drastically increase the optimization performance due to the enlarged data corpus. diff --git a/examples/Backtesting/full_lookup.py b/examples/Backtesting/full_lookup.py index 330c066a5..8f0dd6238 100644 --- a/examples/Backtesting/full_lookup.py +++ b/examples/Backtesting/full_lookup.py @@ -96,7 +96,7 @@ # First let us create three campaigns that each use a different chemical encoding to # treat substances. -substance_encodings = ["MORDRED", "RDKIT", "MORGAN_FP"] +substance_encodings = ["MORDRED", "RDKIT2DDESCRIPTORS", "ECFP"] scenarios = { encoding: Campaign( searchspace=SearchSpace.from_product( diff --git a/examples/Backtesting/full_lookup_dark.svg b/examples/Backtesting/full_lookup_dark.svg index bda220f8e..d2bbb8447 100644 --- a/examples/Backtesting/full_lookup_dark.svg +++ b/examples/Backtesting/full_lookup_dark.svg @@ -6,11 +6,11 @@ - 2024-08-02T17:36:40.679469 + 2024-10-09T14:57:50.630496 image/svg+xml - Matplotlib v3.9.1, https://matplotlib.org/ + Matplotlib v3.9.2, https://matplotlib.org/ @@ -41,269 +41,269 @@ z - - - + + - - - + + - - - + + - - - + + - - - + + - - + @@ -343,7 +343,7 @@ z - + @@ -394,7 +394,7 @@ z - + @@ -408,7 +408,7 @@ z - + @@ -448,7 +448,7 @@ z - + @@ -462,7 +462,7 @@ z - + @@ -510,7 +510,7 @@ z - + @@ -524,7 +524,7 @@ z - + @@ -837,17 +837,17 @@ z - - + - + @@ -856,12 +856,12 @@ L -3.5 0 - + - + @@ -870,12 +870,12 @@ L -3.5 0 - + - + @@ -884,12 +884,12 @@ L -3.5 0 - + - + - + - + - + - + - + - + - + - + @@ -1194,29 +1194,29 @@ z - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + @@ -1510,21 +1510,21 @@ L 1555.2 69.12 - - + - - + - + - - + - - + + + @@ -1872,62 +1893,34 @@ z + + + + + + + + + + + + + - - + - - + + - - - - - - - - - - - - + + + + - - + - + - - + - + @@ -2044,7 +2011,7 @@ L 247.029063 223.407813 - + diff --git a/examples/Backtesting/full_lookup_light.svg b/examples/Backtesting/full_lookup_light.svg index 58d251c47..a766d7671 100644 --- a/examples/Backtesting/full_lookup_light.svg +++ b/examples/Backtesting/full_lookup_light.svg @@ -6,11 +6,11 @@ - 2024-08-02T17:36:40.703138 + 2024-10-09T14:57:50.668380 image/svg+xml - Matplotlib v3.9.1, https://matplotlib.org/ + Matplotlib v3.9.2, https://matplotlib.org/ @@ -41,269 +41,269 @@ z - - - + + - - - + + - - - + + - - - + + - - - + + - - + @@ -343,7 +343,7 @@ z - + @@ -394,7 +394,7 @@ z - + @@ -408,7 +408,7 @@ z - + @@ -448,7 +448,7 @@ z - + @@ -462,7 +462,7 @@ z - + @@ -510,7 +510,7 @@ z - + @@ -524,7 +524,7 @@ z - + @@ -837,17 +837,17 @@ z - - + - + @@ -856,12 +856,12 @@ L -3.5 0 - + - + @@ -870,12 +870,12 @@ L -3.5 0 - + - + @@ -884,12 +884,12 @@ L -3.5 0 - + - + - + - + - + - + - + - + - + - + @@ -1194,29 +1194,29 @@ z - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + @@ -1510,21 +1510,21 @@ L 1555.2 69.12 - - + - - + - + - - + - - + + + @@ -1872,62 +1893,34 @@ z + + + + + + + + + + + + + - - + - - + + - - - - - - - - - - - - + + + + - - + - + - - + - + @@ -2044,7 +2011,7 @@ L 247.029063 223.407813 - + diff --git a/mypy.ini b/mypy.ini index 5b33bab7c..2eb5718bf 100644 --- a/mypy.ini +++ b/mypy.ini @@ -26,9 +26,6 @@ ignore_missing_imports = True [mypy-joblib.*] ignore_missing_imports = True -[mypy-mordred] -ignore_missing_imports = True - [mypy-mpl_toolkits.mplot3d] ignore_missing_imports = True @@ -47,6 +44,9 @@ ignore_missing_imports = True [mypy-scipy.stats] ignore_missing_imports = True +[mypy-skfp.*] +ignore_missing_imports = True + [mypy-sklearn.*] ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index 8f416e7dc..69298cc60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,8 +68,7 @@ Issues = "https://github.com/emdgroup/baybe/issues/" [project.optional-dependencies] chem = [ - "rdkit>=2022.3.4", - "mordredcommunity>=1.2.0", + "scikit-fingerprints>=1.7.0", ] onnx = [ @@ -96,13 +95,13 @@ dev = [ docs = [ "baybe[examples]", # docs cannot be built without running examples "furo>=2023.09.10", - "jupyter>=1.0.0", - "jupytext>=1.16.1", - "myst-parser>=2.0.0", - "sphinx>=7.1.1", - "sphinx-autodoc-typehints>=1.24.0", + "jupyter>=1.1.1", + "jupytext>=1.16.4", + "myst-parser>=4.0.0", + "sphinx>=8.0.2", + "sphinx-autodoc-typehints>=2.4.4", "sphinx-copybutton==0.5.2", - "sphinxcontrib-bibtex>=2.6.2 ", + "sphinxcontrib-bibtex>=2.6.2", ] examples = [ diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py index 10b067409..8b56bc1ca 100644 --- a/tests/hypothesis_strategies/parameters.py +++ b/tests/hypothesis_strategies/parameters.py @@ -140,7 +140,13 @@ def substance_parameters(draw: st.DrawFn): name = draw(parameter_names) data = draw(substance_data()) decorrelate = draw(decorrelations) - encoding = draw(st.sampled_from(SubstanceEncoding)) + + # Ignore deprecated encodings + encodings = list(SubstanceEncoding) + encodings.remove(SubstanceEncoding.MORGAN_FP) + encodings.remove(SubstanceEncoding.RDKIT) + encoding = draw(st.sampled_from(encodings)) + return SubstanceParameter( name=name, data=data, decorrelate=decorrelate, encoding=encoding ) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index 0b818f5b6..0a8dd854f 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -1,10 +1,13 @@ """Deprecation tests.""" import warnings +from unittest.mock import patch import pandas as pd import pytest +from pytest import param +from baybe._optional.info import CHEM_INSTALLED from baybe.acquisition.base import AcquisitionFunction from baybe.constraints import ( ContinuousLinearConstraint, @@ -17,6 +20,7 @@ from baybe.objectives.base import Objective from baybe.objectives.desirability import DesirabilityObjective from baybe.objectives.single import SingleTargetObjective +from baybe.parameters.enum import SubstanceEncoding from baybe.parameters.numerical import NumericalContinuousParameter from baybe.recommenders.pure.bayesian import ( BotorchRecommender, @@ -237,3 +241,33 @@ def test_target_transform_interface(): numerical.transform(data=pd.DataFrame(columns=["num"])) with pytest.warns(DeprecationWarning): binary.transform(data=pd.DataFrame(columns=["bin"])) + + +@pytest.mark.parametrize( + ("deprecated", "replacement"), + [ + param(SubstanceEncoding.MORGAN_FP, "ECFPFingerprint", id="morgan"), + param(SubstanceEncoding.RDKIT, "RDKit2DDescriptorsFingerprint", id="rdkit"), + ], +) +@pytest.mark.skipif( + not CHEM_INSTALLED, reason="Optional chem dependency not installed." +) +def test_deprecated_encodings(deprecated, replacement): + """Deprecated encoding raises a warning and uses correct replacement.""" + import skfp.fingerprints + + from baybe.utils.chemistry import smiles_to_fingerprint_features + + path = f"skfp.fingerprints.{replacement}" + + with patch(path, wraps=getattr(skfp.fingerprints, replacement)) as patched: + # Assert warning + with pytest.warns(DeprecationWarning): + smiles_to_fingerprint_features(["C"], deprecated) + + # Check that equivalent is used instead of deprecated encoding + if deprecated is SubstanceEncoding.MORGAN_FP: + patched.assert_called_once_with(**{"fp_size": 1024, "radius": 4}) + else: + patched.assert_called_once() diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py new file mode 100644 index 000000000..96782caa3 --- /dev/null +++ b/tests/test_fingerprints.py @@ -0,0 +1,56 @@ +"""Tests for fingerprint generation.""" + +import pytest + +from baybe._optional.info import CHEM_INSTALLED +from baybe.parameters.enum import SubstanceEncoding + +test_cases: list[tuple[SubstanceEncoding, dict, dict]] = [ + (enc, {}, {}) + for enc in SubstanceEncoding + if enc + not in { # Ignore deprecated encodings + SubstanceEncoding.MORGAN_FP, + SubstanceEncoding.RDKIT, + } +] + +ECFP = SubstanceEncoding.ECFP + + +@pytest.mark.skipif( + not CHEM_INSTALLED, reason="Optional chem dependency not installed." +) +@pytest.mark.parametrize( + ("encoding", "kw_fp", "kw_conf"), + test_cases + + [ # Add some custom tests + (ECFP, {"fp_size": 64}, {}), + (ECFP, {"fp_size": 512}, {}), + (ECFP, {"radius": 4}, {}), + (ECFP, {"fp_size": 512, "radius": 4}, {}), + (ECFP, {}, {"max_gen_attempts": 5000}), + ], +) +def test_fingerprint_kwargs(encoding, kw_fp, kw_conf): + """Test all fingerprint computations.""" + from baybe.utils.chemistry import smiles_to_fingerprint_features + + smiles = ["CC(N(C)C)=O", "CCCC#N"] + x = smiles_to_fingerprint_features( + smiles=smiles, + encoding=encoding, + prefix="", + kwargs_conformer=kw_conf, + kwargs_fingerprint=kw_fp, + ) + + assert x.shape[0] == len(smiles), ( + "The number of fingerprint embedding rows does not match the number of " + "molecules." + ) + if "fp_size" in kw_fp: + assert x.shape[1] == kw_fp["fp_size"], ( + "The fingerprint dimension parameter was ignored, fingerprints have a " + "wrong number of dimensions." + )