Skip to content

Commit

Permalink
68 makescaffoldgeneric set all carbons to wild card symbols to allow …
Browse files Browse the repository at this point in the history
…a substructure search (#69)
  • Loading branch information
c-w-feldmann authored Sep 6, 2024
1 parent fa9363b commit 4f69b9f
Show file tree
Hide file tree
Showing 2 changed files with 209 additions and 1 deletion.
110 changes: 109 additions & 1 deletion molpipeline/mol2mol/scaffolds.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

from __future__ import annotations

from typing import Any, Optional

try:
from typing import Self # pylint: disable=no-name-in-module
except ImportError:
from typing_extensions import Self

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold as RDKIT_MurckoScaffold

from molpipeline.abstract_pipeline_elements.core import (
Expand Down Expand Up @@ -38,6 +46,43 @@ class MakeScaffoldGeneric(_MolToMolPipelineElement):
Done to make scaffolds less specific.
"""

def __init__(
    self,
    generic_atoms: bool = False,
    generic_bonds: bool = False,
    name: str = "MakeScaffoldGeneric",
    n_jobs: int = 1,
    uuid: Optional[str] = None,
) -> None:
    """Create a MakeScaffoldGeneric pipeline element.

    Notes
    -----
    With both flags left at False the scaffold is a plain molecule in
    which every atom is carbon and every bond is a single bond, i.e. it
    can be written as SMILES. Turning on generic atoms and/or generic
    bonds yields a pattern that is written as SMARTS rather than SMILES,
    which is useful for scaffold and substructure searches in data sets.

    Parameters
    ----------
    generic_atoms: bool
        If True, every atom of the scaffold becomes a generic atom (*).
    generic_bonds: bool
        If True, every bond of the scaffold becomes an "any" bond.
    name: str
        Name of the pipeline element.
    n_jobs: int
        Number of jobs used for parallelization.
    uuid: Optional[str]
        UUID of the pipeline element.

    Returns
    -------
    None
    """
    # The two flags are stored before the parent initializer runs,
    # exactly as in the original statement order.
    self.generic_atoms = generic_atoms
    self.generic_bonds = generic_bonds
    super().__init__(name=name, n_jobs=n_jobs, uuid=uuid)

def pretransform_single(self, value: RDKitMol) -> OptionalMol:
    """Make a molecule generic and optionally wildcard its atoms and bonds.

    The molecule is first passed through RDKit's ``MakeScaffoldGeneric``.
    Afterwards, depending on the configuration of this element, atoms
    and/or bonds are additionally replaced by wildcards.

    Parameters
    ----------
    value: RDKitMol
        Molecule to transform.

    Returns
    -------
    OptionalMol
        The generic scaffold. If ``generic_atoms`` is set, every atom has
        atomic number 0 (written as ``*``); if ``generic_bonds`` is set,
        every bond has the unspecified bond type (written as ``~``).
        NOTE(review): the previous docstring stated that InvalidInstance
        is returned when the transformation fails; no such path is visible
        in this method - confirm against the base class.
    """
    # The earlier unconditional ``return MakeScaffoldGeneric(value)`` is
    # gone: it made the wildcard handling below unreachable.
    scaffold = RDKIT_MurckoScaffold.MakeScaffoldGeneric(value)
    if self.generic_atoms:
        # Atomic number 0 is the dummy/wildcard atom.
        for atom in scaffold.GetAtoms():
            atom.SetAtomicNum(0)
    if self.generic_bonds:
        for bond in scaffold.GetBonds():
            bond.SetBondType(Chem.rdchem.BondType.UNSPECIFIED)
    return scaffold

def get_params(self, deep: bool = True) -> dict[str, Any]:
    """Return the parameters of this pipeline element.

    Parameters
    ----------
    deep: bool
        Forwarded to the parent implementation. If True, the two flags of
        this element are returned as fresh ``bool`` values; if False, the
        stored attribute objects are returned unchanged.

    Returns
    -------
    dict[str, Any]
        Parameters of the parent class extended by ``generic_atoms`` and
        ``generic_bonds``.
    """
    # Pass ``deep`` on so the parent honours the caller's choice instead of
    # always falling back to its default.
    params = super().get_params(deep)
    if deep:
        params["generic_atoms"] = bool(self.generic_atoms)
        params["generic_bonds"] = bool(self.generic_bonds)
    else:
        params["generic_atoms"] = self.generic_atoms
        params["generic_bonds"] = self.generic_bonds
    return params

def set_params(self, **parameters: Any) -> Self:
    """Set parameters of this pipeline element.

    ``generic_atoms`` and ``generic_bonds`` are handled here; every other
    keyword is forwarded to the parent implementation. A value of None for
    either flag is treated as "not given" and leaves the flag unchanged.

    Parameters
    ----------
    parameters: Any
        Parameters to set, given as keyword arguments.

    Returns
    -------
    Self
        This pipeline element with the parameters applied.

    Raises
    ------
    ValueError
        If ``generic_atoms`` or ``generic_bonds`` is given and is not a bool.
    """
    remaining = dict(parameters)
    own_updates: dict[str, bool] = {}
    # Validate both flags up front so that a bad value for one of them
    # cannot leave the element half-updated.
    for key in ("generic_atoms", "generic_bonds"):
        flag = remaining.pop(key, None)
        if flag is None:
            continue
        if not isinstance(flag, bool):
            raise ValueError(f"{key} must be a boolean.")
        own_updates[key] = flag
    # Everything not consumed above (e.g. name, n_jobs, uuid) belongs to the
    # parent class; previously these were silently discarded.
    super().set_params(**remaining)
    for key, flag in own_updates.items():
        setattr(self, key, flag)
    return self
100 changes: 100 additions & 0 deletions tests/test_elements/test_mol2mol/test_mol2scaffold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Test the mol2scaffold module."""

from typing import Any
from unittest import TestCase

from molpipeline import Pipeline
from molpipeline.any2mol import AutoToMol
from molpipeline.mol2any import MolToSmiles
from molpipeline.mol2mol.scaffolds import MakeScaffoldGeneric, MurckoScaffold


class TestMurckoScaffold(TestCase):
    """Tests for the MurckoScaffold pipeline element."""

    def test_murcko_scaffold_generation_pipeline(self) -> None:
        """Run SMILES -> Murcko scaffold -> SMILES and compare the output."""
        # Input SMILES mapped to the scaffold SMILES the pipeline must emit.
        expected_by_input = {
            "Cc1ccc(=O)[nH]c1": "O=c1cccc[nH]1",
            "O=CC1CCC(c2ccccc2)CC1": "c1ccc(C2CCCCC2)cc1",
            "CCC": "",
        }
        steps = [
            ("smiles_to_mol", AutoToMol()),
            ("murcko_scaffold", MurckoScaffold()),
            ("scaffold_to_smiles", MolToSmiles()),
        ]
        pipeline = Pipeline(steps=steps)

        observed = pipeline.transform(list(expected_by_input))
        self.assertListEqual(list(expected_by_input.values()), observed)


class TestMakeScaffoldGeneric(TestCase):
    """Tests for the MakeScaffoldGeneric pipeline element."""

    def setUp(self) -> None:
        """Build the shared pipeline and the input SMILES."""
        steps = [
            ("smiles_to_mol", AutoToMol()),
            ("murcko_scaffold", MurckoScaffold()),
            ("make_scaffold_generic", MakeScaffoldGeneric()),
            ("scaffold_to_smiles", MolToSmiles()),
        ]
        self.generic_scaffold_pipeline = Pipeline(steps=steps)
        self.smiles_list = ["Cc1ccc(=O)[nH]c1", "O=CC1CCC(c2ccccc2)CC1", "CCC"]

    def check_generic_scaffold(
        self, params: dict[str, Any], expected_scaffold_list: list[str]
    ) -> None:
        """Apply parameters to the pipeline and compare its output.

        Parameters
        ----------
        params: dict[str, Any]
            Parameters passed to ``set_params`` of the pipeline.
        expected_scaffold_list: list[str]
            Output the pipeline is expected to produce for ``smiles_list``.
        """
        pipeline = self.generic_scaffold_pipeline
        pipeline.set_params(**params)
        observed = pipeline.transform(self.smiles_list)
        self.assertListEqual(expected_scaffold_list, observed)

    def test_generic_scaffold_generation_pipeline(self) -> None:
        """Check the default output and every atom/bond wildcard combination."""
        atoms_key = "make_scaffold_generic__generic_atoms"
        bonds_key = "make_scaffold_generic__generic_bonds"
        # The scenarios run in order on the same pipeline, so parameters set
        # by one scenario persist into the next unless explicitly overridden.
        scenarios: list[tuple[dict[str, Any], list[str]]] = [
            # Defaults: carbon atoms, single bonds.
            ({}, ["CC1CCCCC1", "C1CCC(C2CCCCC2)CC1", ""]),
            # Generic atoms only.
            ({atoms_key: True}, ["**1*****1", "*1***(*2*****2)**1", ""]),
            # Generic bonds only (atoms explicitly reset).
            (
                {atoms_key: False, bonds_key: True},
                ["C~C1~C~C~C~C~C~1", "C1~C~C~C(~C2~C~C~C~C~C~2)~C~C~1", ""],
            ),
            # Generic atoms and generic bonds.
            (
                {atoms_key: True, bonds_key: True},
                ["*~*1~*~*~*~*~*~1", "*1~*~*~*(~*2~*~*~*~*~*~2)~*~*~1", ""],
            ),
        ]
        for params, expected in scenarios:
            self.check_generic_scaffold(
                params=params, expected_scaffold_list=expected
            )

0 comments on commit 4f69b9f

Please sign in to comment.