Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mol2any: get feature names from elements #106

Merged
merged 12 commits into from
Nov 21, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class MolToFingerprintPipelineElement(MolToAnyPipelineElement, abc.ABC):
"""Abstract class for PipelineElements which transform molecules to integer vectors."""

_n_bits: int
_feature_names: list[str]
_output_type = "binary"
_return_as: OutputDatatype

Expand Down Expand Up @@ -71,6 +72,11 @@ def n_bits(self) -> int:
"""Get number of bits in (or size of) fingerprint."""
return self._n_bits

@property
def feature_names(self) -> list[str]:
    """Return the names of the fingerprint features.

    Returns
    -------
    list[str]
        A new list holding the stored feature names; modifying it does not
        affect the names kept on the element.
    """
    return list(self._feature_names)

@overload
def assemble_output( # type: ignore
self, value_list: Iterable[npt.NDArray[np.int_]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class MolToDescriptorPipelineElement(MolToAnyPipelineElement):

_standardizer: Optional[AnyTransformer]
_output_type = "float"
_feature_names: list[str]

def __init__(
self,
Expand Down Expand Up @@ -66,6 +67,11 @@ def __init__(
def n_features(self) -> int:
"""Return the number of features."""

@property
def feature_names(self) -> list[str]:
    """Get the names of the descriptor features.

    Returns
    -------
    list[str]
        A fresh list with the stored feature names, so callers cannot alter
        the list kept on the element.
    """
    return list(self._feature_names)

def assemble_output(
self,
value_list: Iterable[npt.NDArray[np.float64]],
Expand Down
30 changes: 29 additions & 1 deletion molpipeline/mol2any/mol2concatinated_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class MolToConcatenatedVector(MolToAnyPipelineElement):
def __init__(
self,
element_list: list[tuple[str, MolToAnyPipelineElement]],
feature_names_prefix: Optional[str] = None,
c-w-feldmann marked this conversation as resolved.
Show resolved Hide resolved
name: str = "MolToConcatenatedVector",
n_jobs: int = 1,
uuid: Optional[str] = None,
Expand All @@ -43,6 +44,8 @@ def __init__(
----------
element_list: list[tuple[str, MolToAnyPipelineElement]]
List of (name, pipeline element) tuples whose outputs are concatenated.
feature_names_prefix: str, optional (default=None)
Prefix for feature names. If None, the name of the pipeline element is used.
name: str, optional (default="MolToConcatenatedVector")
name of pipeline.
n_jobs: int, optional (default=1)
Expand All @@ -53,6 +56,9 @@ def __init__(
Additional keyword arguments. Can be used to set parameters of the pipeline elements.
"""
self._element_list = element_list
if len(element_list) == 0:
raise ValueError("element_list must contain at least one element.")
self._feature_names_prefix = feature_names_prefix
super().__init__(name=name, n_jobs=n_jobs, uuid=uuid)
output_types = set()
for _, element in self._element_list:
Expand Down Expand Up @@ -82,11 +88,33 @@ def n_features(self) -> int:
elif hasattr(element, "n_bits"):
feature_count += element.n_bits
else:
raise AssertionError(
raise ValueError(
f"Element {element} does not have n_features or n_bits."
)
return feature_count

@property
def feature_names(self) -> list[str]:
    """Collect the prefixed feature names of every concatenated element.

    Each name has the form ``<prefix>__<feature>``. The prefix is the
    user-supplied ``feature_names_prefix`` when one was given, otherwise the
    name under which the element was registered in the element list.

    Returns
    -------
    list[str]
        Feature names in the order of the elements in the element list.

    Raises
    ------
    ValueError
        If an element does not expose a ``feature_names`` attribute.
    """
    collected: list[str] = []
    for element_name, element in self._element_list:
        # Guard clause: every element must be able to name its features.
        if not hasattr(element, "feature_names"):
            raise ValueError(
                f"Element {element} does not have feature_names attribute."
            )
        prefix = (
            element_name
            if self._feature_names_prefix is None
            else self._feature_names_prefix
        )
        collected += [f"{prefix}__{feature}" for feature in element.feature_names]
    return collected

def get_params(self, deep: bool = True) -> dict[str, Any]:
"""Return all parameters defining the object.

Expand Down
1 change: 1 addition & 0 deletions molpipeline/mol2any/mol2maccs_key_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class MolToMACCSFP(MolToFingerprintPipelineElement):
"""

_n_bits = 167 # MACCS keys have 166 bits + 1 bit for an all-zero vector (bit 0)
_feature_names = [f"maccs_{i}" for i in range(_n_bits)]

def pretransform_single(
self, value: RDKitMol
Expand Down
1 change: 1 addition & 0 deletions molpipeline/mol2any/mol2morgan_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def __init__(
f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})"
)
self._n_bits = n_bits
self._feature_names = [f"morgan_{i}" for i in range(self._n_bits)]

def get_params(self, deep: bool = True) -> dict[str, Any]:
"""Return all parameters defining the object.
Expand Down
3 changes: 2 additions & 1 deletion molpipeline/mol2any/mol2net_charge.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def __init__(
UUID of the pipeline element, by default None
"""
self._descriptor_list = ["NetCharge"]
self._feature_names = self._descriptor_list
self._charge_method = charge_method
# pylint: disable=R0801
super().__init__(
Expand All @@ -72,7 +73,7 @@ def n_features(self) -> int:

@property
def descriptor_list(self) -> list[str]:
    """Get the descriptor names. Alias of `feature_names`.

    Returns
    -------
    list[str]
        A new list with the stored descriptor names.
    """
    return list(self._descriptor_list)

def _get_net_charge_gasteiger(
Expand Down
5 changes: 3 additions & 2 deletions molpipeline/mol2any/mol2path_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
class Mol2PathFP(
MolToRDKitGenFPElement
): # pylint: disable=too-many-instance-attributes
"""Folded Morgan Fingerprint.
"""Folded Path Fingerprint.

Feature-mapping to vector-positions is arbitrary.

Expand Down Expand Up @@ -99,9 +99,10 @@ def __init__(
)
if not isinstance(n_bits, int) or n_bits < 1:
raise ValueError(
f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})"
f"Number of bits has to be a positive integer, which is > 0! (Received: {n_bits})"
)
self._n_bits = n_bits
self._feature_names = [f"path_{i}" for i in range(self._n_bits)]
self._min_path = min_path
self._max_path = max_path
self._use_hs = use_hs
Expand Down
3 changes: 2 additions & 1 deletion molpipeline/mol2any/mol2rdkit_phys_chem.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def __init__(
UUID of the PipelineElement. If None, a new UUID is generated.
"""
self.descriptor_list = descriptor_list # type: ignore
self._feature_names = self._descriptor_list
self._return_with_errors = return_with_errors
self._log_exceptions = log_exceptions
super().__init__(
Expand All @@ -88,7 +89,7 @@ def n_features(self) -> int:

@property
def descriptor_list(self) -> list[str]:
    """Get the descriptor names. Alias of `feature_names`.

    Returns
    -------
    list[str]
        A new list with the stored descriptor names, so the caller cannot
        modify the list kept on the element.
    """
    return list(self._descriptor_list)

@descriptor_list.setter
Expand Down
74 changes: 74 additions & 0 deletions tests/test_elements/test_mol2any/test_mol2concatenated.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import itertools
import unittest
from typing import Any, Literal, get_args

Expand Down Expand Up @@ -91,6 +92,11 @@ def test_generation(self) -> None:
self.assertTrue(np.allclose(output, output2))
self.assertTrue(np.allclose(output, output3))

def test_empty_element_list(self) -> None:
    """Check that constructing the element without sub-elements raises ValueError."""
    self.assertRaises(ValueError, MolToConcatenatedVector, [])

def test_n_features(self) -> None:
"""Test getting the number of features in the concatenated vector."""

Expand Down Expand Up @@ -131,6 +137,74 @@ def test_n_features(self) -> None:
net_charge_elem[1].n_features + 16 + physchem_elem[1].n_features,
)

def test_features_names(self) -> None:  # pylint: disable=too-many-locals
    """Test getting the names of features in the concatenated vector.

    Every non-empty subset of the elements is concatenated, once with the
    default prefix (the element name) and once with a user-defined prefix.
    For each combination the feature names must match ``n_features`` in length
    and must equal ``<prefix>__<feature>`` for every feature of every element,
    in element order.
    """
    # Imported locally so the path and MACCS elements really are path and MACCS
    # fingerprints (they were Morgan fingerprints under a different label).
    # pylint: disable=import-outside-toplevel
    from molpipeline.mol2any.mol2maccs_key_fingerprint import MolToMACCSFP
    from molpipeline.mol2any.mol2path_fingerprint import Mol2PathFP

    # pylint: enable=import-outside-toplevel

    elements = [
        ("RDKitPhysChem", MolToRDKitPhysChem()),
        ("NetCharge", MolToNetCharge()),
        ("MorganFP", MolToMorganFP(n_bits=16)),
        ("PathFP", Mol2PathFP(n_bits=15)),
        ("MACCSFP", MolToMACCSFP()),
    ]

    # All non-empty subsets; starting at size 1 avoids building and then
    # discarding the empty subset.
    subsets = list(
        itertools.chain.from_iterable(
            itertools.combinations(elements, size)
            for size in range(1, len(elements) + 1)
        )
    )

    for feature_names_prefix in [None, "my_prefix"]:
        for elements_subset in subsets:
            conc_elem = MolToConcatenatedVector(
                list(elements_subset), feature_names_prefix=feature_names_prefix
            )
            feature_names = conc_elem.feature_names

            # test that feature names and n_features are consistent
            self.assertEqual(len(feature_names), conc_elem.n_features)

            # Build the expected names directly instead of splitting the
            # produced names on "__", which would break for any prefix or
            # feature name that itself contains "__".
            expected_names: list[str] = []
            for elem_name, elem in elements_subset:
                self.assertTrue(hasattr(elem, "feature_names"))
                prefix = (
                    elem_name
                    if feature_names_prefix is None
                    else feature_names_prefix
                )
                elem_feature_names = elem.feature_names  # type: ignore[attr-defined]
                expected_names.extend(
                    f"{prefix}__{feature}" for feature in elem_feature_names
                )

            # test prefixes, feature names and their order in one comparison
            self.assertListEqual(feature_names, expected_names)


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ def test_setter_getter_error_handling(self) -> None:
}
self.assertRaises(ValueError, mol_fp.set_params, **params)

def test_feature_names(self) -> None:
    """Check that the MACCS feature names match the bit count and are distinct."""
    fingerprint = MolToMACCSFP()
    names = fingerprint.feature_names
    # one name per bit
    self.assertEqual(len(names), fingerprint.n_bits)
    # no name may occur twice
    self.assertEqual(len(names), len(set(names)))


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,14 @@ def test_bit2atom_mapping(self) -> None:
np_fp = fingerprints_to_numpy(fp)
self.assertEqual(np.nonzero(np_fp)[0].shape[0], len(mapping)) # type: ignore

def test_feature_names(self) -> None:
    """Check that the Morgan feature names match the bit count and are distinct."""
    fingerprint = MolToMorganFP(n_bits=1024)
    names = fingerprint.feature_names
    # one name per bit
    self.assertEqual(len(names), 1024)
    # no name may occur twice
    self.assertEqual(len(names), len(set(names)))


if __name__ == "__main__":
unittest.main()
8 changes: 8 additions & 0 deletions tests/test_elements/test_mol2any/test_mol2path_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,14 @@ def test_setter_getter_error_handling(self) -> None:
}
self.assertRaises(ValueError, mol_fp.set_params, **params)

def test_feature_names(self) -> None:
    """Check that the path feature names match the bit count and are distinct."""
    fingerprint = Mol2PathFP(n_bits=1024)
    names = fingerprint.feature_names
    # one name per bit
    self.assertEqual(len(names), 1024)
    # no name may occur twice
    self.assertEqual(len(names), len(set(names)))


if __name__ == "__main__":
unittest.main()
Loading