Skip to content

Commit

Permalink
mol2any: get feature names from elements (#106)
Browse files Browse the repository at this point in the history
mol2any: get feature names from elements

    - Feature names could not be extracted from fingerprint
      pipeline elements.
    - Added common interface to get names for fingerprints
      and descriptors.
  • Loading branch information
JochenSiegWork authored Nov 21, 2024
1 parent 2a3ef6e commit a2ad223
Show file tree
Hide file tree
Showing 13 changed files with 417 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class MolToFingerprintPipelineElement(MolToAnyPipelineElement, abc.ABC):
"""Abstract class for PipelineElements which transform molecules to integer vectors."""

_n_bits: int
_feature_names: list[str]
_output_type = "binary"
_return_as: OutputDatatype

Expand Down Expand Up @@ -71,6 +72,11 @@ def n_bits(self) -> int:
"""Get number of bits in (or size of) fingerprint."""
return self._n_bits

@property
def feature_names(self) -> list[str]:
    """Get the names of the fingerprint features.

    Returns
    -------
    list[str]
        A new list with the entries of ``_feature_names``; changing the
        returned list does not affect the names stored on the element.
    """
    return list(self._feature_names)

@overload
def assemble_output( # type: ignore
self, value_list: Iterable[npt.NDArray[np.int_]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class MolToDescriptorPipelineElement(MolToAnyPipelineElement):

_standardizer: Optional[AnyTransformer]
_output_type = "float"
_feature_names: list[str]

def __init__(
self,
Expand Down Expand Up @@ -66,6 +67,11 @@ def __init__(
def n_features(self) -> int:
"""Return the number of features."""

@property
def feature_names(self) -> list[str]:
    """Return a copy of the descriptor feature names.

    Returns
    -------
    list[str]
        Shallow copy of ``_feature_names``, so the stored list cannot be
        altered through the returned value.
    """
    names_copy = self._feature_names.copy()
    return names_copy

def assemble_output(
self,
value_list: Iterable[npt.NDArray[np.float64]],
Expand Down
164 changes: 147 additions & 17 deletions molpipeline/mol2any/mol2concatinated_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import numpy as np
import numpy.typing as npt
from loguru import logger
from sklearn.base import clone

from molpipeline.abstract_pipeline_elements.core import (
Expand All @@ -32,6 +33,7 @@ class MolToConcatenatedVector(MolToAnyPipelineElement):
def __init__(
self,
element_list: list[tuple[str, MolToAnyPipelineElement]],
use_feature_names_prefix: bool = True,
name: str = "MolToConcatenatedVector",
n_jobs: int = 1,
uuid: Optional[str] = None,
Expand All @@ -43,6 +45,10 @@ def __init__(
----------
element_list: list[MolToAnyPipelineElement]
List of Pipeline Elements of which the output is concatenated.
use_feature_names_prefix: bool, optional (default=True)
If True, will add the pipeline element's name as prefix to feature names.
If False, only the feature names are used. This can lead to duplicate
feature names.
name: str, optional (default="MolToConcatenatedVector")
name of pipeline.
n_jobs: int, optional (default=1)
Expand All @@ -53,17 +59,15 @@ def __init__(
Additional keyword arguments. Can be used to set parameters of the pipeline elements.
"""
self._element_list = element_list
if len(element_list) == 0:
raise ValueError("element_list must contain at least one element.")
self._use_feature_names_prefix = use_feature_names_prefix
super().__init__(name=name, n_jobs=n_jobs, uuid=uuid)
output_types = set()
for _, element in self._element_list:
element.n_jobs = self.n_jobs
output_types.add(element.output_type)
if len(output_types) == 1:
self._output_type = output_types.pop()
else:
self._output_type = "mixed"
self._requires_fitting = any(
element[1]._requires_fitting for element in element_list
# set element execution details
self._set_element_execution_details(self._element_list)
# set feature names
self._feature_names = self._create_feature_names(
self._element_list, self._use_feature_names_prefix
)
self.set_params(**kwargs)

Expand All @@ -82,11 +86,88 @@ def n_features(self) -> int:
elif hasattr(element, "n_bits"):
feature_count += element.n_bits
else:
raise AssertionError(
raise ValueError(
f"Element {element} does not have n_features or n_bits."
)
return feature_count

@property
def feature_names(self) -> list[str]:
    """Return the feature names of the concatenated elements.

    Returns
    -------
    list[str]
        Fresh list containing the entries of ``_feature_names``.
    """
    return [*self._feature_names]

@staticmethod
def _create_feature_names(
    element_list: list[tuple[str, MolToAnyPipelineElement]],
    use_feature_names_prefix: bool,
) -> list[str]:
    """Assemble the feature names of the concatenated vector.

    The names of all elements are collected in the order of ``element_list``.
    A warning is logged if the resulting names are not unique.

    Parameters
    ----------
    element_list: list[tuple[str, MolToAnyPipelineElement]]
        Pairs of element name and pipeline element.
    use_feature_names_prefix: bool
        If True, each feature name is prefixed with ``<element name>__``.
        If False, the elements' feature names are used unchanged, which can
        lead to duplicate feature names.

    Raises
    ------
    ValueError
        If an element does not have a ``feature_names`` attribute.

    Returns
    -------
    list[str]
        Feature names of all elements, in concatenation order.
    """
    collected_names = []
    for element_name, element in element_list:
        if not hasattr(element, "feature_names"):
            raise ValueError(
                f"Element {element} does not have feature_names attribute."
            )

        element_features = element.feature_names  # type: ignore[attr-defined]
        if use_feature_names_prefix:
            # Qualify every feature with the name of the element it belongs to.
            element_features = [
                f"{element_name}__{feature}" for feature in element_features
            ]
        collected_names.extend(element_features)

    n_unique_names = len(set(collected_names))
    if n_unique_names != len(collected_names):
        logger.warning(
            "Feature names in MolToConcatenatedVector are not unique."
            " Set use_feature_names_prefix=True and use unique pipeline element"
            " names to avoid this."
        )
    return collected_names

def _set_element_execution_details(
    self, element_list: list[tuple[str, MolToAnyPipelineElement]]
) -> None:
    """Set output type and requires fitting for the concatenated vector.

    Each element in ``element_list`` is assigned this object's ``n_jobs``.
    ``_output_type`` becomes the elements' common output type, or ``"mixed"``
    if they do not all share exactly one. ``_requires_fitting`` is True if
    any element requires fitting.

    Parameters
    ----------
    element_list: list[tuple[str, MolToAnyPipelineElement]]
        List of pipeline elements.
    """
    # Work on the passed list only, so both derived attributes always
    # describe the same set of elements.
    output_types = set()
    for _, element in element_list:
        element.n_jobs = self.n_jobs
        output_types.add(element.output_type)

    if len(output_types) == 1:
        self._output_type = output_types.pop()
    else:
        self._output_type = "mixed"

    self._requires_fitting = any(
        element._requires_fitting  # pylint: disable=protected-access
        for _, element in element_list
    )

def get_params(self, deep: bool = True) -> dict[str, Any]:
"""Return all parameters defining the object.
Expand All @@ -105,31 +186,47 @@ def get_params(self, deep: bool = True) -> dict[str, Any]:
parameters["element_list"] = [
(str(name), clone(ele)) for name, ele in self.element_list
]
parameters["use_feature_names_prefix"] = bool(
self._use_feature_names_prefix
)
else:
parameters["element_list"] = self.element_list
parameters["use_feature_names_prefix"] = self._use_feature_names_prefix
for name, element in self.element_list:
for key, value in element.get_params(deep=deep).items():
parameters[f"{name}__{key}"] = value

return parameters

def set_params(self, **parameters: Any) -> Self:
"""Set parameters.
def _set_element_list(
self, parameter_copy: dict[str, Any], **parameters: Any
) -> tuple[dict[str, Any], dict[str, Any]]:
"""Set the element list and run necessary configurations.
Parameters
----------
parameter_copy: dict[str, Any]
Copy of parameters.
parameters: Any
Parameters to set.
Original parameters.
Raises
------
ValueError
If element_list is empty.
Returns
-------
Self
Mol2ConcatenatedVector object with updated parameters.
tuple[dict[str, Any], dict[str, Any]]
Updated parameter_copy and parameters.
"""
parameter_copy = dict(parameters)
element_list = parameter_copy.pop("element_list", None)
if element_list is not None:
self._element_list = element_list
if len(element_list) == 0:
raise ValueError("element_list must contain at least one element.")
# reset element execution details
self._set_element_execution_details(self._element_list)
step_params: dict[str, dict[str, Any]] = {}
step_dict = dict(self._element_list)
to_delete_list = []
Expand All @@ -150,6 +247,39 @@ def set_params(self, **parameters: Any) -> Self:
_ = parameter_copy.pop(to_delete, None)
for step, params in step_params.items():
step_dict[step].set_params(**params)
return parameter_copy, parameters

def set_params(self, **parameters: Any) -> Self:
    """Set parameters.

    Handles ``element_list`` and element-level parameters first, then
    ``use_feature_names_prefix``, and passes the remaining parameters to the
    parent class. The feature names are rebuilt whenever any parameter is
    given, so they stay in sync with the current elements and prefix setting.

    Parameters
    ----------
    parameters: Any
        Parameters to set.

    Returns
    -------
    Self
        MolToConcatenatedVector object with updated parameters.
    """
    # Remember this before `parameters` is rebound below.
    any_parameter_given = bool(parameters)
    parameter_copy = dict(parameters)

    # handle element_list
    parameter_copy, parameters = self._set_element_list(
        parameter_copy, **parameters
    )

    # handle use_feature_names_prefix (an explicit None keeps the current value)
    use_feature_names_prefix = parameter_copy.pop("use_feature_names_prefix", None)
    if use_feature_names_prefix is not None:
        self._use_feature_names_prefix = use_feature_names_prefix

    # Rebuild the feature names on any update, not only when the prefix flag
    # changes: a replaced element_list or changed element parameters would
    # otherwise leave outdated names behind.
    if any_parameter_given:
        self._feature_names = self._create_feature_names(
            self._element_list,
            self._use_feature_names_prefix,  # type: ignore[arg-type]
        )

    # set parameters of super
    super().set_params(**parameter_copy)
    return self

Expand Down
1 change: 1 addition & 0 deletions molpipeline/mol2any/mol2maccs_key_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class MolToMACCSFP(MolToFingerprintPipelineElement):
"""

_n_bits = 167 # MACCS keys have 166 bits + 1 bit for an all-zero vector (bit 0)
_feature_names = [f"maccs_{i}" for i in range(_n_bits)]

def pretransform_single(
self, value: RDKitMol
Expand Down
1 change: 1 addition & 0 deletions molpipeline/mol2any/mol2morgan_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def __init__(
f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})"
)
self._n_bits = n_bits
self._feature_names = [f"morgan_{i}" for i in range(self._n_bits)]

def get_params(self, deep: bool = True) -> dict[str, Any]:
"""Return all parameters defining the object.
Expand Down
3 changes: 2 additions & 1 deletion molpipeline/mol2any/mol2net_charge.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def __init__(
UUID of the pipeline element, by default None
"""
self._descriptor_list = ["NetCharge"]
self._feature_names = self._descriptor_list
self._charge_method = charge_method
# pylint: disable=R0801
super().__init__(
Expand All @@ -72,7 +73,7 @@ def n_features(self) -> int:

@property
def descriptor_list(self) -> list[str]:
"""Return a copy of the descriptor list."""
"""Return a copy of the descriptor list. Alias of `feature_names`."""
return self._descriptor_list[:]

def _get_net_charge_gasteiger(
Expand Down
5 changes: 3 additions & 2 deletions molpipeline/mol2any/mol2path_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
class Mol2PathFP(
MolToRDKitGenFPElement
): # pylint: disable=too-many-instance-attributes
"""Folded Morgan Fingerprint.
"""Folded Path Fingerprint.
Feature-mapping to vector-positions is arbitrary.
Expand Down Expand Up @@ -99,9 +99,10 @@ def __init__(
)
if not isinstance(n_bits, int) or n_bits < 1:
raise ValueError(
f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})"
f"Number of bits has to be a positive integer, which is > 0! (Received: {n_bits})"
)
self._n_bits = n_bits
self._feature_names = [f"path_{i}" for i in range(self._n_bits)]
self._min_path = min_path
self._max_path = max_path
self._use_hs = use_hs
Expand Down
3 changes: 2 additions & 1 deletion molpipeline/mol2any/mol2rdkit_phys_chem.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def __init__(
UUID of the PipelineElement. If None, a new UUID is generated.
"""
self.descriptor_list = descriptor_list # type: ignore
self._feature_names = self._descriptor_list
self._return_with_errors = return_with_errors
self._log_exceptions = log_exceptions
super().__init__(
Expand All @@ -88,7 +89,7 @@ def n_features(self) -> int:

@property
def descriptor_list(self) -> list[str]:
"""Return a copy of the descriptor list."""
"""Return a copy of the descriptor list. Alias of `feature_names`."""
return self._descriptor_list[:]

@descriptor_list.setter
Expand Down
Loading

0 comments on commit a2ad223

Please sign in to comment.